In [2]:
# import packages

import os
import spacy
import re
from bs4 import BeautifulSoup as bs
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np


In [3]:
# loading spacy model
nlp = spacy.load("en_core_web_sm")

# defining a function to extract only nouns and adjectives
def preprocess_text(text):
    doc = nlp(text)
    processed_tokens = [token.lemma_ for token in doc if token.pos_ in ['NOUN', 'ADJ']]
    return " ".join(processed_tokens)  # returning processed texts

# apply function ^ to filepath
filepath = "/Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei"
metadata_list = []
text_list = []
raw_text_list = []

for entry in os.scandir(filepath):
    if entry.name.startswith("."):  # identified an issue with a hidden file so this prevents issues w/ utf8 encoding stopping the process
        continue
    print(f"Processing: {entry.path}")  # for tracking during the process
    try:
        with open(entry.path, encoding="utf-8") as file:
            xml_content = file.read()
        # parsing file w/ beautiful soup to extract metadata
        soup = bs(xml_content, "xml")
        author = soup.author.text.strip() if soup.author else "Unknown"
        title = soup.title.text.strip() if soup.title else "Untitled"
        pub_date = soup.date.text.strip() if soup.date else "Unknown"
        in_out = soup.affiliation.text.strip() if soup.affiliation else "Unknown"
        # storing the metadata
        metadata = {
            "author": author,
            "title": title,
            "pub_date": pub_date,
            "insider/outsider": in_out,
        }
        metadata_list.append(metadata)

        # preprocessing and extracting the text
        text = soup.body.text if soup.body else ""
        raw_text_list.append(text)
        processed_text = preprocess_text(text)  
        text_list.append(processed_text)
    except UnicodeDecodeError:  # again, found an error that was preventing the processing of texts with a hidden file - this is to keep the process moving
        print(f"UnicodeDecodeError in file: {entry.path}")
        continue  
print("Processing complete!")


Processing: /Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei/marianas_mosiac.xml
Processing: /Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei/guam_two_invasions_and_three_military_occupations.xml
Processing: /Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei/legacy_of_a_political_union.xml
Processing: /Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei/eng_chamoru_legends.xml
Processing: /Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei/destinys_landfall.xml
Processing: /Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei/history_of_the_chamorro_people.xml
Processing complete!


In [None]:
# METHOD 1 : TF-IDF

from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer()
C = vectorizer.fit_transform(text_list)
feature_names = vectorizer.get_feature_names_out()

# converting countvectorizer to TD-IDF
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(C)

# convert dataframe
titles = [metadata['title'] for metadata in metadata_list]
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names, index=titles)

# display top words per document
import numpy as np
top_n = 5  # adjust number of top words to show
for title in titles:
    print(f"Top words for '{title}':")
    top_words = df_tfidf.loc[title].nlargest(top_n)
    print(top_words, "\n")


Top words for 'A Marianas Mosaic: Signs and Shifts in Contemporary Island Life':
generation    0.259778
other         0.216253
people        0.212301
culture       0.203745
island        0.180117
Name: A Marianas Mosaic: Signs and Shifts in Contemporary Island Life, dtype: float64 

Top words for 'GUAM: TWO INVASIONS AND THREE MILITARY OCCUPATIONS':
japanese     0.430121
naval        0.224926
gun          0.202889
island       0.197352
guamanian    0.186958
Name: GUAM: TWO INVASIONS AND THREE MILITARY OCCUPATIONS, dtype: float64 

Top words for 'Legacy of a Political Union: A Founding Father's Memoir':
negotiation    0.353061
people         0.331279
political      0.221373
member         0.191123
citizen        0.158975
Name: Legacy of a Political Union: A Founding Father's Memoir, dtype: float64 

Top words for 'Chamoru Legends: A Gathering of Stories':
tree       0.292939
brother    0.204505
child      0.190687
man        0.165218
time       0.155640
Name: Chamoru Legends: A Gatherin

**NOTES ON VECTORIZING:**
- Vectorizing is the process of converting raw text data into numerical representations (vectors) that machine learning algorithms can understand and process. Since algorithms work with numbers, not words, this step is essential for tasks like text classification, clustering, or analysis.
- CountVectorizer (Bag-Of-Words) counts how many times each word appears in a document
- TF-IDF reflects term importance (see below)


**NOTES ON TF-IDF:**
- Converting from CountVectorizer to TF-IDF involves transforming raw word counts into a weighted representation that reflects term importance across documents where
- CountVectorizer: outputs a term-frequency matrix (e.g. "The cat sat on the mat" > {'the':2, 'cat':1, 'sat':1, ...})
- TF-IDF adjusts these two counts using Term Frequency (how often a word appears) and Inverse Document Frequency (penalizes terms that may appear in many documents)
- Resulting in a boost in rare and meaninful terms and a downweight of common terms 

In [15]:
# METHOD 1 (continued) : Co-occurrences

corpus = raw_text_list
target_noun = "guam"
co_occurrences = []

for doc in corpus:
    spacy_doc = nlp(doc)
    for token in spacy_doc:
        if token.text.lower() == target_noun:
            for child in token.children:
                if child.pos_ == "ADJ":
                    print(f"Adjective near '{target_noun}': {token.head.text}")
                    co_occurrences.append(child.text)
            if token.head.pos_ == "ADJ":
                print(f"Adjective governing '{target_noun}': {token.head.text}")
                co_occurrences.append(token.head.text)

print("Co-occurrences:", co_occurrences)

Adjective governing 'guam': past
Adjective near 'guam': in
Adjective near 'guam': saw
Adjective near 'guam': in
Adjective near 'guam': neutralize
Adjective near 'guam': be
Adjective near 'guam': in
Adjective near 'guam': confine
Adjective near 'guam': in
Adjective near 'guam': in
Adjective near 'guam': in
Adjective near 'guam': from
Adjective near 'guam': people
Adjective near 'guam': scouted
Adjective near 'guam': in
Adjective near 'guam': in
Adjective near 'guam': settlements
Co-occurrences: ['past', 'modern', 'postwar', 'modern', 'strong', 'fortified', 'vulnerable', 'defenseless', 'roadless', 'southern', 'central', 'northern', 'Northern', 'southern', 'northern', 'northern', 'central']


In [None]:
# METHOD 1 (continued) : Sentiment Analysis
from textblob import TextBlob

# create dataframe
df_corpus = pd.DataFrame(metadata_list)  # create dataframe

df_corpus["sentiment"] = [TextBlob(text).sentiment.polarity for text in text_list]
print(df_corpus[["title", "sentiment"]])  # title for context

                                               title  sentiment
0  A Marianas Mosaic: Signs and Shifts in Contemp...   0.087177
1  GUAM: TWO INVASIONS AND THREE MILITARY OCCUPAT...   0.026302
2  Legacy of a Political Union: A Founding Father...   0.112685
3            Chamoru Legends: A Gathering of Stories   0.097129
4              Destiny’s Landfall: A History of Guam   0.038031
5  The Hale'-Ta Series: HeStorian Taotao Tano': H...   0.100835


Notes on Sentiment Analysis:
- Sentiment creates a spectrum of sentiment based on negative (-), neutral (0), and positive (+) words throughout the text. In this case, most of the texts are neutral to netural-positive which could tell us something based on the nature / purpose of the texts being written. 

In [None]:
print(f"Extracted {len(texts)} documents")
for i, text in enumerate(texts[:3]):  # print first 3 documents
    print(f"Document {i}: {text[:500]}...\n")


Extracted 0 documents


In [None]:
import os
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

filepath = "/Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei"

texts = []
labels = []

# iterate over all files
for file in os.scandir(filepath):
    if file.name.startswith('.') or not file.is_file():  # skip hidden
        continue

    label = "tei_text"  

    try:
        with open(file.path, encoding="utf8") as input_file:
            xml_content = input_file.read()
        soup = bs(xml_content, "xml")
        text = soup.body.get_text() if soup.body else soup.get_text()  

        if text.strip():  # make sure not empty
            texts.append(text)
            labels.append(label)
        else:
            print(f"Skipping empty document: {file.path}")
    except UnicodeDecodeError:
        print(f'Skipping non-text file: {file.path}')

# debug: print collected texts
print("Number of documents:", len(texts))
for i, text in enumerate(texts[:3]):  # first 3 texts
    print(f"\nDocument {i + 1}:\n{text[:500]}")  # first 500 characters

# vectorize
vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words="english")  
vectorized_texts = vectorizer.fit_transform(texts)

# train classifier
classifier = MultinomialNB()
classifier.fit(vectorized_texts, labels)

Number of documents: 6

Document 1:


Introduction
project supported in great measure by a generous grant from the Northern Marianas
                    Humanities Council, along with matching funds from the Northern Marianas
                    College (NMC). The book is a collection of essays, articles, and narratives that ex-
                    plores some of the many topics relevant to contemporary life in the Marianas.
As we embarked on this project, we had two primary goals. One was to meaningfully
                    contr

Document 2:


INTRODUCTION START POINT
The purpose of the War in the Pacific National Historical Park is “to commemorate the bravery and sacrifice of those participating in the campaigns of the Pacific theater of World War 11... 4 Army, Navy and support personnel of Japan and Korea fought against Army, Navy and support personnel of allied Western nations ~- United States, Great Britain, Netherlands, China, New Zealand and Australia. Scenes of the fighting, o

In [None]:
# import os

# for entry in os.scandir(filepath):
    # print(entry.name, "->", "Folder" if entry.is_dir() else "File")


.DS_Store -> File
marianas_mosiac.xml -> File
guam_two_invasions_and_three_military_occupations.xml -> File
legacy_of_a_political_union.xml -> File
eng_chamoru_legends.xml -> File
destinys_landfall.xml -> File
history_of_the_chamorro_people.xml -> File


In [None]:
# print("Number of texts:", len(texts))
# print("Number of labels:", len(labels))
# print("Labels:", labels)  # Should contain meaningful values


Number of texts: 6
Number of labels: 0
Labels: []


In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup as bs
from joblib import dump
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import gc  # Import garbage collector

# Reset Model and Vectorizer
# if 'lda' in locals():
    # del lda  # Delete LDA model
# if 'vectorizer' in locals():
    # del vectorizer  # Delete vectorizer

gc.collect()  # force garbage collection

print("Previous model and vectorizer deleted. Resetting...")

# load tei files
folder_path = "/Users/ricky/digital_texts/corpus/files/0_tei_files/finalized_tei"

texts = []
filenames = []

for file in os.scandir(folder_path):
    if file.name.startswith('.') or not file.is_file():
        continue  # Skip hidden files or directories
    
    try:
        with open(file.path, encoding="utf8") as f:
            xml_content = f.read()
        
        # Parse XML and extract <body> text
        soup = bs(xml_content, "xml")
        text = soup.body.get_text() if soup.body else soup.get_text()
        
        if text.strip():  # Only keep non-empty text
            texts.append(text)
            filenames.append(file.name)
        else:
            print(f"Skipping empty document: {file.path}")
    
    except UnicodeDecodeError:
        print(f"Skipping unreadable file: {file.path}")

# convert to dataframe
df = pd.DataFrame({"filename": filenames, "text": texts})
print("Done extracting text from files.")


# vectorize
print("Vectorizing texts...", end=" ", flush=True)
vectorizer = CountVectorizer(min_df=0.01, max_df=0.6, stop_words="english")
vectorized_data = vectorizer.fit_transform(df.text)
print("Done.")

# train lda model
print("Building LDA model using training set...", end=" ", flush=True)
n_topics = 4 # adjust as needed
lda = LatentDirichletAllocation(n_components=n_topics, learning_decay=0.8, random_state=1)
doc_topic_distrib = lda.fit_transform(vectorized_data)
print("Done.")

# display topics
print("Top words per topic:")
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

# assign topics to documents
df["topic"] = doc_topic_distrib.argmax(axis=1)

# save model & vector
dump(lda, "lda_model.joblib")
dump(vectorizer, "vectorizer.joblib")

print("\n Model and vectorizer saved.")


Previous model and vectorizer deleted. Resetting...
Done extracting text from files.
Vectorizing texts... Done.
Building LDA model using training set... Done.
Top words per topic:
Topic 1: guamanians, japan, naval, navy, army, pp, guns, lvt, landing, invasion
Topic 2: ancient, vitores, spaniards, maga, missionaries, society, lahi, padre, ancestors, think
Topic 3: ko, hilitai, elena, nåna, sirena, carabao, cow, skin, fruit, maybe
Topic 4: chamoru, generation, cnmi, halo, filipino, chamorus, healers, refaluwasch, 2017, art

 Model and vectorizer saved.


In [None]:
from joblib import dump

dump(vectorizer, "tf_vectorizer.joblib")
dump(nmf, "nmf_model.joblib")

**NOTES ON TOPIC MODELING:**<br><br>
When to INCREASE n_topics (e.g., from 6 to 8 or 10):
- If topics seem too broad or contain mixed themes in one topic.
- If words from different subjects are appearing together in a single topic.
- If you suspect there are more distinct themes in your documents.<br><br>
When to DECREASE n_topics (e.g., from 6 to 4 or 5):
- If topics seem too fragmented, with very specific themes that might not be useful.
- If some topics repeat similar themes with slight variations.
- If you get many topics that don’t seem meaningful.

In [15]:
# COSINE SIMILARITY

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(X)
df_similarity = pd.DataFrame(cosine_sim, index=titles, columns=titles)

# Show similarity between first two texts
print(df_similarity.iloc[:2, :2])



                                                    A Marianas Mosaic: Signs and Shifts in Contemporary Island Life  \
A Marianas Mosaic: Signs and Shifts in Contempo...                                           1.000000                 
GUAM: TWO INVASIONS AND THREE MILITARY OCCUPATIONS                                           0.417471                 

                                                    GUAM: TWO INVASIONS AND THREE MILITARY OCCUPATIONS  
A Marianas Mosaic: Signs and Shifts in Contempo...                                           0.417471   
GUAM: TWO INVASIONS AND THREE MILITARY OCCUPATIONS                                           1.000000   
