In [2]:
import fitz
def pdf_extract(pdf_path):
    pdf_document = fitz.open(pdf_path)
    text = ""
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text
china = pdf_extract('china/China_7.pdf') + " " + pdf_extract('china/china_8.pdf') + " " + pdf_extract('china/china_9.pdf')
with open('vietnam/Viet_7.txt','r')as file:
    viet_7 = file.read()
with open('vietnam/Viet_8.txt', 'r', encoding='utf-8', errors='ignore') as file:
    viet_8 = file.read()
viet = viet_7 +" "+ viet_8
japn = pdf_extract('japan/japan_7.pdf')+" "+pdf_extract('japan/japan_8.pdf')+" "+pdf_extract('japan/japan_9.pdf')

In [3]:
textbooks = [china, viet, japn]

# Data Cleaning

In [4]:
import re
from langdetect import detect
from langdetect import LangDetectException
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
def clean_text(post):
    if detect(post) != 'en':
        return ''
    # Make posts lowercase
    post = post.lower()

    # Remove punctuation
    post = re.sub(r'[^a-zA-Z\s]', ' ', post)
    
    # Remove words with repeated letters
    #post = re.sub(r'([a-zA-Z])\1+', r'\1', post)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(post)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    
    return ' '.join(filtered_text)

In [8]:
cleanTextbooks = []
for i in textbooks:
    cleanTextbooks.append(clean_text(i))

In [9]:
#These have spaces, we use these to select the male and female words
menWords = [" man ", " boy ", " male ", " brother ", " father ", " son ", " husband ", " king ", " prince ", " uncle ", " nephew ", " he ", " him ", " his ", " gentleman ", " sir ", " mr. ", " hero ", " lord ", " patriarch ", " men "]
womenWords = [" woman ", " girl ", " female ", " sister ", " mother ", " daughter ", " wife ", " queen ", " princess ", " aunt ", " niece ", " she ", " her ", " hers ", " lady ", " ma'am "," madam ", " mrs. ", " ms. ", " miss ", " heroine ", " dame ", " matriarch ", " women "]
#These have no spaces, so these can be used to throw away words during TFIDF
menWordsNoSpace = ["man", "boy", "male", "brother", "father", "son", "husband", "king", "prince", "uncle", "nephew", "he", "him", "his", "gentleman", "sir", "mr", "hero", "lord", "patriarch", "men"]
womenWordsNoSpace = ["woman", "girl", "female", "sister", "mother", "daughter", "wife", "queen", "princess", "aunt", "niece", "she", "her", "hers", "lady", "ma'am","madam", "mrs", "heroine", "dame", "matriarch", "women"]

In [12]:
menPara = [[] for _ in range(3)]
womenPara = [[] for _ in range(3)]
for i in range(0, 3):
    text = cleanTextbooks[i]
    paragraphs = [text[j:j+100] for j in range(0, len(text), 100)]
    for paragraph in paragraphs:
        for j in menWords:
            if j in paragraph:
                menPara[i].append(paragraph)
        for j in womenWords:
            if j in paragraph:
                womenPara[i].append(paragraph)

# Occurence

In [13]:
men_count = sum(len(sublist) for sublist in menPara)
print(men_count)
women_count = sum(len(sublist) for sublist in womenPara)
print(women_count)

352
307


# Firstness

In [14]:
femaleFirst = []
for sublist in womenPara:  
    for paragraph in sublist:  
        if isinstance(paragraph, str):  
            words = paragraph.split()  
            for i in range(len(words) - 1): 
                if words[i].lower() in womenWordsNoSpace and words[i + 1].lower() in menWordsNoSpace:
                    femaleFirst.append((words[i], words[i + 1]))
print(len(femaleFirst),femaleFirst)

7 [('girl', 'father'), ('mother', 'boy'), ('mother', 'son'), ('niece', 'nephew'), ('niece', 'nephew'), ('woman', 'men'), ('girl', 'man')]


In [15]:
maleFirst = []
for sublist in menPara:  
    for paragraph in sublist:  
        if isinstance(paragraph, str):  
            words = paragraph.split()  
            for i in range(len(words) - 1):  
                if words[i].lower() in menWordsNoSpace and words[i + 1].lower() in womenWordsNoSpace:
                    maleFirst.append((words[i], words[i + 1]))
print(len(maleFirst),maleFirst)

11 [('king', 'queen'), ('man', 'mother'), ('man', 'mother'), ('prince', 'princess'), ('uncle', 'aunt'), ('uncle', 'aunt'), ('uncle', 'wife'), ('father', 'mother'), ('father', 'mother'), ('brother', 'sister'), ('brother', 'sister')]


In [16]:
sameLevel = {
    "siblings": {"brother", "sister"},
    "parent_child": {"father", "mother", "son", "daughter"},
    "spouses": {"husband", "wife"},
    "royalty": {"king", "queen", "prince", "princess"},
    "extended_family": {"uncle", "aunt", "nephew", "niece"},
    "personal_pronouns": {"he", "she", "him", "her", "his", "hers"},
    "titles": {"gentleman", "lady", "sir", "ma'am", "mr", "mrs", "madam"},
    "heroic_roles": {"hero", "heroine"},
    "social_roles": {"patriarch", "matriarch"},
    "general_plural": {"men", "women"},
}
flat = {word for group in sameLevel.values() for word in group}
def isSame(pair):
    return pair[0] in flat and pair[1] in flat
femaleFirst = [pair for pair in femaleFirst if isSame(pair)]

In [17]:
maleFirst = [pair for pair in maleFirst if isSame(pair)]
print(len(maleFirst))
print(len(femaleFirst))

9
3


# Sentiment analysis

In [18]:
fm = ', '.join(map(str, [item for sublist in menPara for item in sublist]))
fw = ', '.join(map(str, [item for sublist in womenPara for item in sublist]))
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
fscores = analyzer.polarity_scores(fw)
mscores = analyzer.polarity_scores(fm)
print(fscores,mscores)

{'neg': 0.061, 'neu': 0.707, 'pos': 0.232, 'compound': 1.0} {'neg': 0.049, 'neu': 0.703, 'pos': 0.248, 'compound': 1.0}


# TFIDF

In [19]:
from langdetect import detect
from langdetect import LangDetectException
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
#stemming the words
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_nested_list(nested_list):
    return [
        [
            ' '.join(stemmer.stem(word) for word in sentence.split())
            for sentence in sublist
        ]
        for sublist in nested_list
    ]
menParaOriginal = menPara
womenParaOriginal = womenPara
menPara = stem_nested_list(menPara)
womenPara = stem_nested_list(womenPara)
fm = ', '.join(map(str, [item for sublist in menPara for item in sublist]))
fw = ', '.join(map(str, [item for sublist in womenPara for item in sublist]))

In [21]:
tobeTFIDF = [" ".join(" ".join(row) for row in menPara)," ".join(" ".join(row) for row in womenPara)]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tobeTFIDF)
feature_names = vectorizer.get_feature_names_out()
dense_tfidf = tfidf_matrix.toarray()

In [22]:
#tfidf of female words
doc_index = 1
doc_tfidf_scores = dense_tfidf[doc_index]
word_score_pairs = list(zip(feature_names, doc_tfidf_scores))
sorted_word_scores = sorted(word_score_pairs, key=lambda x: x[1], reverse=True)
top_300_words_fs = sorted_word_scores[:300]
top_300_words_f = [word for word, score in sorted_word_scores[:300]]

In [23]:
#tfidf of male words
doc_index = 0  
doc_tfidf_scores = dense_tfidf[doc_index]
word_score_pairs_m = list(zip(feature_names, doc_tfidf_scores))
sorted_word_scores_m = sorted(word_score_pairs_m, key=lambda x: x[1], reverse=True)
top_300_words_ms = sorted_word_scores_m[:300]
top_300_words_m = [word for word, score in sorted_word_scores_m[:300]]

In [24]:
#remove common words and gendered keywords
common = list(set(top_300_words_f) & set(top_300_words_m))
top_300_words_f = [word for word in top_300_words_f if word not in common]
top_300_words_f = [word for word in top_300_words_f if word not in womenWordsNoSpace]
female_300 = " ".join(top_300_words_f)
top_300_words_m = [word for word in top_300_words_m if word not in common]
top_300_words_m = [word for word in top_300_words_m if word not in menWordsNoSpace]
male_300 = " ".join(top_300_words_m)

In [25]:
print(female_300)
print(male_300)

susan stone miss natur wizard wick chariti tri food oop tv yen ball lullabi present saw usual beggar gold hobbi touch began cook inform music femal lost pen salesperson teresa alway garden group le right stay tast cruel dinner drink er ever fi five flower host hour hug mommi move nt pay practic pretend sleep spend step togeth watch without china danger demand har meat palac sale someon tour wilma arm bread dad draw ed elfi everyth feel festiv hear join lunch match ny ol pictur punish shop tabl tonight underlin va wi anger breakfast clock cold cross everywher fo found full ground hariti ir leg longer met pop power put quickli singer state su vhat warm weaker anoth anyth bridg care catch check dialog dog express ffi fine form held homework
guitar uncl film li question follow meet walk australia card corn mind american bought duffi ff grow speech tradit ami exampl great languag past stop tl yesterday favor kungfu nh ofth plant scientist window becom car die direct even grandmoth jennif ji

In [26]:
from collections import Counter
import re
mwords = male_300.split()
filteredMwords = [word for word in mwords if word in fm]
wordMcounts = Counter(filteredMwords)
topMwords = wordMcounts.most_common(40)

In [27]:
fwords = female_300.split()
filteredFwords = [word for word in fwords if word in fw]
wordFcounts = Counter(filteredFwords)
topFwords = wordFcounts.most_common(40)

In [28]:
from collections import Counter
import re

# Example word lists (assuming male_300 and female_300 are defined as strings)
mwords = male_300.split()  # Male words
fwords = female_300.split()  # Female words

# Filtered words for male
filteredMwords = [word for word in mwords if word in fm]
wordMcounts = Counter(filteredMwords)
topMwords = wordMcounts.most_common(40)
print("Top 40 Male Words:", topMwords)

# Filtered words for female
filteredFwords = [word for word in fwords if word in fw]
wordFcounts = Counter(filteredFwords)
topFwords = wordFcounts.most_common(40)
print("Top 40 Female Words:", topFwords)


Top 40 Male Words: [('guitar', 1), ('uncl', 1), ('film', 1), ('li', 1), ('question', 1), ('follow', 1), ('meet', 1), ('walk', 1), ('australia', 1), ('card', 1), ('corn', 1), ('mind', 1), ('american', 1), ('bought', 1), ('duffi', 1), ('ff', 1), ('grow', 1), ('speech', 1), ('tradit', 1), ('ami', 1), ('exampl', 1), ('great', 1), ('languag', 1), ('past', 1), ('stop', 1), ('tl', 1), ('yesterday', 1), ('favor', 1), ('kungfu', 1), ('nh', 1), ('ofth', 1), ('plant', 1), ('scientist', 1), ('window', 1), ('becom', 1), ('car', 1), ('die', 1), ('direct', 1), ('even', 1), ('grandmoth', 1)]
Top 40 Female Words: [('susan', 1), ('stone', 1), ('miss', 1), ('natur', 1), ('wizard', 1), ('wick', 1), ('chariti', 1), ('tri', 1), ('food', 1), ('oop', 1), ('tv', 1), ('yen', 1), ('ball', 1), ('lullabi', 1), ('present', 1), ('saw', 1), ('usual', 1), ('beggar', 1), ('gold', 1), ('hobbi', 1), ('touch', 1), ('began', 1), ('cook', 1), ('inform', 1), ('music', 1), ('femal', 1), ('lost', 1), ('pen', 1), ('salesperson'

# random forest

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
male_segments = fm.split(" ")  
female_segments = fw.split(" ")  
texts = male_segments + female_segments  
labels = [0] * len(male_segments) + [1] * len(female_segments)  
vectorizer = TfidfVectorizer()  
X = vectorizer.fit_transform(texts)  
feature_names = vectorizer.get_feature_names_out()

clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X, labels)


proba = clf.predict_proba(X)[:, 1]  # Probability of being female
female_contributions = np.dot(X.T.toarray(), proba)
female_importance = female_contributions / np.sum(X.toarray(), axis=0)
female_importance_df = pd.DataFrame({
    "Word": feature_names,
    "FemaleImportance": female_importance
}).sort_values(by="FemaleImportance", ascending=False)

print("Top words most related to Female:")
print(female_importance_df.head(10))

Top words most related to Female:
             Word  FemaleImportance
2702       wizard               1.0
2683         wick               1.0
180        beggar               1.0
1899          pop               1.0
2536           tv               1.0
2756          yen               1.0
1773          oop               1.0
803            fo               1.0
358         china               1.0
2104  salesperson               1.0


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
male_segments = fm.split(" ")  
female_segments = fw.split(" ")  
texts = male_segments + female_segments  
labels = [0] * len(male_segments) + [1] * len(female_segments)  
vectorizer = TfidfVectorizer()  
X = vectorizer.fit_transform(texts)  
feature_names = vectorizer.get_feature_names_out()

clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X, labels)

proba = clf.predict_proba(X)[:, 0]  # Probability of being male (class 0)


male_contributions = np.dot(X.T.toarray(), proba)
male_importance = male_contributions / np.sum(X.toarray(), axis=0)
male_importance_df = pd.DataFrame({
    "Word": feature_names,
    "MaleImportance": male_importance
}).sort_values(by="MaleImportance", ascending=False)

print("Top words most related to Male:")
print(male_importance_df.head(10))

Top words most related to Male:
             Word  MaleImportance
898          film             1.0
1163         hero             1.0
1796         mind             1.0
2895  traditional             1.0
160     australia             1.0
949     following             1.0
353          card             1.0
517          corn             1.0
3178      writing             1.0
184           ban             1.0


In [30]:
fm = ', '.join(map(str, [item for sublist in menParaOriginal for item in sublist]))
fw = ', '.join(map(str, [item for sublist in womenParaOriginal for item in sublist]))
ps = PorterStemmer()
words = word_tokenize(fw)
matching = [word for word in words if ps.stem(word) in {"cass", "aborigin","sungen"}]
matching

[]