In [1]:
import fitz
def pdf_extract(pdf_path):
    pdf_document = fitz.open(pdf_path)
    text = ""
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        text += page.get_text()
    return text
saudi = pdf_extract('saudi/Saudi_7.pdf')+" "+ pdf_extract('saudi/Saudi_8.pdf')+" "+ pdf_extract('saudi/Saudi_9.pdf')
iran = pdf_extract('iran/iran_7.pdf')+" "+ pdf_extract('iran/iran_8.pdf')+" "+pdf_extract('iran/iran_9.pdf')
tun = pdf_extract('Tunisia/Tun_7.pdf')+" "+pdf_extract('Tunisia/Tun_8.pdf')+" "+pdf_extract('Tunisia/Tun_9.pdf')
uzi = pdf_extract('Uzbekistan/Uzi_7.pdf')+" "+pdf_extract('Uzbekistan/Uzi_9.pdf')

In [2]:
textbooks = [saudi, iran, tun, uzi]

# Data Cleaning

In [3]:
import re
from langdetect import detect
from langdetect import LangDetectException
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
def clean_text(post):
    if detect(post) != 'en':
        return ''
    # Make posts lowercase
    post = post.lower()

    # Remove punctuation
    post = re.sub(r'[^a-zA-Z\s]', ' ', post)
    
    # Remove words with repeated letters
    #post = re.sub(r'([a-zA-Z])\1+', r'\1', post)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(post)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    
    return ' '.join(filtered_text)

In [5]:
cleanTextbooks = []
for i in textbooks:
    cleanTextbooks.append(clean_text(i))

In [6]:
#These have spaces, we use these to select the male and female words
menWords = [" man ", " boy ", " male ", " brother ", " father ", " son ", " husband ", " king ", " prince ", " uncle ", " nephew ", " he ", " him ", " his ", " gentleman ", " sir ", " mr. ", " hero ", " lord ", " patriarch ", " men "]
womenWords = [" woman ", " girl ", " female ", " sister ", " mother ", " daughter ", " wife ", " queen ", " princess ", " aunt ", " niece ", " she ", " her ", " hers ", " lady ", " ma'am "," madam ", " mrs. ", " ms. ", " miss ", " heroine ", " dame ", " matriarch ", " women "]
#These have no spaces, so these can be used to throw away words during TFIDF
menWordsNoSpace = ["man", "boy", "male", "brother", "father", "son", "husband", "king", "prince", "uncle", "nephew", "he", "him", "his", "gentleman", "sir", "mr", "hero", "lord", "patriarch", "men"]
womenWordsNoSpace = ["woman", "girl", "female", "sister", "mother", "daughter", "wife", "queen", "princess", "aunt", "niece", "she", "her", "hers", "lady", "ma'am","madam", "mrs", "heroine", "dame", "matriarch", "women"]

In [26]:
menPara = [[] for _ in range(4)]
womenPara = [[] for _ in range(4)]
for i in range(0, 4):
    text = cleanTextbooks[i]
    paragraphs = [text[j:j+100] for j in range(0, len(text), 100)]
    for paragraph in paragraphs:
        for j in menWords:
            if j in paragraph:
                menPara[i].append(paragraph)
        for j in womenWords:
            if j in paragraph:
                womenPara[i].append(paragraph)

# Occurence

In [27]:
men_count = sum(len(sublist) for sublist in menPara)
print(men_count)
women_count = sum(len(sublist) for sublist in womenPara)
print(women_count)

305
246


# Firstness

In [28]:
femaleFirst = []
for sublist in womenPara:  
    for paragraph in sublist:  
        if isinstance(paragraph, str):  
            words = paragraph.split()  
            for i in range(len(words) - 1): 
                if words[i].lower() in womenWordsNoSpace and words[i + 1].lower() in menWordsNoSpace:
                    femaleFirst.append((words[i], words[i + 1]))
print(len(femaleFirst),femaleFirst)

11 [('mother', 'brother'), ('mother', 'brother'), ('sister', 'father'), ('sister', 'father'), ('mother', 'father'), ('girl', 'boy'), ('mother', 'father'), ('queen', 'prince'), ('wife', 'husband'), ('mother', 'father'), ('mother', 'father')]


In [29]:
maleFirst = []
for sublist in menPara:  
    for paragraph in sublist:  
        if isinstance(paragraph, str):  
            words = paragraph.split()  
            for i in range(len(words) - 1):  
                if words[i].lower() in menWordsNoSpace and words[i + 1].lower() in womenWordsNoSpace:
                    maleFirst.append((words[i], words[i + 1]))
print(len(maleFirst),maleFirst)

28 [('father', 'mother'), ('father', 'mother'), ('father', 'mother'), ('brother', 'sister'), ('father', 'mother'), ('brother', 'sister'), ('father', 'mother'), ('brother', 'sister'), ('brother', 'sister'), ('brother', 'sister'), ('brother', 'sister'), ('men', 'women'), ('men', 'women'), ('king', 'queen'), ('male', 'female'), ('boy', 'girl'), ('boy', 'girl'), ('boy', 'girl'), ('father', 'mother'), ('father', 'mother'), ('brother', 'sister'), ('men', 'women'), ('men', 'women'), ('men', 'women'), ('father', 'mother'), ('boy', 'girl'), ('uncle', 'aunt'), ('uncle', 'aunt')]


In [30]:
sameLevel = {
    "siblings": {"brother", "sister"},
    "parent_child": {"father", "mother", "son", "daughter"},
    "spouses": {"husband", "wife"},
    "royalty": {"king", "queen", "prince", "princess"},
    "extended_family": {"uncle", "aunt", "nephew", "niece"},
    "personal_pronouns": {"he", "she", "him", "her", "his", "hers"},
    "titles": {"gentleman", "lady", "sir", "ma'am", "mr", "mrs", "madam"},
    "heroic_roles": {"hero", "heroine"},
    "social_roles": {"patriarch", "matriarch"},
    "general_plural": {"men", "women"},
}
flat = {word for group in sameLevel.values() for word in group}
def isSame(pair):
    return pair[0] in flat and pair[1] in flat
femaleFirst = [pair for pair in femaleFirst if isSame(pair)]

In [31]:
maleFirst = [pair for pair in maleFirst if isSame(pair)]
print(len(maleFirst))
print(len(femaleFirst))

23
10


# Sentiment analysis

In [32]:
fm = ', '.join(map(str, [item for sublist in menPara for item in sublist]))
fw = ', '.join(map(str, [item for sublist in womenPara for item in sublist]))
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
fscores = analyzer.polarity_scores(fw)
mscores = analyzer.polarity_scores(fm)
print(fscores,mscores)

{'neg': 0.08, 'neu': 0.742, 'pos': 0.178, 'compound': 0.9999} {'neg': 0.061, 'neu': 0.774, 'pos': 0.166, 'compound': 1.0}


# TFIDF

In [33]:
from langdetect import detect
from langdetect import LangDetectException
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
#stemming the words
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_nested_list(nested_list):
    return [
        [
            ' '.join(stemmer.stem(word) for word in sentence.split())
            for sentence in sublist
        ]
        for sublist in nested_list
    ]
menParaOriginal = menPara
womenParaOriginal = womenPara
menPara = stem_nested_list(menPara)
womenPara = stem_nested_list(womenPara)
fm = ', '.join(map(str, [item for sublist in menPara for item in sublist]))
fw = ', '.join(map(str, [item for sublist in womenPara for item in sublist]))

In [35]:
tobeTFIDF = [" ".join(" ".join(row) for row in menPara)," ".join(" ".join(row) for row in womenPara)]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tobeTFIDF)
feature_names = vectorizer.get_feature_names_out()
dense_tfidf = tfidf_matrix.toarray()

In [36]:
#tfidf of female words
doc_index = 1
doc_tfidf_scores = dense_tfidf[doc_index]
word_score_pairs = list(zip(feature_names, doc_tfidf_scores))
sorted_word_scores = sorted(word_score_pairs, key=lambda x: x[1], reverse=True)
top_300_words_fs = sorted_word_scores[:300]
top_300_words_f = [word for word, score in sorted_word_scores[:300]]

In [37]:
#tfidf of male words
doc_index = 0  
doc_tfidf_scores = dense_tfidf[doc_index]
word_score_pairs_m = list(zip(feature_names, doc_tfidf_scores))
sorted_word_scores_m = sorted(word_score_pairs_m, key=lambda x: x[1], reverse=True)
top_300_words_ms = sorted_word_scores_m[:300]
top_300_words_m = [word for word, score in sorted_word_scores_m[:300]]

In [38]:
#remove common words and gendered keywords
common = list(set(top_300_words_f) & set(top_300_words_m))
top_300_words_f = [word for word in top_300_words_f if word not in common]
top_300_words_f = [word for word in top_300_words_f if word not in womenWordsNoSpace]
female_300 = " ".join(top_300_words_f)
top_300_words_m = [word for word in top_300_words_m if word not in common]
top_300_words_m = [word for word in top_300_words_m if word not in menWordsNoSpace]
male_300 = " ".join(top_300_words_m)

In [39]:
print(female_300)
print(male_300)

miss translat nanci tongu countri must enjoy hate dear tabl angri bought tea come everi mom rememb stay turn hear ladi adj championship show bad beauti chore dad elizabeth futur ii kind list three wonder afraid cool gave gymnast march share unhappi celebr colleg competit cost decid finish gap high ice much programm sabah sever smoke stand statu street togeth usual uzbekistan weekend worri yesterday along anyon boss claus dynasti flower insid interview milk mo natali nd ning program remot rniga role third unkind window wrist aisha babi buy care cold complaint constitut correct cream england english even food form gener grade holiday housework hurt lenient market minut modul pair palac point present price pronunci pupil rel repair royal small someon speak still teen th tooth un verb adel age arriv bag ball belong breakfast britain british buckingham comfort convers differ dress eat edu elder
sg combo ali watch uncl drive next saw littl adject nemo phone sit doctor run ahm back put albrec

In [40]:
from collections import Counter
import re
mwords = male_300.split()
filteredMwords = [word for word in mwords if word in fm]
wordMcounts = Counter(filteredMwords)
topMwords = wordMcounts.most_common(40)

In [41]:
fwords = female_300.split()
filteredFwords = [word for word in fwords if word in fw]
wordFcounts = Counter(filteredFwords)
topFwords = wordFcounts.most_common(40)

In [42]:
from collections import Counter
import re

# Example word lists (assuming male_300 and female_300 are defined as strings)
mwords = male_300.split()  # Male words
fwords = female_300.split()  # Female words

# Filtered words for male
filteredMwords = [word for word in mwords if word in fm]
wordMcounts = Counter(filteredMwords)
topMwords = wordMcounts.most_common(40)
print("Top 40 Male Words:", topMwords)

# Filtered words for female
filteredFwords = [word for word in fwords if word in fw]
wordFcounts = Counter(filteredFwords)
topFwords = wordFcounts.most_common(40)
print("Top 40 Female Words:", topFwords)


Top 40 Male Words: [('sg', 1), ('combo', 1), ('ali', 1), ('watch', 1), ('uncl', 1), ('drive', 1), ('next', 1), ('saw', 1), ('littl', 1), ('adject', 1), ('nemo', 1), ('phone', 1), ('sit', 1), ('doctor', 1), ('run', 1), ('ahm', 1), ('back', 1), ('put', 1), ('albrecht', 1), ('believ', 1), ('cell', 1), ('friendli', 1), ('hors', 1), ('joe', 1), ('tourist', 1), ('charl', 1), ('nation', 1), ('store', 1), ('student', 1), ('taxi', 1), ('albert', 1), ('histori', 1), ('side', 1), ('test', 1), ('becom', 1), ('game', 1), ('happi', 1), ('never', 1), ('offic', 1), ('park', 1)]
Top 40 Female Words: [('miss', 1), ('translat', 1), ('nanci', 1), ('tongu', 1), ('countri', 1), ('must', 1), ('enjoy', 1), ('hate', 1), ('dear', 1), ('tabl', 1), ('angri', 1), ('bought', 1), ('tea', 1), ('come', 1), ('everi', 1), ('mom', 1), ('rememb', 1), ('stay', 1), ('turn', 1), ('hear', 1), ('ladi', 1), ('adj', 1), ('championship', 1), ('show', 1), ('bad', 1), ('beauti', 1), ('chore', 1), ('dad', 1), ('elizabeth', 1), ('fut

# random forest

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
male_segments = fm.split(" ")  
female_segments = fw.split(" ")  
texts = male_segments + female_segments  
labels = [0] * len(male_segments) + [1] * len(female_segments)  
vectorizer = TfidfVectorizer()  
X = vectorizer.fit_transform(texts)  
feature_names = vectorizer.get_feature_names_out()

clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X, labels)


proba = clf.predict_proba(X)[:, 1]  # Probability of being female
female_contributions = np.dot(X.T.toarray(), proba)
female_importance = female_contributions / np.sum(X.toarray(), axis=0)
female_importance_df = pd.DataFrame({
    "Word": feature_names,
    "FemaleImportance": female_importance
}).sort_values(by="FemaleImportance", ascending=False)

print("Top words most related to Female:")
print(female_importance_df.head(10))

Top words most related to Female:
          Word  FemaleImportance
1018      ladi               1.0
1180      miss               1.0
1901     tongu               1.0
573      enjoy               1.0
95       angri               1.0
828       hate               1.0
1921  translat               1.0
1220     nanci               1.0
1834       tea               1.0
244     bought               1.0


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
male_segments = fm.split(" ")  
female_segments = fw.split(" ")  
texts = male_segments + female_segments  
labels = [0] * len(male_segments) + [1] * len(female_segments)  
vectorizer = TfidfVectorizer()  
X = vectorizer.fit_transform(texts)  
feature_names = vectorizer.get_feature_names_out()

clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X, labels)

proba = clf.predict_proba(X)[:, 0]  # Probability of being male (class 0)


male_contributions = np.dot(X.T.toarray(), proba)
male_importance = male_contributions / np.sum(X.toarray(), axis=0)
male_importance_df = pd.DataFrame({
    "Word": feature_names,
    "MaleImportance": male_importance
}).sort_values(by="MaleImportance", ascending=False)

print("Top words most related to Male:")
print(male_importance_df.head(10))

Top words most related to Male:
          Word  MaleImportance
1915       toy             1.0
1674      side             1.0
1242      nemo             1.0
1331      page             1.0
26      adject             1.0
61         ali             1.0
310       cell             1.0
965        joe             1.0
869       hors             1.0
59    albrecht             1.0


In [45]:
fm = ', '.join(map(str, [item for sublist in menParaOriginal for item in sublist]))
fw = ', '.join(map(str, [item for sublist in womenParaOriginal for item in sublist]))
ps = PorterStemmer()
words = word_tokenize(fw)
matching = [word for word in words if ps.stem(word) in {"cass", "aborigin","sungen"}]
matching

[]