# Data extraction

In [None]:
import fitz

def pdf_extract(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding each page's ``get_text()`` output joined without
        a separator. A document with no pages yields an empty string.
    """
    # The context manager closes the document even when extraction raises,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as pdf_document:
        # join() avoids repeatedly re-copying the growing string.
        return "".join(page.get_text() for page in pdf_document)

In [None]:
#Read all the PDF files


In [None]:
# Container for the textbook texts, one entry per PDF.
# NOTE(review): nothing in this cell fills the list; presumably each entry is
# the string returned by pdf_extract for one file -- confirm and populate.
textbooks = list()

# Data Cleaning

In [None]:
import re

from langdetect import detect
from langdetect import LangDetectException
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
def clean_text_lower(post):
    """Normalise an English text into lowercase, stop-word-free tokens.

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is replaced by a space, the result is tokenised, NLTK's
    English stop words are dropped, and the remaining tokens are joined with
    single spaces.

    Args:
        post: The raw text to clean.

    Returns:
        The cleaned text, or ``''`` when the text is not detected as English
        or when its language cannot be detected at all.
    """
    try:
        if detect(post) != 'en':
            return ''
    except LangDetectException:
        # langdetect raises when the text has nothing to classify (for
        # example an empty page); treat that like any other non-English
        # input instead of aborting the whole run.
        return ''

    # Lowercase, then blank out punctuation, digits and other non-letters.
    post = re.sub(r'[^a-zA-Z\s]', ' ', post.lower())

    # NOTE(review): NLTK's English stop-word list contains pronouns such as
    # "he", "she", "him", "her", "his" and "hers", so they are removed here
    # and cannot be matched by later keyword searches -- confirm intended.
    stop_words = set(stopwords.words('english'))
    filtered_text = [
        word for word in word_tokenize(post) if word not in stop_words
    ]

    return ' '.join(filtered_text)

In [None]:
# Clean every extracted textbook. The cleaning function defined in this
# notebook is clean_text_lower; the previous call to clean_text referred to
# a name that is never defined and raised NameError.
cleanTextbooks = [clean_text_lower(book) for book in textbooks]

In [None]:
# Gendered keywords padded with one space on each side, used for substring
# matching against the cleaned text so that only whole words are hit.
# The cleaning step replaces every non-letter with a space, so titles are
# written without the trailing dot (" mr ", not " mr. "); the dotted forms
# could never occur in cleaned text.
# NOTE(review): " ma'am " still contains an apostrophe and therefore cannot
# match cleaned text either; it is kept so the list content is not reduced.
menWords = [" man ", " boy ", " male ", " brother ", " father ", " son ", " husband ", " king ", " prince ", " uncle ", " nephew ", " he ", " him ", " his ", " gentleman ", " sir ", " mr ", " hero ", " lord ", " patriarch ", " men "]
womenWords = [" woman ", " girl ", " female ", " sister ", " mother ", " daughter ", " wife ", " queen ", " princess ", " aunt ", " niece ", " she ", " her ", " hers ", " lady ", " ma'am ", " madam ", " mrs ", " ms ", " miss ", " heroine ", " dame ", " matriarch ", " women "]
# The same keywords without padding, used for token-level comparisons and to
# discard gendered words during TF-IDF. They are derived from the padded
# lists so the two variants cannot drift apart (the hand-written female list
# was missing "ms" and "miss").
menWordsNoSpace = [word.strip() for word in menWords]
womenWordsNoSpace = [word.strip() for word in womenWords]

In [None]:
# For every cleaned textbook, collect the 100-character chunks ("paragraphs")
# that contain a male keyword (menPara) or a female keyword (womenPara).
# Both results hold one inner list per textbook, in textbook order.
# The lists are sized from cleanTextbooks rather than a fixed 18, so the cell
# no longer raises IndexError when a different number of books is loaded.
menPara = [[] for _ in cleanTextbooks]
womenPara = [[] for _ in cleanTextbooks]
for book_index, text in enumerate(cleanTextbooks):
    # Fixed-width character slices; a word can be cut at a chunk boundary.
    paragraphs = [text[start:start + 100] for start in range(0, len(text), 100)]
    for paragraph in paragraphs:
        # A chunk is appended once per matching keyword, so a chunk that
        # contains several different keywords appears several times.
        for keyword in menWords:
            if keyword in paragraph:
                menPara[book_index].append(paragraph)
        for keyword in womenWords:
            if keyword in paragraph:
                womenPara[book_index].append(paragraph)

# Occurrence

In [None]:
# Total number of collected entries across all textbooks, per gender.
# Each entry is one chunk recorded for one matching keyword.
men_count = sum(map(len, menPara))
print(men_count)
women_count = sum(map(len, womenPara))
print(women_count)

# Firstness

In [None]:
# Collect every adjacent word pair in which a female keyword is immediately
# followed by a male keyword.
femaleFirst = []
# womenPara holds one list of chunks per textbook, so the chunks have to be
# reached through a nested loop; calling split() on the per-textbook list
# itself raised AttributeError.
for book_paragraphs in womenPara:
    for paragraph in book_paragraphs:
        words = paragraph.split()
        # Pair each word with its successor.
        for first, second in zip(words, words[1:]):
            if first.lower() in womenWordsNoSpace and second.lower() in menWordsNoSpace:
                femaleFirst.append(first + ' ' + second)
print(len(femaleFirst),femaleFirst)

In [None]:
# Collect every adjacent word pair in which a male keyword is immediately
# followed by a female keyword.
maleFirst = []
# menPara holds one list of chunks per textbook, so the chunks have to be
# reached through a nested loop; calling split() on the per-textbook list
# itself raised AttributeError.
for book_paragraphs in menPara:
    for paragraph in book_paragraphs:
        words = paragraph.split()
        # Pair each word with its successor.
        for first, second in zip(words, words[1:]):
            if first.lower() in menWordsNoSpace and second.lower() in womenWordsNoSpace:
                maleFirst.append(first + ' ' + second)
print(len(maleFirst),maleFirst)

# TFIDF

In [None]:
from langdetect import detect
from langdetect import LangDetectException
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the NLTK resources requested by this notebook, in the original order.
for resource in (
    'stopwords',
    'punkt',
    'vader_lexicon',
    'punkt_tab',
    'averaged_perceptron_tagger',
):
    nltk.download(resource)

In [None]:
# Build a two-document corpus: all male-keyword chunks flattened into one
# string (index 0) and all female-keyword chunks into another (index 1).
men_corpus = " ".join(" ".join(book) for book in menPara)
women_corpus = " ".join(" ".join(book) for book in womenPara)
tobeTFIDF = [men_corpus, women_corpus]

# Fit TF-IDF on the two documents and keep a dense (row per document) copy
# of the scores together with the vocabulary terms.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tobeTFIDF)
feature_names = vectorizer.get_feature_names_out()
dense_tfidf = tfidf_matrix.toarray()

In [None]:
# Rank the vocabulary by TF-IDF score in the female-keyword document
# (row 1 of the dense matrix) and keep the 300 highest-scoring terms.
doc_index = 1
doc_tfidf_scores = dense_tfidf[doc_index]
word_score_pairs = list(zip(feature_names, doc_tfidf_scores))
sorted_word_scores = sorted(
    word_score_pairs,
    key=lambda pair: pair[1],
    reverse=True,
)
# (term, score) pairs, then the terms alone.
top_300_words_fs = sorted_word_scores[:300]
top_300_words_f = [term for term, _ in top_300_words_fs]

In [None]:
# Rank the vocabulary by TF-IDF score in the male-keyword document
# (row 0 of the dense matrix) and keep the 300 highest-scoring terms.
doc_index = 0
doc_tfidf_scores = dense_tfidf[doc_index]
word_score_pairs_m = list(zip(feature_names, doc_tfidf_scores))
sorted_word_scores_m = sorted(
    word_score_pairs_m,
    key=lambda pair: pair[1],
    reverse=True,
)
# (term, score) pairs, then the terms alone.
top_300_words_ms = sorted_word_scores_m[:300]
top_300_words_m = [term for term, _ in top_300_words_ms]

In [None]:
# Remove the terms shared by both top-300 lists and each list's own gendered
# keywords, then join what is left into one space-separated string per gender.
# The lists are read under their real names (top_300_words_*); the former
# top_100_words_* names were never defined. Both conditions are applied in a
# single pass so that the keyword filter no longer discards the result of the
# common-term filter.
common = list(set(top_300_words_f) & set(top_300_words_m))
shared_terms = set(common)  # set membership for the two filters below
top_300_words_f = [
    word for word in top_300_words_f
    if word not in shared_terms and word not in womenWordsNoSpace
]
female_300 = " ".join(top_300_words_f)
top_300_words_m = [
    word for word in top_300_words_m
    if word not in shared_terms and word not in menWordsNoSpace
]
male_300 = " ".join(top_300_words_m)

In [None]:
# Show the remaining distinctive terms: female line first, male line second.
print(female_300, male_300, sep="\n")

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score the female and male term strings with one shared VADER analyzer;
# fscores and mscores hold the respective polarity_scores() results.
analyzer = SentimentIntensityAnalyzer()
fscores, mscores = (
    analyzer.polarity_scores(terms) for terms in (female_300, male_300)
)