# Data extraction

In [None]:
import fitz

def pdf_extract(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding the text of all pages in order. A document
        without pages yields an empty string.
    """
    # The context manager closes the document (and its underlying file
    # handle) even when a page fails to load; previously it was never closed.
    with fitz.open(pdf_path) as pdf_document:
        page_texts = [
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        ]

    # Join once instead of growing a string page by page.
    return "".join(page_texts)

In [1]:
#Read all the PDF files


In [None]:
#Put them into a list
# NOTE(review): the list is left empty here; the cleaning cell further down
# iterates over it, so it presumably has to hold one extracted text per
# textbook (e.g. the output of pdf_extract) before the rest is run - confirm.
textbooks = []

# Data Cleaning

In [None]:
import re

import nltk
from langdetect import detect
from langdetect import LangDetectException
# The NLTK stopword corpus and tokenizer data must be available locally
# (see nltk.download) for the two helpers below to work.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
def clean_text_lower(post):
    """Normalise one text for the word-level analysis.

    The text is lowercased, every character that is neither an ASCII letter
    nor whitespace is replaced by a space, the result is tokenised and English
    stopwords are dropped.

    Args:
        post: Raw text to clean.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when the text
        is not detected as English or its language cannot be detected at all.
    """
    # detect() raises LangDetectException when the text offers nothing to
    # classify (e.g. an empty string); treat that like non-English text
    # instead of aborting the whole run.
    try:
        if detect(post) != 'en':
            return ''
    except LangDetectException:
        return ''

    # Make posts lowercase
    post = post.lower()

    # Replace punctuation, digits and any other non-letter with a space
    post = re.sub(r'[^a-zA-Z\s]', ' ', post)

    # Remove stopwords
    # NOTE(review): the English stopword list presumably contains pronouns
    # such as "he", "she", "him", "her"; the gender word lists defined later
    # search for those - confirm that dropping them here is intended.
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(post)
    filtered_text = [word for word in word_tokens if word not in stop_words]

    return ' '.join(filtered_text)

In [None]:
# Clean every extracted textbook. The cleaning function defined above is
# named clean_text_lower; the previous call to `clean_text` raised NameError.
cleanTextbooks = [clean_text_lower(textbook) for textbook in textbooks]

In [None]:
# Gendered vocabulary, kept in two parallel forms that must stay in sync.
#
# Bare tokens: used for word-by-word comparison and to throw these words away
# during TFIDF. Titles are written without a trailing period ("mr", not "mr.")
# because the cleaning step replaces every non-letter character with a space.
# NOTE(review): for the same reason "ma'am" cannot occur in cleaned text (it
# becomes "ma am") - decide whether to drop or rewrite that entry.
menWordsNoSpace = ["man", "boy", "male", "brother", "father", "son", "husband", "king", "prince", "uncle", "nephew", "he", "him", "his", "gentleman", "sir", "mr", "hero", "lord", "patriarch", "men"]
womenWordsNoSpace = ["woman", "girl", "female", "sister", "mother", "daughter", "wife", "queen", "princess", "aunt", "niece", "she", "her", "hers", "lady", "ma'am", "madam", "mrs", "ms", "miss", "heroine", "dame", "matriarch", "women"]

# Space-padded forms: used for substring search so that only whole words
# match (" man " does not match inside "woman"). Derived from the bare lists
# so the two forms cannot drift apart; previously "ms" and "miss" existed only
# in the padded list and the padded titles carried periods that never match.
menWords = [f" {word} " for word in menWordsNoSpace]
womenWords = [f" {word} " for word in womenWordsNoSpace]

In [None]:
# Cut every cleaned textbook into fixed-size character chunks ("paragraphs")
# and collect, per textbook, the chunks that mention a male / female word.
PARAGRAPH_CHARS = 100  # chunk length in characters

# One bucket per textbook, sized from the data instead of a hard-coded 18
# (which raised IndexError for fewer books and ignored any extra ones).
menPara = [[] for _ in cleanTextbooks]
womenPara = [[] for _ in cleanTextbooks]
for book_index, text in enumerate(cleanTextbooks):
    # NOTE(review): chunks are cut at fixed offsets, so a word can be split
    # across two chunks or lose its surrounding space at a chunk edge; such
    # occurrences are not matched by the space-padded word lists.
    paragraphs = [text[start:start + PARAGRAPH_CHARS]
                  for start in range(0, len(text), PARAGRAPH_CHARS)]
    for paragraph in paragraphs:
        # A chunk is appended once per distinct matching word, so a chunk
        # containing two male words is stored twice.
        for gender_word in menWords:
            if gender_word in paragraph:
                menPara[book_index].append(paragraph)
        for gender_word in womenWords:
            if gender_word in paragraph:
                womenPara[book_index].append(paragraph)

# Occurrence

In [None]:
# Total number of stored paragraphs across all textbooks, per gender.
men_count = sum(map(len, menPara))
print(men_count)
women_count = sum(map(len, womenPara))
print(women_count)

# Firstness

In [None]:
# Collect adjacent word pairs in which a female word is immediately followed
# by a male word.
femaleFirst = []
# womenPara holds one list of paragraphs per textbook, so it has to be walked
# two levels deep; calling .split() on the inner list raised AttributeError.
# NOTE(review): a paragraph stored more than once in womenPara contributes
# its pairs once per copy.
for book_paragraphs in womenPara:
    for paragraph in book_paragraphs:
        words = paragraph.split()  # Split paragraph into words
        # Pair every word with its successor.
        for first, second in zip(words, words[1:]):
            if first.lower() in womenWordsNoSpace and second.lower() in menWordsNoSpace:
                femaleFirst.append(first + ' ' + second)
print(len(femaleFirst),femaleFirst)

In [None]:
# Collect adjacent word pairs in which a male word is immediately followed by
# a female word.
maleFirst = []
# menPara holds one list of paragraphs per textbook, so it has to be walked
# two levels deep; calling .split() on the inner list raised AttributeError.
# NOTE(review): a paragraph stored more than once in menPara contributes its
# pairs once per copy.
for book_paragraphs in menPara:
    for paragraph in book_paragraphs:
        words = paragraph.split()  # Split paragraph into words
        # Pair every word with its successor.
        for first, second in zip(words, words[1:]):
            if first.lower() in menWordsNoSpace and second.lower() in womenWordsNoSpace:
                maleFirst.append(first + ' ' + second)
print(len(maleFirst),maleFirst)

# TFIDF

In [None]:
%matplotlib inline
import string
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Shared word-level TF-IDF vectorizer: NLTK's word_tokenize splits the text,
# the vocabulary is capped at 500 features and raw (non-sublinear) term
# frequencies are used. The same instance is re-fitted by the cells below.
tfidf = TfidfVectorizer(analyzer='word',
                        sublinear_tf=False,
                        max_features=500,
                        tokenizer=nltk.word_tokenize)

In [None]:
# Fit the vectorizer on the male-context paragraphs and list the 100 terms
# with the highest IDF.
# NOTE(review): cleanMenParaList is not defined anywhere in the visible
# notebook; presumably it is the flattened menPara with the gender words
# removed - confirm and add the cell that builds it.
tdidf_men = tfidf.fit(cleanMenParaList)
inds = np.argsort(tfidf.idf_)[::-1][:100]
# vocabulary_ maps term -> column index and idf_ is ordered by column index.
# list(vocabulary_) follows dict insertion order instead, which paired terms
# with the wrong scores; invert the mapping to look terms up by column.
index_to_term = {index: term for term, index in tfidf.vocabulary_.items()}
top_IDF_tokens_men = [index_to_term[int(ind)] for ind in inds]
top_IDF_scores_men = tfidf.idf_[inds]
print(top_IDF_tokens_men)

In [None]:
# Re-fit the same vectorizer on the female-context paragraphs and list the
# 100 terms with the highest IDF.
# NOTE(review): cleanWomenParaList is not defined anywhere in the visible
# notebook; presumably it is the flattened womenPara with the gender words
# removed - confirm and add the cell that builds it.
tdidf_women = tfidf.fit(cleanWomenParaList)
inds = np.argsort(tfidf.idf_)[::-1][:100]
# vocabulary_ maps term -> column index and idf_ is ordered by column index.
# list(vocabulary_) follows dict insertion order instead, which paired terms
# with the wrong scores; invert the mapping to look terms up by column.
index_to_term = {index: term for term, index in tfidf.vocabulary_.items()}
top_IDF_tokens_women = [index_to_term[int(ind)] for ind in inds]
top_IDF_scores_women = tfidf.idf_[inds]
print(top_IDF_tokens_women)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
pos_dict_men = {}
pos_dict_women={}

def get_pos(word):
    """Run ``word`` through the spaCy pipeline and describe its first token.

    Returns:
        A ``(text, pos_)`` tuple for the first token of the parsed input, or
        ``None`` when the pipeline yields no tokens.
    """
    first_token = next(iter(nlp(word)), None)
    if first_token is not None:
        return first_token.text, first_token.pos_

# Group the top-IDF tokens of each gender by the part-of-speech tag that
# get_pos reports for them.
for top_tokens, pos_buckets in ((top_IDF_tokens_men, pos_dict_men),
                                (top_IDF_tokens_women, pos_dict_women)):
    for top_token in top_tokens:
        word_text, pos = get_pos(top_token)
        # Create the list for a tag on first sight, then add the word to it.
        pos_buckets.setdefault(pos, []).append(word_text)

# Show only the content-word categories for the male-context tokens.
include_pos = {"NOUN", "VERB", "ADJ", "ADV"}
for pos, words in pos_dict_men.items():
    if pos not in include_pos:
        continue
    print(f"{pos}: {words}")

In [None]:
# Show the same part-of-speech categories for the female-context tokens.
for pos, words in pos_dict_women.items():
    if pos not in include_pos:
        continue
    print(f"{pos}: {words}")