In [13]:
from pypdf import PdfReader
import spacy
from collections import Counter
from transformers import pipeline
import os
import json
from googletrans import Translator
import nltk
from nltk.tokenize import word_tokenize

In [14]:
translator = Translator()
language = 'en'
start = 0 if language!='fr' else 1

In [15]:
def extract_text_from_pdf(pdf_path, start = 0):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages[start:]:
        text += page.extract_text() + " "
    return text
def analyze_text(text):
    doc = nlp(text)
    pos_counts = Counter([token.pos_ for token in doc])
    return pos_counts

In [16]:
def return_preprocessed_text_from_pdf(pdf_path):
    t = extract_text_from_pdf(pdf_path, start).split('\n')
    pre_processed_text = []
    index = -1
    for i in t:
        if language!= 'en':
            pre_processed_text.append(i)
        else:
            if index ==0:
                pre_processed_text.append(i)
            if i=='Abstract':
                index = 0
            if i=='References':
                index = -1
    pre_processed_text = ' '.join(pre_processed_text)
    return pre_processed_text.lower()

In [17]:
import pandas as pd

# Load the Excel file
xlsx_path = "Metadata.xlsx"
excel_data = pd.ExcelFile(xlsx_path)

# Dictionary to store gender arrays for each sheet
gender_data = {}

for sheet in excel_data.sheet_names:
    
    df = excel_data.parse(sheet)
    
    # Drop rows where Gender or FileName is missing
    df = df.dropna(subset=["Gender", "FileName"])
    
    # Ensure FileName is integer for consistency
    df["FileName"] = df["FileName"].astype(int)
    
    male_files = df[df["Gender"] == "M"]["FileName"].tolist()
    female_files = df[df["Gender"] == "F"]["FileName"].tolist()
    
    gender_data[sheet] = {
        "Male": male_files,
        "Female": female_files
    }

# Print gender arrays for each sheet
for sheet, genders in gender_data.items():
    print(f"Sheet: {sheet}")
    print("  Male:", genders["Male"])
    print("  Female:", genders["Female"])
    print()


Sheet: EN-His
  Male: [1, 4, 5, 7, 8, 9, 11, 12, 14, 16, 19, 20, 21, 22, 23, 24]
  Female: [2, 3, 6, 10, 13, 15, 17, 18, 25]

Sheet: EN-Psy
  Male: [1, 2, 3, 4, 10, 14, 16, 17, 19, 20, 23, 24, 25, 26, 27, 28, 29]
  Female: [5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 21, 22, 30]

Sheet: FR-His
  Male: [2, 9, 12, 16, 18, 19, 21, 23, 24]
  Female: [1, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 17, 20, 22, 25]

Sheet: FR-Psy
  Male: [1, 2, 3, 8, 9, 10, 11, 12, 14, 15, 20, 21, 22, 24, 25, 26]
  Female: [4, 5, 6, 7, 13, 16, 17, 18, 19, 23, 27]

Sheet: ES-His
  Male: [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25]
  Female: [2, 13, 14, 16]

Sheet: ES-Psy
  Male: [3, 4, 8, 12, 14, 15, 16, 17, 18, 19, 20, 22, 23, 25]
  Female: [1, 2, 5, 6, 7, 9, 10, 11, 13, 21, 24]



In [18]:
Dataset = {}
for sheet, genders in gender_data.items():
    Dataset[sheet] = {}
    print(sheet.split('-')[0])
    if(sheet.split('-')[0])=='EN':
        nlp = spacy.load("en_core_web_sm") 
        start = 0
    elif sheet.split('-')[0] == 'FR':
        nlp = spacy.load("fr_core_news_sm")
        start = 1
    else:
        nlp = spacy.load("es_core_news_sm")
        start = 1
    male = genders['Male']
    female = genders['Female']
    Dataset[sheet]['Male'] = []
    Dataset[sheet]['Female'] = []
    for file in male:
        if os.path.exists('./' + sheet + '/' + str(file) + '.pdf'):
            temp = extract_text_from_pdf('./' + sheet + '/' + str(file) + '.pdf', start)
            Dataset[sheet]['Male'].append([temp, file])
    for file in female:
        if os.path.exists('./' + sheet + '/' + str(file) + '.pdf'):
            temp = extract_text_from_pdf('./' + sheet + '/' + str(file) + '.pdf', start)
            Dataset[sheet]['Female'].append([temp, file])

EN
EN
FR
FR
ES


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 76 0 (offset 0)
Ignoring wrong po

ES


In [19]:
def tokenize_text(text, language):
    return word_tokenize(text, language=language)
def calculate_ttr(text, language):
    tokens = tokenize_text(text, language)
    unique_tokens = set(tokens)
    return len(unique_tokens) / len(tokens) if tokens else 0

def calculate_hapax_legomena(text, language):
    tokens = tokenize_text(text, language)
    word_counts = Counter(tokens)
    hapax_legomena = sum(1 for word in word_counts if word_counts[word] == 1)
    return hapax_legomena / len(tokens) if tokens else 0

def calculate_msl(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    total_words = len([token.text for token in doc if not token.is_punct])
    return total_words / len(sentences) if sentences else 0

def flesch_Kincaid_Readability_Score(text):
    doc = nlp(text)
    total_sentences = len(list(doc.sents))
    total_words = len([token.text for token in doc if not token.is_punct])
    total_syllables = sum([len(list(token._.syllables)) for token in doc if not token.is_punct])
    
    if total_words == 0 or total_sentences == 0:
        return 0
    
    score = 206.835 - (1.015 * (total_words / total_sentences)) - (84.6 * (total_syllables / total_words))
    return score

def pronoun_usage(text):
    doc = nlp(text)
    pronouns = [token.text for token in doc if token.pos_ == "PRON"]
    pronoun_counts = Counter(pronouns)
    return dict(pronoun_counts)
def chi_square_text(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_punct]
    word_counts = Counter(tokens)
    total_words = len(tokens)
    expected_freq = total_words / len(word_counts) if word_counts else 0
    chi_square = sum((observed - expected_freq) ** 2 / expected_freq for observed in word_counts.values() if observed > 0)
    return chi_square

In [58]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from gender_guesser.detector import Detector

gender_detector = Detector()
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

def extract_in_text_citations(text):
    """Extract both author-year and numeric citations"""
    citations = []
    author_year = re.findall(r'\(([A-Z][a-z]+(?:,\s\d{4}| et al., \d{4}| & [A-Z][a-z]+, \d{4}))\)', text)
    numeric = re.findall(r'\[(\d+(?:-\d+)?)\]', text)
    return author_year + numeric

def get_first_author(reference_text):
    inputs = tokenizer(reference_text, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    authors = []
    current_author = []
    
    for token, pred in zip(tokens, predictions[0].numpy()):
        label = model.config.id2label[pred]
        if label == "B-PER":
            if current_author:
                authors.append(" ".join(current_author))
                break 
            current_author = [token.replace("##", "")]
        elif label == "I-PER" and current_author:
            current_author.append(token.replace("##", ""))
    
    return authors[0] if authors else None

def analyze_citations(text, language="english"):
    citations = extract_in_text_citations(text)
    
    # Define hedging phrases for each supported language
    hedging_phrases = {
        'english': {'suggest', 'may', 'might', 'could', 'possibly'},
        'french': {'suggère', 'peut', 'pourrait', 'éventuellement', 'probablement'},
        'spanish': {'sugiere', 'puede', 'podría', 'posiblemente', 'quizás'}
    }
    
    # Select the appropriate set of phrases based on the language parameter
    selected_phrases = hedging_phrases.get(language.lower(), hedging_phrases['english'])
    
    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    hedging_cites = 0
    
    for sent in sentences:
        # Check if the sentence contains any citation
        if any(cite in sent for cite in citations):
            # Check for presence of any hedging phrase in the sentence (case-insensitive)
            if any(phrase in sent.lower() for phrase in selected_phrases):
                hedging_cites += 1
    
    # Calculate the Citation Uncertainty Rate (CUR)
    cur = hedging_cites / len(citations) if citations else 0
    
    return {
        'total_citations': len(citations),
        'cur': cur
    }

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
for sheet in Dataset:
    if(sheet.split('-')[0])=='EN':
        language = "english"
        nlp = spacy.load("en_core_web_sm")
    elif sheet.split('-')[0] == 'FR':
        language = "french"
        nlp = spacy.load("fr_core_news_sm")
    else:
        language = "spanish"
        nlp = spacy.load("es_core_news_sm")
    ttr_m = 0
    ttr_f = 0
    hapax_legomena_m = 0
    hapax_legomena_f = 0
    msl_m = 0
    msl_f = 0
    fkrs_m = 0
    fkrs_f = 0
    cst_m = 0
    cst_f = 0
    cur_f = 0
    cur_m = 0
    print(sheet)
    for text in Dataset[sheet]['Male']:
        ttr_m += calculate_ttr(text[0], language)
        hapax_legomena_m += calculate_hapax_legomena(text[0],  language)
        msl_m += calculate_msl(text[0])
        cur_m += analyze_citations(text[0], language)['cur']
        #fkrs_m += flesch_Kincaid_Readability_Score(text)
        cst_m += chi_square_text(text[0])
    for text in Dataset[sheet]['Female']:
        ttr_f += calculate_ttr(text[0], language)
        hapax_legomena_f += calculate_hapax_legomena(text[0],  language)
        msl_f += calculate_msl(text[0])
        #fkrs_f += flesch_Kincaid_Readability_Score(text)
        cst_f += chi_square_text(text[0])
        cur_f += analyze_citations(text[0],language)['cur']
    ttr_m = ttr_m/len(Dataset[sheet]['Male'])
    hapax_legomena_m = hapax_legomena_m/len(Dataset[sheet]['Male'])
    msl_m = msl_m/len(Dataset[sheet]['Male'])
    #fkrs_m = fkrs_m/len(Dataset[sheet]['Male'])
    cst_m = cst_m/len(Dataset[sheet]['Male'])
    cur_m = cur_m/len(Dataset[sheet]['Male'])
    ttr_f = ttr_f/len(Dataset[sheet]['Female'])
    hapax_legomena_f = hapax_legomena_f/len(Dataset[sheet]['Female'])
    msl_f = msl_f/len(Dataset[sheet]['Female'])
    #fkrs_f = fkrs_f/len(Dataset[sheet]['Female'])
    cst_f = cst_f/len(Dataset[sheet]['Female'])
    cur_f = cur_m/len(Dataset[sheet]['Female'])
    print("TTR:- ",ttr_m,ttr_f)
    print("Hapax Legomena:- ",hapax_legomena_m,hapax_legomena_f)
    print("MSL:- ",msl_m,msl_f)
    print("CUR:-", cur_f, cur_m)
    print("Chi Square Text:- ", cst_m,cst_f)


EN-His
TTR:-  0.3248142880787969 0.2714959907416341
Hapax Legomena:-  0.25043326068202126 0.19665579673174652
MSL:-  16.09588387184193 21.9951025394151
CUR:- 0.3958333333333333 0.04398148148148148
Chi Square Text:-  511886.1919102939 668526.6825014201
EN-Psy
TTR:-  0.21261659129623015 0.2131323496472725
Hapax Legomena:-  0.12623788183266757 0.12657058226128257
MSL:-  20.616088359153014 21.547548514604834
CUR:- 0.12357272038626946 0.009505593875866882
Chi Square Text:-  437174.03284629446 573672.1753473484
FR-His
TTR:-  0.23375645011856594 0.2530662466288745
Hapax Legomena:-  0.149179473271893 0.16478087087262272
MSL:-  16.943398788426375 16.801288622327277
CUR:- 0.0 0.0
Chi Square Text:-  314139.21807099873 272418.3990306876
FR-Psy
TTR:-  0.2241803381696998 0.20460729131691693
Hapax Legomena:-  0.13740116603071093 0.12264160162581268
MSL:-  16.521578359310404 17.853652820681223
CUR:- 0.16168776864403797 0.014698888058548907
Chi Square Text:-  258327.49852094435 313955.9297096981
ES-His

In [25]:
import pandas as pd

results = []

for sheet in Dataset:
    prefix = sheet.split('-')[0]
    if prefix == 'EN':
        language = "english"
        nlp = spacy.load("en_core_web_sm")
    elif prefix == 'FR':
        language = "french"
        nlp = spacy.load("fr_core_news_sm")
    else:
        language = "spanish"
        nlp = spacy.load("es_core_news_sm")

    print(f"Processing sheet: {sheet}")

    for gender in ['Male', 'Female']:
        for file_name, text in enumerate(Dataset[sheet][gender]):
            result = {
                'Sheet': sheet,
                'File': str(text[1])+'.pdf',  # Or replace with actual filename if available
                'Gender': gender,
                'TTR': calculate_ttr(text[0], language),
                'Hapax_Legomena': calculate_hapax_legomena(text[0], language),
                'MSL': calculate_msl(text[0]),
                'CST': chi_square_text(text[0]),
                'CUR': analyze_citations(text[0], language)['cur'],
            }
            results.append(result)

# Create DataFrame
df_results = pd.DataFrame(results)

#Optional: Save to CSV
df_results.to_csv("file_level_text_metrics.csv", index=False)


Processing sheet: EN-His
Processing sheet: EN-Psy
Processing sheet: FR-His
Processing sheet: FR-Psy
Processing sheet: ES-His
Processing sheet: ES-Psy


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
