In [None]:
from pypdf import PdfReader
import spacy
from collections import Counter
from transformers import pipeline
import os
import json
from googletrans import Translator
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# googletrans client, instantiated once at notebook level.
translator = Translator()
# Lower-case language code for the current run.
language = 'en'
# Page offset: 1 when the language is French, 0 for everything else.
# NOTE(review): presumably meant to be passed as the `start` argument of the
# PDF helpers so a leading page is skipped for French files - confirm.
start = 1 if language == 'fr' else 0

In [None]:
def load_spacy_model(language):
    """Load the spaCy pipeline registered for a language code.

    Args:
        language: "EN", "FR" or "ES". The match is case-insensitive and
            ignores surrounding whitespace, so "en" and "EN" are equivalent
            (the same normalisation tokenize_text applies with `.upper()`).

    Returns:
        Whatever `spacy.load` returns for the selected model name.

    Raises:
        ValueError: if `language` is not a string or is not one of the
            supported codes.
    """
    # English uses the medium model; French and Spanish use the small ones.
    model_names = {
        "EN": "en_core_web_md",
        "FR": "fr_core_news_sm",
        "ES": "es_core_news_sm",
    }
    model_name = None
    if isinstance(language, str):
        model_name = model_names.get(language.strip().upper())
    if model_name is None:
        raise ValueError(f"Unsupported language: {language}")
    return spacy.load(model_name)
        
def extract_text_from_pdf(pdf_path, start = 0):
    """Return the text of a PDF as one string.

    Args:
        pdf_path: path handed to `PdfReader`.
        start: index of the first page to include; earlier pages are skipped.

    Returns:
        The extracted text of every selected page, each followed by a single
        space (so a non-empty result ends with a space).
    """
    selected_pages = PdfReader(pdf_path).pages[start:]
    pieces = [page.extract_text() + " " for page in selected_pages]
    return "".join(pieces)

In [None]:
def analyze_text(text, nlp=None):
    """Count the coarse part-of-speech tags (`token.pos_`) in a text.

    Args:
        text: the text to analyse.
        nlp: callable that turns `text` into an iterable of tokens exposing
            `pos_` (e.g. a loaded spaCy pipeline). When omitted, the function
            falls back to a notebook-level variable named `nlp`, which is what
            the original implementation relied on implicitly.

    Returns:
        A `Counter` mapping each `pos_` value to its number of occurrences.

    Raises:
        NameError: if `nlp` is not passed and no global `nlp` exists.
    """
    if nlp is None:
        # No cell in the notebook defines a global `nlp`, so make the
        # dependency explicit and fail with a helpful message if it is absent.
        try:
            nlp = globals()["nlp"]
        except KeyError:
            raise NameError(
                "name 'nlp' is not defined; pass a spaCy pipeline via the `nlp` argument"
            ) from None
    doc = nlp(text)
    return Counter(token.pos_ for token in doc)
def get_author_name(text):
    """Ask a question-answering model who wrote the article.

    A "question-answering" pipeline is built on every call with the
    `timpal0l/mdeberta-v3-base-squad2` model and queried with a French
    question ("Who is the author of the article?").

    Args:
        text: the article text; only its first 500 characters are used as
            context for the model.

    Returns:
        The `'answer'` entry of the pipeline result.
    """
    qa_model = pipeline("question-answering", model="timpal0l/mdeberta-v3-base-squad2")
    author_question = "Qui est l'auteur de l'article?"
    leading_context = text[:500]
    prediction = qa_model(question=author_question, context=leading_context)
    return prediction['answer']

In [None]:
def return_preprocessed_text_from_pdf(pdf_path, language='EN', start=0):
    """Extract a PDF's text and reduce it to one lower-cased string.

    Blank lines are dropped and every kept line is stripped. For English
    documents only the lines between a line reading exactly "abstract" and a
    line reading exactly "references" (both compared case-insensitively) are
    kept; the two heading lines themselves are excluded. If no "abstract"
    line is found, the English result is therefore an empty string. For any
    other language every non-blank line is kept.

    Args:
        pdf_path: path of the PDF, forwarded to extract_text_from_pdf.
        language: language code; "EN" in any letter case selects the
            abstract-to-references filtering.
        start: index of the first page to read.

    Returns:
        The kept lines joined with single spaces and lower-cased.
    """
    # Normalise the code so 'en' and 'EN' are treated alike, matching the
    # `.upper()` normalisation used by tokenize_text.
    is_english = isinstance(language, str) and language.strip().upper() == 'EN'

    kept_lines = []
    inside_body = False
    for raw_line in extract_text_from_pdf(pdf_path, start).split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if not is_english:
            kept_lines.append(line)
            continue

        heading = line.lower()
        if heading == 'abstract':
            # Start collecting after the heading; the heading itself is skipped.
            inside_body = True
            continue
        if heading == 'references':
            # Stop collecting; the bibliography is not part of the corpus.
            inside_body = False
        if inside_body:
            kept_lines.append(line)
    return ' '.join(kept_lines).lower()

In [None]:
def tokenize_text(text, language="EN"):
    """Tokenize `text` with NLTK's `word_tokenize`.

    Args:
        text: the string to tokenize.
        language: "EN", "FR" or "ES" in any letter case. Any other code falls
            back to the English tokenizer.

    Returns:
        The token list produced by `word_tokenize`.
    """
    code = language.upper()
    if code == "FR":
        nltk_language = "french"
    elif code == "ES":
        nltk_language = "spanish"
    else:
        # "EN" and every unrecognised code use the English tokenizer.
        nltk_language = "english"
    return word_tokenize(text, language=nltk_language)

In [None]:
def tokenize_text(text, language="EN"):
    """Split `text` into tokens using NLTK's `word_tokenize`.

    `language` is upper-cased and looked up among "EN", "FR" and "ES";
    unknown codes are tokenized as English.

    NOTE(review): the preceding notebook cell already defines tokenize_text
    with the same behaviour; this later definition replaces it when the
    cells run top to bottom. One of the two cells can be deleted.
    """
    supported = (("EN", "english"), ("FR", "french"), ("ES", "spanish"))
    requested = language.upper()
    nltk_language = next(
        (name for code, name in supported if code == requested),
        "english",
    )
    return word_tokenize(text, language=nltk_language)

In [None]:
def calculate_ttr(text, language="EN"):
    """Type-token ratio: distinct tokens divided by total tokens.

    Tokens are used exactly as returned by tokenize_text, with no filtering
    applied here.

    Args:
        text: the text to measure.
        language: language code forwarded to tokenize_text so that French and
            Spanish papers can be tokenized with their own tokenizer. Defaults
            to "EN", which reproduces the previous English-only behaviour.

    Returns:
        The ratio as a float, or 0 when the text yields no tokens.
    """
    tokens = tokenize_text(text, language)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

def calculate_hapax_legomena(text, language="EN"):
    """Share of tokens that occur exactly once in the text.

    Args:
        text: the text to measure.
        language: language code forwarded to tokenize_text. Defaults to "EN",
            which reproduces the previous English-only behaviour.

    Returns:
        Number of tokens seen exactly once divided by the total token count,
        or 0 when the text yields no tokens.
    """
    tokens = tokenize_text(text, language)
    if not tokens:
        return 0
    frequencies = Counter(tokens)
    # Iterate the counts directly instead of re-indexing the Counter per word.
    seen_once = sum(1 for count in frequencies.values() if count == 1)
    return seen_once / len(tokens)

def calculate_msl(text, nlp):
    """Mean sentence length: non-punctuation tokens per sentence.

    Args:
        text: the text to measure.
        nlp: callable returning a document that is iterable over tokens
            (each with `is_punct`) and exposes `sents`.

    Returns:
        Word count divided by sentence count, or 0 when the document has no
        sentences.
    """
    parsed = nlp(text)
    sentence_count = sum(1 for _ in parsed.sents)
    word_count = 0
    for token in parsed:
        if not token.is_punct:
            word_count += 1
    if sentence_count == 0:
        return 0
    return word_count / sentence_count

In [None]:
#involvement metrics
def calculate_involvement_rate(text, nlp):
    """Rate of "involved" features per non-punctuation token.

    The numerator adds up three counts: personal pronouns, question marks
    (tokens whose text is "?") and coordinators (tokens whose dependency
    label is "cc").

    A token counts as a personal pronoun when its coarse tag is "PRON" and
    either its fine-grained tag is "PRP"/"PRP$" or its morphology carries
    `PronType=Prs`. The fine-grained check alone only works for pipelines
    that emit Penn Treebank tags; the morphology check lets pipelines with a
    different tag set (the French and Spanish models loaded by this notebook)
    contribute pronouns as well instead of always counting zero.

    Args:
        text: the text to measure.
        nlp: callable returning an iterable of spaCy-like tokens exposing
            `is_punct`, `pos_`, `tag_`, `morph`, `text` and `dep_`.

    Returns:
        (pronouns + questions + coordinators) / word count, or 0 when the
        text contains no non-punctuation token.
    """
    penn_personal_tags = {"PRP", "PRP$"}
    doc = nlp(text)

    total_words = 0
    personal_pronouns = 0
    questions = 0
    coordination = 0
    # Single pass over the document instead of four separate scans.
    for token in doc:
        if not token.is_punct:
            total_words += 1
        if token.pos_ == "PRON" and (
            token.tag_ in penn_personal_tags
            or "Prs" in token.morph.get("PronType")
        ):
            personal_pronouns += 1
        if token.text == "?":
            questions += 1
        if token.dep_ == "cc":
            coordination += 1

    if not total_words:
        return 0
    return (personal_pronouns + questions + coordination) / total_words

def calculate_informational_rate(text, nlp):
    """Rate of "informational" features per non-punctuation token.

    The numerator adds up: the number of noun chunks, the number of nouns
    longer than six characters (used as a stand-in for technical terms), and
    the number of tokens that look like a number or start with "http" or "@".

    Args:
        text: the text to measure.
        nlp: callable returning a document that is iterable over tokens and
            exposes `noun_chunks`.

    Returns:
        The summed counts divided by the word count, or 0 when the text
        contains no non-punctuation token.
    """
    parsed = nlp(text)
    word_total = sum(1 for token in parsed if not token.is_punct)
    chunk_total = sum(1 for _ in parsed.noun_chunks)

    long_noun_total = 0
    reference_total = 0
    for token in parsed:
        if token.pos_ == "NOUN" and len(token.text) > 6:
            long_noun_total += 1
        if token.like_num or token.text.startswith(("http", "@")):
            reference_total += 1

    if not word_total:
        return 0
    return (chunk_total + long_noun_total + reference_total) / word_total

def calculate_iir(text, nlp):
    """Ratio of the involvement rate to the informational rate.

    Both rates are computed from scratch, so `nlp` is applied to `text` once
    inside each helper.

    Returns:
        involvement / informational, or `float('inf')` when the informational
        rate is zero.
    """
    involvement_score = calculate_involvement_rate(text, nlp)
    informational_score = calculate_informational_rate(text, nlp)
    if not informational_score:
        return float('inf')
    return involvement_score / informational_score

In [None]:
# Load the paper metadata (presumably one row per paper - confirm against the CSV).
df = pd.read_csv("Metadata-GENDER.csv")

# process_pdfs_in_folder compares these columns as strings, so force them to
# str up front. (Fixes the `astyp` typo, which raised AttributeError.)
for key_column in ('FileName', 'Paper_Language', 'Paper_Subject'):
    df[key_column] = df[key_column].astype(str)

# Placeholder columns, filled per matching row by process_pdfs_in_folder.
df['corpus'] = ""
for metric_column in ('ttr', 'hapax', 'msl', 'involvement_rate', 'informational_rate', 'iir'):
    df[metric_column] = np.nan

def process_pdfs_in_folder(folder_path, lang, sub):
    """Compute the text metrics for every PDF in a folder and store them in `df`.

    For each `.pdf` file whose name contains a number, the first run of
    digits is matched (case-insensitively, as a string) against
    `df['FileName']`, together with `lang` against `df['Paper_Language']` and
    `sub` against `df['Paper_Subject']`. Only the matching rows of the
    notebook-level `df` are updated, in place.

    Args:
        folder_path: directory containing the PDF files.
        lang: language code of the papers in the folder (e.g. "EN").
        sub: subject code of the papers in the folder (e.g. "HIS").

    Returns:
        None. The columns 'corpus', 'ttr', 'hapax', 'msl',
        'involvement_rate', 'informational_rate' and 'iir' of `df` are
        written for the matched rows.
    """
    # Loaded lazily and only once per folder: reloading the spaCy model for
    # every single file is slow and gives the same pipeline each time.
    nlp = None

    # sorted() makes the processing order independent of the file system.
    for i, filename in enumerate(sorted(os.listdir(folder_path)), start=1):
        if not filename.lower().endswith('.pdf'):
            continue
        match = re.search(r'(\d+)', filename)
        if not match:
            continue

        file_number = match.group(1)
        # NOTE(review): this prints the running directory-entry counter, not
        # `file_number` - confirm which one the message is meant to show.
        print(f"Extracted file number: {i}")

        row = df[(df['FileName'].str.lower() == file_number.lower()) &
                 (df['Paper_Language'].str.lower() == lang.lower()) &
                 (df['Paper_Subject'].str.lower() == sub.lower())]
        if row.empty:
            continue

        if nlp is None:
            nlp = load_spacy_model(lang)

        pdf_path = os.path.join(folder_path, filename)
        pre_processed_text = return_preprocessed_text_from_pdf(pdf_path, lang)

        # Every metric is written to the matched rows only. The previous code
        # referenced an undefined `text` variable and assigned ttr/hapax/msl
        # to the whole column, overwriting the values of all other papers.
        df.loc[row.index, 'corpus'] = pre_processed_text
        df.loc[row.index, 'ttr'] = calculate_ttr(pre_processed_text)
        df.loc[row.index, 'hapax'] = calculate_hapax_legomena(pre_processed_text)
        df.loc[row.index, 'msl'] = calculate_msl(pre_processed_text, nlp)
        df.loc[row.index, 'involvement_rate'] = calculate_involvement_rate(pre_processed_text, nlp)
        df.loc[row.index, 'informational_rate'] = calculate_informational_rate(pre_processed_text, nlp)
        df.loc[row.index, 'iir'] = calculate_iir(pre_processed_text, nlp)

In [None]:
# Root folder of the dataset; each sub-folder holds the PDFs of one
# language/subject combination.
DATASET_DIR = "D:/Text Analytics/Term Paper/dataset"

# (sub-folder name, language code, subject code), processed in this order.
CORPUS_FOLDERS = [
    ("EN-His", "EN", "HIS"),
    ("EN-Psy", "EN", "PSY"),
    ("FR-His", "FR", "HIS"),
    ("FR-Psy", "FR", "PSY"),
    ("ES-His", "ES", "HIS"),
    ("ES-Psy", "ES", "PSY"),
]

# One loop instead of six copy-pasted stanzas; the printed labels and the
# arguments passed to process_pdfs_in_folder are unchanged.
for folder_name, lang_code, subject_code in CORPUS_FOLDERS:
    folder_path = f"{DATASET_DIR}/{folder_name}"
    print(f"{lang_code}-{subject_code}")
    process_pdfs_in_folder(folder_path, lang_code, subject_code)

In [None]:
# Raw string: `\T` and `\m` are not valid escape sequences, so the non-raw
# literal only worked by accident and triggers an invalid-escape warning on
# recent Python versions. The resulting path is unchanged.
output_file_path = r"D:\Text Analytics\Term Paper\metrics_output.csv"
# The full text column is left out of the export; only metadata and metrics are saved.
df_without_corpus = df.drop(columns=['corpus'])
df_without_corpus.to_csv(output_file_path, index=False)
print(f"DataFrame saved to {output_file_path}")

In [None]:
# Graphs for the involvement metrics.
# NOTE(review): the CSV is read from the current working directory, whereas
# the export cell writes to an absolute path - confirm both point to the same file.
metric_columns = ['ttr', 'hapax', 'msl', 'involvement_rate', 'informational_rate', 'iir']
df_without_corpus = pd.read_csv("metrics_output.csv")
# Treat infinite values as missing, then drop rows with any missing metric.
df_cleaned = (
    df_without_corpus
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=metric_columns)
)
# Keep only the rows in which every metric is non-zero.
nonzero_rows = (df_cleaned[metric_columns] != 0).all(axis=1)
df_cleaned = df_cleaned[nonzero_rows]
sns.set(style="whitegrid")
gender_palette = {'F': '#FF69B4', 'M': '#1E90FF'}

def create_violin_subplots(df_cleaned, category, typ, choice, metrics):
    """Draw one split violin plot per metric, side by side, and save the figure.

    Each panel shows the metric on the y axis, `category` on the x axis and
    the two halves of every violin split by the "GENDER" column.

    Args:
        df_cleaned: DataFrame holding `category`, "GENDER" and the metric columns.
        category: name of the column used for the x axis.
        typ: short tag used in the output file name.
        choice: second tag used in the output file name.
        metrics: non-empty sequence of metric column names; one panel is
            created per entry.

    Raises:
        ValueError: if `metrics` is empty.

    The figure is written to "violin-{typ}-{choice}.png" in the current
    working directory, shown, and then closed.
    """
    metrics = list(metrics)
    if not metrics:
        raise ValueError("metrics must contain at least one column name")

    # One panel per metric. squeeze=False always returns a 2-D array of axes,
    # so one metric and many metrics are handled the same way (the previous
    # fixed 1x3 grid broke for any metric count other than three).
    fig, axes = plt.subplots(1, len(metrics), figsize=(4 * len(metrics), 6), squeeze=False)
    for ax, metric in zip(axes.ravel(), metrics):
        sns.violinplot(x=category, y=metric, hue="GENDER", data=df_cleaned, split=True, palette=gender_palette, ax=ax)
        ax.set_title(f"{metric.upper()} by {category}")
        ax.set_ylabel(metric.replace('_', ' ').title())
        ax.set_xlabel(category)
        ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.savefig(f"violin-{typ}-{choice}.png")
    plt.show()
    plt.close(fig)

def create_bar_subplots(df_cleaned, category, typ, choice, metrics):
    """Draw one grouped bar chart of per-group means per metric and save the figure.

    For every metric the mean is computed per (`category`, "GENDER") group
    and plotted with `category` on the x axis and one bar per gender.

    Args:
        df_cleaned: DataFrame holding `category`, "GENDER" and the metric columns.
        category: name of the column used for the x axis and for grouping.
        typ: short tag used in the output file name.
        choice: second tag used in the output file name.
        metrics: non-empty sequence of metric column names; one panel is
            created per entry.

    Raises:
        ValueError: if `metrics` is empty.

    The figure is written to "bar-{typ}-{choice}.png" in the current working
    directory, shown, and then closed.
    """
    metrics = list(metrics)
    if not metrics:
        raise ValueError("metrics must contain at least one column name")

    # One panel per metric. squeeze=False always returns a 2-D array of axes,
    # so one metric and many metrics are handled the same way (the previous
    # fixed 1x3 grid broke for any metric count other than three).
    fig, axes = plt.subplots(1, len(metrics), figsize=(4 * len(metrics), 6), squeeze=False)
    for ax, metric in zip(axes.ravel(), metrics):
        df_grouped = df_cleaned.groupby([category, 'GENDER'])[metric].mean().reset_index()
        sns.barplot(x=category, y=metric, hue='GENDER', data=df_grouped, ax=ax, palette=gender_palette)
        ax.set_title(f"{metric.upper()} by {category}")
        ax.set_ylabel(metric.replace('_', ' ').title())
        ax.set_xlabel(category)
        ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.savefig(f"bar-{typ}-{choice}.png")
    plt.show()
    plt.close(fig)

def create_single_plot(df_cleaned, category, typ, choice, metric, plot_type):
    """Draw a single violin or bar figure for one metric and save it.

    Args:
        df_cleaned: DataFrame holding `category`, "GENDER" and `metric`.
        category: name of the column used for the x axis.
        typ: short tag used in the output file name.
        choice: second tag used in the output file name.
        metric: name of the metric column plotted on the y axis.
        plot_type: 'violin' draws a split violin plot of the raw values; any
            other value draws a bar chart of the per-group means.

    The figure is written to "{plot_type}-{typ}-{choice}-{metric}.png" in the
    current working directory, shown, and then closed.
    """
    plt.figure(figsize=(10, 6))
    shared_kwargs = dict(x=category, y=metric, hue="GENDER", palette=gender_palette)
    if plot_type != 'violin':
        # Bars show the mean of the metric per (category, gender) group.
        group_means = df_cleaned.groupby([category, 'GENDER'])[metric].mean().reset_index()
        sns.barplot(data=group_means, **shared_kwargs)
    else:
        sns.violinplot(data=df_cleaned, split=True, **shared_kwargs)

    pretty_metric = metric.replace('_', ' ').title()
    plt.title(f"{metric.upper()} by {category}")
    plt.ylabel(pretty_metric)
    plt.xlabel(category)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{plot_type}-{typ}-{choice}-{metric}.png")
    plt.show()
    plt.close()

def call_with_metrics(choice):
    """Produce the figures for the notebook-level `df_cleaned` frame.

    Args:
        choice: 'all' draws the three-panel violin and bar figures for the
            involvement rate, informational rate and IIR; any other value
            draws stand-alone violin and bar figures for 'iir' only. The
            value is also used in the output file names.

    In both modes the figures are produced twice: once grouped by
    'Paper_Language' (tag 'l') and once by 'Paper_Subject' (tag 's').
    """
    groupings = [('Paper_Language', 'l'), ('Paper_Subject', 's')]

    if choice != 'all':
        for category, typ in groupings:
            for kind in ('violin', 'bar'):
                create_single_plot(df_cleaned, category, typ, choice, 'iir', plot_type=kind)
        return

    metrics = ['involvement_rate', 'informational_rate', 'iir']
    for category, typ in groupings:
        create_violin_subplots(df_cleaned, category, typ, choice, metrics)
        create_bar_subplots(df_cleaned, category, typ, choice, metrics)

# Render the three-metric panels first, then the stand-alone IIR figures.
for plot_choice in ('all', 'one'):
    call_with_metrics(plot_choice)