In [11]:
from pypdf import PdfReader
import spacy
from collections import Counter
from transformers import pipeline
import os
import json
from googletrans import Translator
import nltk
from nltk.tokenize import word_tokenize


In [12]:
# googletrans client; its only visible use is in the commented-out translation cell.
translator = Translator()

# Language code of the corpus being processed; the notebook branches on 'en' and 'fr'.
language = 'en'

# Index of the first PDF page to read: French PDFs skip their first page.
start = 1 if language == 'fr' else 0

In [13]:
# spaCy pipeline package to load for each supported corpus language.
# The French entry previously pointed at the English model, so French text
# was tagged and sentence-split with English rules.
# Requires the matching package to be installed, e.g.
# `python -m spacy download fr_core_news_sm`.
SPACY_MODELS = {
    'en': "en_core_web_sm",
    'fr': "fr_core_news_sm",
}

# As before, `nlp` is left undefined for any other language code.
if language in SPACY_MODELS:
    nlp = spacy.load(SPACY_MODELS[language])
def extract_text_from_pdf(pdf_path, start = 0):
    """Return the concatenated text of a PDF, beginning at page index ``start``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.
        start: Zero-based index of the first page to include.

    Returns:
        A single string in which every page's extracted text is followed by
        one space (so a non-empty result ends with a space).
    """
    selected_pages = PdfReader(pdf_path).pages[start:]
    return "".join(page.extract_text() + " " for page in selected_pages)
def analyze_text(text):
    """Tally the spaCy ``pos_`` tag of every token in ``text``.

    Uses the module-level ``nlp`` pipeline.

    Returns:
        A ``Counter`` mapping each part-of-speech tag to its number of tokens.
    """
    tag_counts = Counter()
    for token in nlp(text):
        tag_counts[token.pos_] += 1
    return tag_counts
def get_author_name(text):
    """Ask a question-answering model who wrote the paper in ``text``.

    Only the first 500 characters of ``text`` are passed as context.

    The pipeline is built on the first call and cached on the function object,
    so processing many documents no longer reloads the model for each one.

    Args:
        text: Full text of the paper.

    Returns:
        The ``'answer'`` field of the pipeline result.
    """
    qa_pipeline = getattr(get_author_name, "_qa_pipeline", None)
    if qa_pipeline is None:
        qa_pipeline = pipeline("question-answering", model="timpal0l/mdeberta-v3-base-squad2")
        get_author_name._qa_pipeline = qa_pipeline

    # The question is asked in French; English equivalent:
    # "Who is the author of the paper?"
    question = "Qui est l'auteur de l'article?"
    result = qa_pipeline(question=question, context=text[:500])
    return result['answer']

In [14]:
def return_preprocessed_text_from_pdf(pdf_path):
    """Extract a PDF's text and return it as one lowercased string.

    Reads the module-level ``start`` (first page index) and ``language``.

    * For any language other than ``'en'`` every extracted line is kept.
    * For ``'en'`` only the lines after an ``Abstract`` heading and before a
      ``References`` heading are kept. Neither heading line is included: the
      ``References`` line used to be appended before the window closed.
      Headings are matched after stripping surrounding whitespace, since
      extracted lines can carry stray spaces.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The kept lines joined by single spaces, lowercased.
    """
    lines = extract_text_from_pdf(pdf_path, start).split('\n')

    if language != 'en':
        kept_lines = lines
    else:
        kept_lines = []
        inside_body = False
        for line in lines:
            heading = line.strip()
            if heading == 'References':
                # Close the window without keeping the heading itself.
                inside_body = False
            elif inside_body:
                kept_lines.append(line)
            elif heading == 'Abstract':
                # Open the window; the heading line itself is not kept.
                inside_body = True

    return ' '.join(kept_lines).lower()

1. Scale
2. Role of transformers -> 

In [111]:
# def chunk_text(text, chunk_size=5000):  # Adjust size based on API limits
#     return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# chunks = chunk_text(pre_processed_text)
# translated_chunks = [ (await translator.translate(chunk, src=language, dest='en')).text for chunk in chunks ]

# pre_processed_text = " ".join(translated_chunks)

In [67]:
import torch
from transformers import pipeline

# Checkpoint used for the section-structuring experiment.
model_id = "meta-llama/Llama-3.2-1B"

# Loader settings are kept together so they are easy to tune.
loader_options = {
    "model": model_id,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
pipe = pipeline("text-generation", **loader_options)

# NOTE(review): `pre_processed_text` must already exist in the kernel. The only
# assignment visible in this notebook is inside commented-out code, so this
# cell fails on a fresh kernel; confirm where the variable should come from.
# Only the first 200 characters of the text are placed in the prompt.
prompt = f"""Extract and structure the following academic paper into sections. 

Format as:
=== SECTION NAME ===
Content...

Input text:
{pre_processed_text[:200]}
"""

# Sampling settings for the generation call.
generation_options = dict(
    max_new_tokens=2000,
    temperature=0.3,
    do_sample=True,
)
response = pipe(prompt, **generation_options)

# Show the text generated for the first (and only) prompt.
structured_output = response[0]['generated_text']
print(structured_output)

Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [112]:
# pos_counts = analyze_text(pre_processed_text)

In [113]:
# pos_counts

Counter({'PROPN': 233,
         'PUNCT': 151,
         'ADP': 108,
         'NOUN': 101,
         'DET': 67,
         'VERB': 54,
         'ADJ': 36,
         'PRON': 30,
         'NUM': 29,
         'ADV': 25,
         'AUX': 20,
         'CCONJ': 16,
         'PART': 15,
         'SCONJ': 13,
         'X': 3,
         'SYM': 2,
         'SPACE': 1})

In [15]:
def tokenize_text(text, language="english"):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.
        language: Language name forwarded to ``word_tokenize``.

    Returns:
        Whatever ``word_tokenize`` returns for the given text and language.
    """
    tokens = word_tokenize(text, language=language)
    return tokens
def calculate_ttr(text):
    """Compute the type-token ratio of ``text``.

    The ratio is the number of distinct tokens divided by the total number of
    tokens, as produced by ``tokenize_text``.

    Returns:
        The ratio, or ``0`` when the text yields no tokens.
    """
    tokens = tokenize_text(text)
    if not tokens:
        return 0
    distinct_count = len(set(tokens))
    return distinct_count / len(tokens)

def calculate_hapax_legomena(text):
    """Compute the share of tokens in ``text`` that are hapax legomena.

    A hapax legomenon is a token that occurs exactly once. The result is a
    ratio (the number of such tokens divided by the total token count), not
    a raw count.

    Returns:
        The ratio, or ``0`` when the text yields no tokens.
    """
    tokens = tokenize_text(text)
    if not tokens:
        return 0
    frequencies = Counter(tokens)
    singletons = [token for token, count in frequencies.items() if count == 1]
    return len(singletons) / len(tokens)

def calculate_msl(text):
    """Compute the mean sentence length of ``text`` in words.

    Uses the module-level ``nlp`` pipeline for tokenization and sentence
    splitting. Punctuation tokens are not counted as words, and neither are
    whitespace-only tokens: runs of spaces or newlines in extracted text were
    previously counted as words and inflated the average.

    Returns:
        Words per sentence, or ``0`` when no sentence is found.
    """
    doc = nlp(text)
    sentence_count = sum(1 for _ in doc.sents)
    if sentence_count == 0:
        return 0
    word_count = sum(
        1 for token in doc if not (token.is_punct or token.is_space)
    )
    return word_count / sentence_count

In [98]:
# calculate_msl(pre_processed_text)

25.91041162227603

In [18]:
def process_pdfs_in_folder(folder_path, language='en'):
    """Preprocess every PDF in ``folder_path`` and return the texts as a list.

    Files are visited in sorted name order, so the position of each document
    in the returned corpus is reproducible (``os.listdir`` alone returns an
    arbitrary order). The ``.pdf`` extension is matched case-insensitively.

    Args:
        folder_path: Directory to scan (not recursive).
        language: Currently unused and kept for backward compatibility;
            ``return_preprocessed_text_from_pdf`` reads the module-level
            ``language`` instead.

    Returns:
        One preprocessed text string per PDF file.
    """
    corpus = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        corpus.append(return_preprocessed_text_from_pdf(pdf_path))

    return corpus

In [19]:
# Build the corpus: one preprocessed (lowercased) text string per PDF in ./EN-His/.
corpus = process_pdfs_in_folder('./EN-His/')

In [25]:
corpus[0]

'height and infant mortality are both considered health indicatorsofapopulation,yettheytendtobemuchmore strongly correlated in high-income, low-mortality pop- ulations. this article shows that infant deaths are not representative of the health of survivors as it relates to height because breastfeeding practices shield them from part of the disease environment. instead, child mortality rates, especially from food and waterborne diseases, cap- turethediseaseloadthatisassociatedwithlowerheights better.theperiodofthisstudyis1850–1940,withafocuson 1875–1900,asthenetherlandsunderwentmajorhealthand wealth transitions. individual conscription heights from thehistoricalsampleofthenetherlandsaswellasmunic- ipal conscription statistics are used. the article takes a diachronic approach to examine how various health indi- cators have developed over time. the start of the upward trend in heights and the improvement of child mortal- ity rates coincided in four dutch regions, whereas infant mortality 

In [20]:
len(corpus)

25

TOPIC MODELLING

In [23]:
import nltk
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK data packages requested by this notebook:
# tokenizer models, stop-word lists and the WordNet database.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)


def preprocess_text(text):
    """Turn raw text into a list of lemmatized content tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, drop English stop words
    and punctuation tokens, then lemmatize with ``WordNetLemmatizer``.

    A token counts as punctuation when every character is in
    ``string.punctuation``. The earlier ``t not in string.punctuation`` was a
    substring test against one long string, so multi-character punctuation
    tokens such as two backticks, ``...`` or ``--`` slipped through.

    Args:
        text: The document text.

    Returns:
        The lemmatized tokens, in document order.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    lemmatized_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # Also true for an empty token, which the substring test dropped as well.
        if all(char in punctuation_chars for char in token):
            continue
        lemmatized_tokens.append(lemmatizer.lemmatize(token))
    return lemmatized_tokens

# Tokenize, filter and lemmatize every document of the text corpus.
processed_docs = [preprocess_text(doc) for doc in corpus]

# Map each distinct token to an integer id.
dictionary = corpora.Dictionary(processed_docs)

# NOTE(review): this rebinds `corpus` from a list of text strings to a list of
# bag-of-words vectors, so re-running this cell without rebuilding the text
# corpus fails inside `preprocess_text`. The name is kept in case later cells
# rely on it; consider a separate name such as `bow_corpus`.
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# LDA training is stochastic; fixing `random_state` makes the topics
# repeatable between runs (up to worker-scheduling effects).
lda_model = gensim.models.LdaMulticore(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Print the 7 highest-weighted words of each topic.
topics = lda_model.print_topics(num_words=7)
for topic in topics:
    print(topic)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject