In [None]:
%load_ext autoreload
%autoreload 2

import os
import pickle
import re
from collections import Counter
from pprint import pprint

import gensim.summarization
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import seaborn as sns
import spacy
import spacy.lang.en
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models import CoherenceModel
from gensim.models import ldamodel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

import mair
from mair import papers_processing_utils

# File used by the caching cell below to store / reload the parsed PDFs.
PICKLED_PATH = "parserd_pdfs.pkl"
# NOTE(review): PDFS_PATH is not referenced anywhere in the visible notebook —
# presumably the directory of source PDFs; confirm it is still needed.
PDFS_PATH = "../data/FAT_20/"

# Apply seaborn's default theme to the figures drawn below.
sns.set_theme()

In [None]:
# Parse the PDFs once and cache the result on disk; later runs reuse the cache.
if os.path.exists(PICKLED_PATH):
    # NOTE: unpickling can execute arbitrary code — only load a cache file that
    # was produced by this notebook.
    with open(PICKLED_PATH, "rb") as cache_file:
        pdfs = pickle.load(cache_file)
else:
    # NOTE(review): the parser is called without arguments, so PDFS_PATH is not
    # passed here — confirm its default location is the intended one.
    pdfs = mair.pdf_parsing.parse_all_files_from_path()
    with open(PICKLED_PATH, "wb+") as cache_file:
        pickle.dump(pdfs, cache_file)

# Cleaning

In [None]:
# Build a frame from the parsed PDFs; `pages_num` is the length of each row's
# `pages` entry.
df = pd.DataFrame(pdfs)
df["pages_num"] = df["pages"].str.len()

In [None]:
# Distribution of page counts before any filtering.
df["pages_num"].hist()

### Removing papers with only one page

In [None]:
# Drop documents whose page count is exactly 1.
# `.copy()` makes the filtered result an independent frame, so adding columns
# to `df` in later cells does not trigger pandas' SettingWithCopyWarning.
df = df[df["pages_num"] != 1].copy()
df["pages_num"].hist()

### Removing references and bibliography

In [None]:
def clean_text(text):
    """Strip the bibliography, newlines and bracketed citations from a text.

    Args:
        text: Full text of a document as a single string.

    Returns:
        The text truncated at the last case-insensitive occurrence of
        "references" (left untruncated when the word does not occur), with
        every newline replaced by a space and every ``[...]`` span removed.
    """
    position = text.lower().rfind("references")
    # rfind returns -1 when there is no match; slicing with -1 would silently
    # drop the final character, so only truncate on a real hit.
    if position != -1:
        text = text[:position]  # removing bibliography
    text = text.replace("\n", " ")  # removing newlines
    # Raw string so the backslashes are regex escapes rather than (invalid)
    # Python string escapes. The class excludes "[", "^" and "]", so only
    # innermost bracket pairs without a caret are removed.
    text = re.sub(r"\[[^\[^\]]*\]", "", text)  # removing references
    return text

In [None]:
# Clean every document's full text. Note this calls the `clean_text` from
# `papers_processing_utils`, not the function defined in this notebook.
df["cleaned_text"] = df["full_text"].apply(papers_processing_utils.clean_text)

# Keywords

In [None]:
# Load the spaCy pipeline package `en_core_web_lg`.
# NOTE(review): the next cell rebinds `en` to a bare `English()` object, so when
# cells run in order this loaded pipeline is not the one used for tokenization —
# confirm which of the two is intended and remove the other.
en=spacy.load('en_core_web_lg')

In [None]:
# Rebinds `en`, replacing the pipeline loaded in the previous cell.
# NOTE(review): the lemma cell below reads `word.lemma_`; presumably a bare
# `English()` object has fewer components than `en_core_web_lg` — verify that
# lemmas are actually populated with this object.
en = spacy.lang.en.English()

In [None]:
# Run every cleaned document through the `en` pipeline.
tokenized = df["cleaned_text"].apply(lambda text: en(text))

In [None]:
def _content_lemmas(doc):
    """Return the lower-cased lemmas of the alphabetic, non-stop-word tokens in `doc`."""
    return [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and token.is_alpha
    ]


lemmas = tokenized.apply(_content_lemmas)

In [None]:
# Flatten the per-document lemma lists into one list covering the whole corpus.
all_lemas = [lemma for doc_lemmas in lemmas for lemma in doc_lemmas]

In [None]:
# Frequency distribution of every lemma across all documents.
unigrams = nltk.FreqDist(all_lemas)

In [None]:
# Open a 10x6 figure before plotting so the frequency plot is drawn at that size.
plt.figure(figsize=(10,6))

# Plot the 30 most frequent lemmas.
unigrams.plot(30)

## TF-IDF

In [None]:
# Build the gensim token <-> id mapping from the per-document lemma lists.
dictionary = corpora.Dictionary(lemmas)

In [None]:
# Convert each document's lemma list to its bag-of-words form via `dictionary`.
bow_lemmas = lemmas.apply(dictionary.doc2bow)

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus (passed as a plain list).
# NOTE: `models` here is the gensim module; the grid-search cell further down
# rebinds the name `models` to a list, so re-running this cell after that one
# fails until the import cell is re-run.
tfidf = models.TfidfModel(list(bow_lemmas))

In [None]:
# Similarity index over the TF-IDF vectors of all documents.
# `num_features` must cover every term id that can occur in the corpus, i.e. the
# size of `dictionary`; a small hard-coded value does not match the vocabulary.
# The corpus is passed as a plain list, matching how the TF-IDF model was fitted.
index = similarities.SparseMatrixSimilarity(
    tfidf[list(bow_lemmas)],
    num_features=len(dictionary),
)

#TODO

## Keyword extraction

In [None]:
# Join each document's lemmas back into one string, run gensim's keyword
# extractor on it, and split the extractor's output on newlines to get a list
# of keywords per document.
joined_lemmas = lemmas.apply(" ".join)
keyword_blobs = joined_lemmas.apply(gensim.summarization.keywords)
keywords = keyword_blobs.str.split('\n')

keywords

In [None]:
# Flatten the per-document keyword lists, then plot the 20 most frequent keywords.
all_keywords = [keyword for doc_keywords in keywords for keyword in doc_keywords]

nltk.FreqDist(all_keywords).plot(20)

## LDA

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic; a fixed `random_state` makes the topics (and the
# coherence score computed from them) reproducible across kernel restarts.
lda = ldamodel.LdaModel(
    list(bow_lemmas),
    num_topics=5,
    id2word=dictionary,
    random_state=42,
)

pprint(lda.print_topics())

In [None]:
# Score the `lda` model with the c_v coherence measure on the lemmatised texts.
coherence_model_lda = CoherenceModel(
    model=lda,
    texts=list(lemmas),
    dictionary=dictionary,
    coherence='c_v',
)

coherence_model_lda.get_coherence()

In [None]:
# Grid search over the number of topics and the `alpha` prior: fit one LDA model
# per combination and record its c_v coherence.
grid = list(ParameterGrid({
    "num_topics": [2, 3, 4, 5, 6, 7, 8, 9],
    "alpha": list(np.arange(0.01, 1, 0.3)) + ["auto", "asymmetric"],
}))

# Loop-invariant inputs: materialise them once instead of once per grid point.
grid_corpus = list(bow_lemmas)
grid_texts = list(lemmas)

# NOTE(review): this rebinds `models`, shadowing the `gensim.models` module
# imported at the top of the notebook; cells that call `models.TfidfModel` fail
# if re-run after this one. The name is kept because later cells index it —
# consider renaming both together.
models = []
scores = []

for params in tqdm(grid):
    # Same fixed seed for every grid point, so the scores differ only because of
    # the hyper-parameters and the best model is reproducible.
    lda_model = ldamodel.LdaModel(
        grid_corpus,
        id2word=dictionary,
        random_state=42,
        **params,
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=grid_texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    scores.append(coherence_model_lda.get_coherence())
    models.append(lda_model)

In [None]:
# Position of the highest coherence score; `scores`, `models` and `grid` share
# this ordering, so it also indexes the best model and its parameters.
best_id = np.argmax(scores)

In [None]:
# Best coherence score found by the grid search.
scores[best_id]

In [None]:
# The LDA model that achieved the best score (`models` is the grid-search list here).
models[best_id]

In [None]:
# Topics of the best-scoring model.
models[best_id].print_topics()