In [None]:
import pandas as pd
import plotly.express as px

from tqdm import tqdm
import spacy
import joblib
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis.gensim_models
# Notebook-wide setup: register tqdm's pandas integration (used later through
# `progress_apply`), switch pyLDAvis to notebook rendering, and load the small
# English spaCy pipeline that annotates the abstracts.
tqdm.pandas()
pyLDAvis.enable_notebook()

en = spacy.load("en_core_web_sm")

In [None]:
# Load the spaCy-annotated frame from the joblib cache when it can be read;
# otherwise rebuild it from the CSV export (slow: runs spaCy on every abstract).
# NOTE(review): joblib.load unpickles the file - only load caches you created yourself.
try:
    df = joblib.load("../../data/s2orc/s2orc_citations_filtered_with_mag_id_with_spacy.pkl")
except Exception as exc:
    # Keep the best-effort fallback, but report why the cache was not used
    # instead of silently swallowing the error.
    print(f"Could not load cached frame ({exc!r}).")
    print("Processing...")
    df = pd.read_csv("../../data/s2orc/s2orc_citations_filtered_with_mag_id.csv", index_col=0)
    # Keep only rows that have an abstract. The explicit .copy() makes the
    # result an independent frame, so the column assignments below do not
    # trigger pandas' SettingWithCopyWarning.
    df = df[df.abstract.notna()].copy()
    df['doc'] = df.abstract.progress_apply(en)
    df['lemmas'] = df['doc'].apply(
        lambda doc: [t.lemma_ for t in doc if t.is_alpha and not t.is_stop and not t.is_punct]
    )
    # Writing the cache is currently disabled; uncomment to persist the annotated frame.
    # joblib.dump(df, "../../data/s2orc/s2orc_citations_filtered_with_mag_id_with_spacy.pkl")

In [None]:
# Rebuild the lemma lists: keep alphabetic, non-stop-word tokens that are not
# part of a named entity. spaCy tokens expose entity membership through
# `ent_type_` (an empty string when the token is outside any entity); there is
# no `Token.is_entity` attribute, so the previous filter raised AttributeError.
df['lemmas'] = df['doc'].apply(
    lambda d: [t.lemma_ for t in d if not t.is_stop and t.is_alpha and not t.ent_type_]
)

# Number of retained lemmas per abstract.
df['tokens_len'] = df['lemmas'].str.len()

In [None]:
# Histogram of the per-abstract lemma counts (`tokens_len`), drawn with a
# logarithmic y axis.
px.histogram(df['tokens_len'], log_y=True)

In [None]:
# Build the gensim vocabulary from the lemma lists, then encode every
# abstract as a bag-of-words vector using that vocabulary.
dictionary = Dictionary(documents=df["lemmas"])
encoded_docs = df["lemmas"].apply(lambda lemmas: dictionary.doc2bow(lemmas))

In [None]:
# Spot-check: show the lemma list stored at position 2 of the frame.
df['lemmas'].iloc[2]

In [None]:
# Baseline LDA model with 10 topics.
# `id2word` attaches the vocabulary so topics are reported as words rather
# than integer ids (without it gensim falls back to an identity mapping
# inferred from the corpus). `random_state` seeds the model's random
# initialisation so re-runs start from the same state.
lda = LdaMulticore(encoded_docs, num_topics=10, id2word=dictionary, random_state=42)

In [None]:
# Prepare the interactive pyLDAvis view of the baseline model.
# Fixed: the corpus argument was misspelled (`encldaoded_docs`), which raised
# NameError; it must be the bag-of-words corpus the model was trained on.
vis = pyLDAvis.gensim_models.prepare(lda, encoded_docs, dictionary=dictionary)
vis

In [None]:
# Render the prepared visualisation to a standalone HTML string.
# The prepared-data object has no `to_html` attribute (the bare `vis.to_html`
# raised AttributeError); pyLDAvis exposes the conversion as a module-level
# function. Only a short preview is displayed because the full document is large.
vis_html = pyLDAvis.prepared_data_to_html(vis)
vis_html[:500]

In [None]:
from gensim.models import CoherenceModel

In [None]:
# Coarse sweep: train one LDA model per topic count (5, 10, 15, 20).
# Every model gets the vocabulary (`id2word`) and the same seed, so
# differences between runs come from the topic count rather than from
# differing random initialisations.
models = []
for topics_number in range(5, 21, 5):
    lda = LdaMulticore(
        encoded_docs,
        num_topics=topics_number,
        id2word=dictionary,
        random_state=42,
    )
    models.append(lda)

In [None]:
# Score every model in `models` with gensim's CoherenceModel (its default
# coherence measure) and collect the scores in the same order as the models.
cvs = []
for model in models:
    cm = CoherenceModel(
        model=model,
        texts=df['lemmas'],
        dictionary=dictionary,
    )
    c_v = cm.get_coherence()
    cvs += [c_v]

In [None]:
import plotly.express as px


# Coherence score against the number of topics for the coarse sweep.
# The x values must match the topic counts used when the models were trained.
# Axis labels and a title are set explicitly; unnamed array inputs would
# otherwise be labelled just "x" and "y".
px.line(
    x=list(range(5, 21, 5)),
    y=cvs,
    labels={"x": "number of topics", "y": "coherence"},
    title="LDA coherence by number of topics (5-20, step 5)",
)

In [None]:
# Finer sweep over 5..10 topics: train, score, and plot in one pass.
# The topic counts are defined once so the training loop and the plot's
# x axis cannot drift apart.
topic_counts = range(5, 11, 1)

# Train one model per topic count, with the vocabulary attached and a fixed
# seed so runs differ by topic count rather than by random initialisation.
models = []
for topics_number in topic_counts:
    lda = LdaMulticore(
        encoded_docs,
        num_topics=topics_number,
        id2word=dictionary,
        random_state=42,
    )
    models.append(lda)

# Coherence score for each trained model, in training order.
cvs = []
for model in models:
    cm = CoherenceModel(model, texts=df['lemmas'], dictionary=dictionary)
    c_v = cm.get_coherence()
    cvs.append(c_v)

# Labelled line plot of coherence against the number of topics.
px.line(
    x=list(topic_counts),
    y=cvs,
    labels={"x": "number of topics", "y": "coherence"},
    title="LDA coherence by number of topics (5-10)",
)

In [None]:
# Exercise: write a grid search for the LDA model (searching over the parameters alpha, beta, offset, and topics_n) on the data used in the previous exercise.

In [None]:
# Final LDA model: 8 topics, 10 passes over the corpus.
# `id2word` attaches the vocabulary so topics are reported as words, and
# `random_state` seeds the random initialisation so re-runs start from the
# same state.
lda = LdaMulticore(
    encoded_docs,
    num_topics=8,
    passes=10,
    id2word=dictionary,
    random_state=42,
)

In [None]:
# Prepare the pyLDAvis view for the current `lda` model and display it as
# the cell output.
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda,
    corpus=encoded_docs,
    dictionary=dictionary,
)
vis

In [None]:
# Write the prepared visualisation to an HTML file next to the data.
with open('../../data/s2orc/abstract_topics.html', mode='w') as html_file:
    pyLDAvis.save_html(vis, html_file)

In [None]:
from bertopic import BERTopic



# Create the BERTopic model; it is fitted on the abstracts in a later cell.
topic_model = BERTopic(language='en')
# Alternative that fits and also returns per-document results (left disabled):
# topics, probs = topic_model.fit_transform(docs)


In [None]:
# Fit BERTopic on the raw abstract strings, passed as a plain Python list.
topic_model.fit(df['abstract'].tolist())

In [None]:
# Bar-chart view of the fitted BERTopic model, limited to 10 topics via `top_n_topics`.
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
# Hierarchy view of the fitted topics, drawn on a large canvas (2000 x 1400).
topic_model.visualize_hierarchy(width=2000, height=1400)

In [None]:
# Heatmap view of the fitted topics, using BERTopic's default settings.
topic_model.visualize_heatmap()

In [None]:
# Overview plot of the fitted topics, using BERTopic's default settings.
topic_model.visualize_topics()