# IMPORTS

In [30]:
# from grobid_client.grobid_client import GrobidClient
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import os
import re
import sys
import gensim
from gensim import corpora, models, similarities
import os
from stop_words import get_stop_words
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
import numpy as np
# Now let's do topic modeling using LDA
from sklearn.decomposition import LatentDirichletAllocation


# si no hace bien los imports de utilsdescomenta esta linea
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

from utils import remove_files, get_abstract, cosine

# GET ABSTRACTS

In [4]:
# Directorio donde se encuentran los archivos XML
xml_dir = os.path.join(parent_dir, "output")
# remove_files(xml_dir)

# client = GrobidClient(config_path="code_KG/config.json")
# client.process("processFulltextDocument", "./papers", output="./output/", consolidate_citations=True, tei_coordinates=True, n=20)

# Lista para almacenar los resúmenes
abstracts = []

# Procesar cada archivo XML en el directorio
for file in os.listdir(xml_dir):
    if file.endswith(".xml"):  # Verificar que el archivo sea XML
        file_path = os.path.join(xml_dir, file)
        tree = ET.parse(file_path)
        root = tree.getroot()
        abstract = get_abstract(root)
        abstracts.append(abstract)

# print(abstracts)

# SIMILARITY


## TFIDF

In [5]:
textos = [resumen.split() for resumen in abstracts]

diccionario = corpora.Dictionary(textos)

corpus = [diccionario.doc2bow(texto) for texto in textos]

tfidf = models.TfidfModel(corpus)

index = similarities.MatrixSimilarity(tfidf[corpus])

for i in range(len(textos)):
    for j in range(i + 1, len(textos)):
        vec_i = diccionario.doc2bow(textos[i])
        vec_j = diccionario.doc2bow(textos[j])
        sim_ij = index[tfidf[vec_i]][j]
        print(
            f"La similitud entre el documento {i+1} y el documento {j+1} es sim {sim_ij}"
        )

La similitud entre el documento 1 y el documento 2 es sim 0.002790855709463358
La similitud entre el documento 1 y el documento 3 es sim 0.028237465769052505
La similitud entre el documento 1 y el documento 4 es sim 0.00998421385884285
La similitud entre el documento 2 y el documento 3 es sim 0.07665908336639404
La similitud entre el documento 2 y el documento 4 es sim 0.02041800133883953
La similitud entre el documento 3 y el documento 4 es sim 0.019381240010261536


## BERT

In [6]:
# USING TRANSFORMERS

# If we want to improve the similarity and use a word embeddings approach, we may use sentence transformers. This may take a while:
sbert_model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
sentence_embeddings = sbert_model.encode(abstracts)
for i in range(len(sentence_embeddings)):
    for j in range(i + 1, len(sentence_embeddings)):
        sim = cosine(sentence_embeddings[i], sentence_embeddings[j])
        print(
            f"La similitud entre el documento {i+1} y el documento {j+1} es sim {sim}"
        )

La similitud entre el documento 1 y el documento 2 es sim 0.2755492925643921
La similitud entre el documento 1 y el documento 3 es sim 0.3994024991989136
La similitud entre el documento 1 y el documento 4 es sim 0.21654930710792542
La similitud entre el documento 2 y el documento 3 es sim 0.6693546772003174
La similitud entre el documento 2 y el documento 4 es sim 0.6281082630157471
La similitud entre el documento 3 y el documento 4 es sim 0.6317006945610046


## KNN

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(abstracts)


In [20]:
cos_sim_matrix = cosine_similarity(X)

In [22]:
clustering = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='complete')
labels = clustering.fit_predict(cos_sim_matrix)

#kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300)
#labels = kmeans.fit_predict(cos_sim_matrix)

#dbscan = DBSCAN(eps=0.1, min_samples=2, metric='precomputed')
#labels = dbscan.fit_predict(cos_sim_matrix)

# print the clusters
df = pd.DataFrame({'document': abstracts, 'cluster': labels})
print(df)

                                            document  cluster
0  Reliable uncertainty estimation for time serie...        1
1  Training modern deep learning models requires ...        0
2  Deep learning (DL) can achieve impressive resu...        0
3  Deep learning is a form of machine learning th...        0




# TOPIC MODELLING

## LDA

In [27]:
# let's do a countvectorizer now. This is different from TF-IDF
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(abstracts)
# we are only creating 2 topics
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(X)

In [28]:
feature_names = count_vectorizer.get_feature_names_out()
for topic_id, topic in enumerate(lda.components_):
    print(f"Topic {topic_id}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-6:-1]]))

Topic 0:
deep and the to of
Topic 1:
the training communication gpu to
Topic 2:
time series of to for
Topic 3:
of the to computer deep
Topic 4:
and this energy of in


In [29]:
#Now let's see the probability of one of the sentence to belong to each topic
new_doc_bow = count_vectorizer.transform(["This abstract deals with semantic web and eScience"])
#new_doc_bow = vectorizer.transform(["This other paper is represents scientific advances in fruits. Specifically tomatoes and strawberries"])

# Compute the topic distribution for the new document
topic_distribution = lda.transform(new_doc_bow)

print("Topic distribution for the new document:")
for topic_idx, topic_prob in enumerate(topic_distribution[0]):
    print(f"Topic {topic_idx}: {topic_prob:.2f}")

Topic distribution for the new document:
Topic 0: 0.05
Topic 1: 0.05
Topic 2: 0.36
Topic 3: 0.05
Topic 4: 0.49


## LDA 2

In [32]:
from gensim.models import LdaModel
from gensim.models import LdaMulticore
preprocessed_documents = []
for document in abstracts:
    tokens = vectorizer.get_feature_names_out()
    preprocessed_documents.append(tokens)

#print(tokens)

dictionary = gensim.corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

lda_model = gensim.models.LdaModel(corpus=corpus, num_topics=2, id2word=dictionary, passes=10)
coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_npmi')
coherence_score = coherence_model.get_coherence()
print(f"Coherence score: {coherence_score:.2f}")

Coherence score: -0.65


In [33]:
for topic_id, topic_words in lda_model.print_topics(num_words=10):
    print(f"Topic {topic_id}: {topic_words}")

Topic 0: 0.004*"their" + 0.004*"real" + 0.004*"overhead" + 0.004*"present" + 0.004*"library" + 0.004*"entail" + 0.004*"terms" + 0.004*"form" + 0.004*"years" + 0.004*"to"
Topic 1: 0.004*"of" + 0.004*"learning" + 0.004*"intensive" + 0.004*"probabilistic" + 0.004*"need" + 0.004*"solution" + 0.004*"millions" + 0.004*"series" + 0.004*"concepts" + 0.004*"specialized"


## LDA 3

In [None]:
# TOPIC MODELLING

stop_words = get_stop_words("english")
keywords = [
    [
        word
        for word in resumen.lower().split()
        if word.isalpha() and word not in stop_words
    ]
    for resumen in abstracts
]
dictionary = corpora.Dictionary(keywords)
doc_term_matrix = [dictionary.doc2bow(title) for title in keywords]

LDA = gensim.models.ldamodel.LdaModel

lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=7,
    random_state=100,
    chunksize=1000,
    passes=50,
)

temas = lda_model.print_topics(num_words=5)
for tema in temas:
    print(tema)

(0, '0.006*"series" + 0.006*"time" + 0.006*"uncertainty" + 0.006*"models" + 0.006*"prediction"')
(1, '0.006*"series" + 0.006*"time" + 0.006*"prediction" + 0.006*"anomaly" + 0.006*"used"')
(2, '0.006*"time" + 0.006*"training" + 0.006*"series" + 0.006*"models" + 0.006*"deep"')
(3, '0.053*"series" + 0.053*"time" + 0.032*"prediction" + 0.032*"uncertainty" + 0.022*"models"')
(4, '0.031*"training" + 0.031*"carbon" + 0.031*"energy" + 0.021*"deep" + 0.021*"may"')
(5, '0.006*"training" + 0.006*"series" + 0.006*"time" + 0.006*"models" + 0.006*"often"')
(6, '0.043*"training" + 0.026*"deep" + 0.026*"computer" + 0.026*"learning" + 0.018*"significant"')
