# IMPORTS

In [1]:
from grobid_client.grobid_client import GrobidClient
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import os
import re
import sys
import gensim
from gensim import corpora, models, similarities
import os
from stop_words import get_stop_words
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
import numpy as np
# Now let's do topic modeling using LDA
from sklearn.decomposition import LatentDirichletAllocation


# si no hace bien los imports de utilsdescomenta esta linea
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

from utils import remove_files, get_abstract, cosine

# GET ABSTRACTS

In [2]:
# Directorio donde se encuentran los archivos XML
xml_dir = os.path.join(parent_dir, "output")
papers_dir = os.path.join(parent_dir, "papers")

remove_files(xml_dir)

client = GrobidClient(config_path="./config.json")
client.process("processFulltextDocument", papers_dir, output=xml_dir, consolidate_citations=True, tei_coordinates=True, n=20)

# Lista para almacenar los resúmenes
abstracts = {}

# Procesar cada archivo XML en el directorio
for file in os.listdir(xml_dir):
    if file.endswith(".xml"):  # Verificar que el archivo sea XML
        file_path = os.path.join(xml_dir, file)
        tree = ET.parse(file_path)
        root = tree.getroot()
        abstract = get_abstract(root)
        file_name = os.path.basename(file_path)[:-15]
        abstracts[file_name] = abstract

print(len(abstracts))

In [None]:
xml_dir = os.path.join(parent_dir, "output")
papers_dir = os.path.join(parent_dir, "papers")


In [None]:
papers_dir

'c:\\Users\\nicov\\Documents\\Github\\OpenScience\\papers'

In [None]:
xml_dir

'c:\\Users\\nicov\\Documents\\Github\\OpenScience\\output'

# SIMILARITY


## TFIDF

In [None]:
textos = [resumen.split() for resumen in abstracts.values()]

diccionario = corpora.Dictionary(textos)

corpus = [diccionario.doc2bow(texto) for texto in textos]

tfidf = models.TfidfModel(corpus)

index = similarities.MatrixSimilarity(tfidf[corpus])

for i in range(len(textos)):
    for j in range(i + 1, len(textos)):
        vec_i = diccionario.doc2bow(textos[i])
        vec_j = diccionario.doc2bow(textos[j])
        sim_ij = index[tfidf[vec_i]][j]
        print(
            f"La similitud entre el documento {list(abstracts.keys())[i]} y el documento {list(abstracts.keys())[j]} es sim {sim_ij}"
        )

La similitud entre el documento 11621ijccsa02 y el documento 1802.05799 es sim 0.015940522775053978
La similitud entre el documento 11621ijccsa02 y el documento 269 An Insight into Cloud Computing Paradigm and Services es sim 0.1687125712633133
La similitud entre el documento 11621ijccsa02 y el documento 9-12 es sim 0.0468757227063179
La similitud entre el documento 11621ijccsa02 y el documento hir-22-351 es sim 0.030683305114507675
La similitud entre el documento 11621ijccsa02 y el documento IJISRT23AUG773 es sim 0.15756084024906158
La similitud entre el documento 11621ijccsa02 y el documento Paper11879 es sim 0.1512293517589569
La similitud entre el documento 11621ijccsa02 y el documento Sketch Based Image Retrieval for Architecture es sim 0.015656426548957825
La similitud entre el documento 11621ijccsa02 y el documento VISION TRANSFORMERS NEED REGISTERS es sim 0.016979750245809555
La similitud entre el documento 1802.05799 y el documento 269 An Insight into Cloud Computing Paradigm 

## BERT

In [None]:
# USING TRANSFORMERS

# If we want to improve the similarity and use a word embeddings approach, we may use sentence transformers. This may take a while:
sbert_model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
sentence_embeddings = sbert_model.encode(list(abstracts.values()))
for i in range(len(sentence_embeddings)):
    for j in range(i + 1, len(sentence_embeddings)):
        sim = cosine(sentence_embeddings[i], sentence_embeddings[j])
        print(
            f"La similitud entre el documento {list(abstracts.keys())[i]} y el documento {list(abstracts.keys())[j]} es sim {sim}"
        )



model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

La similitud entre el documento 11621ijccsa02 y el documento 1802.05799 es sim 0.32246485352516174
La similitud entre el documento 11621ijccsa02 y el documento 269 An Insight into Cloud Computing Paradigm and Services es sim 0.6741066575050354
La similitud entre el documento 11621ijccsa02 y el documento 9-12 es sim 0.4558578133583069
La similitud entre el documento 11621ijccsa02 y el documento hir-22-351 es sim 0.3331562876701355
La similitud entre el documento 11621ijccsa02 y el documento IJISRT23AUG773 es sim 0.7527927160263062
La similitud entre el documento 11621ijccsa02 y el documento Paper11879 es sim 0.6483190059661865
La similitud entre el documento 11621ijccsa02 y el documento Sketch Based Image Retrieval for Architecture es sim 0.10967686772346497
La similitud entre el documento 11621ijccsa02 y el documento VISION TRANSFORMERS NEED REGISTERS es sim 0.06570909917354584
La similitud entre el documento 1802.05799 y el documento 269 An Insight into Cloud Computing Paradigm and Se

## KNN

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(list(abstracts.values()))


In [None]:
cos_sim_matrix = cosine_similarity(X)

In [None]:
clustering = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='complete')
labels = clustering.fit_predict(cos_sim_matrix)

#kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300)
#labels = kmeans.fit_predict(cos_sim_matrix)

#dbscan = DBSCAN(eps=0.1, min_samples=2, metric='precomputed')
#labels = dbscan.fit_predict(cos_sim_matrix)

# print the clusters
df = pd.DataFrame({'document': list(abstracts.values()), 'cluster': labels})
print(df)

                                            document  cluster
0  With recent advances in technology, internet h...        0
1  Training modern deep learning models requires ...        1
2  Cloud computing is a computing model which pro...        0
3  Artificial Intelligence is making a machine be...        0
4  Deep learning is a form of machine learning th...        0
5  Cloud computing has revolutionized the way bus...        0
6  Cloud computing has had a significant impact o...        0
7  Sketch-based image retrieval (SBIR) is an imag...        0
8  Transformers have recently emerged as a powerf...        0




# TOPIC MODELLING

## LDA

In [None]:
# let's do a countvectorizer now. This is different from TF-IDF
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(list(abstracts.values()))
# we are only creating 2 topics
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X)

In [None]:
feature_names = count_vectorizer.get_feature_names_out()
for topic_id, topic in enumerate(lda.components_):
    print(f"Topic {topic_id}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-6:-1]]))

Topic 0:
the and data of for
Topic 1:
the and to computing of


In [None]:
#Now let's see the probability of one of the sentence to belong to each topic

for name, abstract in abstracts.items():
    new_doc_bow = count_vectorizer.transform([abstract])
# Compute the topic distribution for the new document
    topic_distribution = lda.transform(new_doc_bow)
    print(f"Topic distribution for {name}:")
    for topic_idx, topic_prob in enumerate(topic_distribution[0]):
        print(f"Topic {topic_idx}: {topic_prob:.4f}")

Topic distribution for 11621ijccsa02:
Topic 0: 0.0050
Topic 1: 0.9950
Topic distribution for 1802.05799:
Topic 0: 0.9933
Topic 1: 0.0067
Topic distribution for 269 An Insight into Cloud Computing Paradigm and Services:
Topic 0: 0.0097
Topic 1: 0.9903
Topic distribution for 9-12:
Topic 0: 0.9967
Topic 1: 0.0033
Topic distribution for hir-22-351:
Topic 0: 0.0072
Topic 1: 0.9928
Topic distribution for IJISRT23AUG773:
Topic 0: 0.9959
Topic 1: 0.0041
Topic distribution for Paper11879:
Topic 0: 0.0027
Topic 1: 0.9973
Topic distribution for Sketch Based Image Retrieval for Architecture:
Topic 0: 0.0044
Topic 1: 0.9956
Topic distribution for VISION TRANSFORMERS NEED REGISTERS:
Topic 0: 0.0046
Topic 1: 0.9954


In [None]:
len(feature_names)

506

In [None]:
topic_distribution[0]

array([0.004632, 0.995368])

## LDA 2

In [None]:
from gensim.models import LdaModel
from gensim.models import LdaMulticore
preprocessed_documents = []
for document in abstracts.values():
    tokens = vectorizer.get_feature_names_out()
    preprocessed_documents.append(tokens)

#print(tokens)

dictionary = gensim.corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

lda_model = gensim.models.LdaModel(corpus=corpus, num_topics=2, id2word=dictionary, passes=10)
coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_npmi')
coherence_score = coherence_model.get_coherence()
print(f"Coherence score: {coherence_score:.2f}")

Coherence score: -0.66


In [None]:
for topic_id, topic_words in lda_model.print_topics(num_words=10):
    print(f"Topic {topic_id}: {topic_words}")

Topic 0: 0.002*"developments" + 0.002*"provided" + 0.002*"adoption" + 0.002*"implementation" + 0.002*"this" + 0.002*"capabilities" + 0.002*"also" + 0.002*"business" + 0.002*"time" + 0.002*"customer"
Topic 1: 0.002*"some" + 0.002*"matching" + 0.002*"terms" + 0.002*"significant" + 0.002*"api" + 0.002*"while" + 0.002*"changing" + 0.002*"despite" + 0.002*"lot" + 0.002*"vision"


## LDA 3

In [None]:
# TOPIC MODELLING

stop_words = get_stop_words("english")
keywords = [
    [
        word
        for word in resumen.lower().split()
        if word.isalpha() and word not in stop_words
    ]
    for resumen in abstracts.values()
]
dictionary = corpora.Dictionary(keywords)
doc_term_matrix = [dictionary.doc2bow(title) for title in keywords]

LDA = gensim.models.ldamodel.LdaModel

lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=7,
    random_state=100,
    chunksize=1000,
    passes=50,
)

temas = lda_model.print_topics(num_words=5)
for tema in temas:
    print(tema)

(0, '0.035*"cloud" + 0.029*"computing" + 0.026*"smbs" + 0.023*"intelligence" + 0.023*"artificial"')
(1, '0.021*"image" + 0.021*"visual" + 0.016*"methods" + 0.016*"network" + 0.016*"retrieval"')
(2, '0.003*"artificial" + 0.003*"intelligence" + 0.003*"computing" + 0.003*"cloud" + 0.003*"may"')
(3, '0.003*"visual" + 0.003*"image" + 0.003*"architecture" + 0.003*"network" + 0.003*"methods"')
(4, '0.003*"computing" + 0.003*"artificial" + 0.003*"cloud" + 0.003*"intelligence" + 0.003*"will"')
(5, '0.053*"cloud" + 0.053*"computing" + 0.024*"training" + 0.015*"significant" + 0.010*"can"')
(6, '0.080*"computing" + 0.073*"cloud" + 0.067*"big" + 0.040*"will" + 0.034*"data"')
