# IMPORTS

In [1]:
# from grobid_client.grobid_client import GrobidClient
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import os
import re
import sys
import gensim
from gensim import corpora, models, similarities
import os
from stop_words import get_stop_words
# from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
import numpy as np
# Now let's do topic modeling using LDA
from sklearn.decomposition import LatentDirichletAllocation


# si no hace bien los imports de utilsdescomenta esta linea
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)

from utils import remove_files, get_abstract, cosine

# GET papers

In [15]:
# Directorio donde se encuentran los archivos XML
xml_dir = os.path.join(parent_dir, "output")
# papers_dir = os.path.join(parent_dir, "papers")

# remove_files(xml_dir)

# client = GrobidClient(config_path="./config.json")
# client.process("processFulltextDocument", papers_dir, output=xml_dir, consolidate_citations=True, tei_coordinates=True, n=20)

# Lista para almacenar los resúmenes
papers = {}

# Procesar cada archivo XML en el directorio
for file in os.listdir(xml_dir):
    if file.endswith(".xml"):  # Verificar que el archivo sea XML
        file_path = os.path.join(xml_dir, file)
        file_name = os.path.basename(file_path)[:-15]
        tree = ET.parse(file_path)
        root = tree.getroot()
        #verificamos si el paper esta en nuestro diccionario y si no lo esta creamos una entrada para el
        if file_name not in papers:
            papers[file_name] = {}

        #get abstract
        abstract = get_abstract(root)
        papers[file_name]['abstract'] = abstract

        #get authors
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

        autores = root.findall('.//tei:author', ns)

        # Obtener los nombres completos de los autores
        nombres_completos = []
        for autor in autores:
            # Encontrar el elemento forename dentro de author
            forename_elem = autor.find('.//tei:forename', ns)
            # Encontrar el elemento surname dentro de author
            surname_elem = autor.find('.//tei:surname', ns)
            
            # Verificar si se encontraron los elementos forename y surname
            try:
                # Obtener el texto de los elementos forename y surname
                nombre = forename_elem.text
                apellido = surname_elem.text
                nombre_completo = f"{nombre} {apellido}"
            except:
                # Si no se encontraron los elementos, asignar NA a nombre y apellido
                # print('Missing name and/or surname. Cant get full name.')
                pass
            nombres_completos.append(nombre_completo)
        papers[file_name]['authors'] = nombres_completos


print(f'Numero de documentos: {len(papers)}')

Numero de documentos: 19


In [16]:
papers

{'11621ijccsa02': {'abstract': 'With recent advances in technology, internet has drastically changed the computing world from the concept of parallel computing to distributed computing to grid computing and now to cloud computing. The evolution of cloud computing over the past few years is potentially one of the major advances in the history of computing. Unfortunately, many banks are still hesitant to adopt cloud technology. New credit unions wanting to achieve greater business agility, cloud technology enables organizations to respond instantly to changing market conditions, leveraging data and applied analytics to achieve customer experience and operational productivity benefits. As a result, cloud computing comes in to provide a solution to such challenges making banking a reliable and trustworthy service. This paper aims at cloud computing strategy, impact in banking and financial institutions and discusses the significant reliance of cloud computing.',
  'authors': ['Prudhvi Parn

# SIMILARITY


## TFIDF

In [13]:
textos = [resumen.split() for resumen in papers.values()]

diccionario = corpora.Dictionary(textos)

corpus = [diccionario.doc2bow(texto) for texto in textos]

tfidf = models.TfidfModel(corpus)

index = similarities.MatrixSimilarity(tfidf[corpus])

for i in range(len(textos)):
    for j in range(i + 1, len(textos)):
        vec_i = diccionario.doc2bow(textos[i])
        vec_j = diccionario.doc2bow(textos[j])
        sim_ij = index[tfidf[vec_i]][j]
        print(
            f"La similitud entre el documento {list(papers.keys())[i]} y el documento {list(papers.keys())[j]} es sim {sim_ij}"
        )

La similitud entre el documento 11621ijccsa02 y el documento 1709.01907 es sim 0.022268354892730713
La similitud entre el documento 11621ijccsa02 y el documento 1802.05799 es sim 0.018618298694491386
La similitud entre el documento 11621ijccsa02 y el documento 2007.03051 es sim 0.0440535768866539
La similitud entre el documento 11621ijccsa02 y el documento 208 es sim 0.045304201543331146
La similitud entre el documento 11621ijccsa02 y el documento 269 An Insight into Cloud Computing Paradigm and Services es sim 0.21600942313671112
La similitud entre el documento 11621ijccsa02 y el documento 6114nsa03 es sim 0.25234338641166687
La similitud entre el documento 11621ijccsa02 y el documento 9-12 es sim 0.051240257918834686
La similitud entre el documento 11621ijccsa02 y el documento 907-Article Text-2692-1-10-20230720 es sim 0.01917664147913456
La similitud entre el documento 11621ijccsa02 y el documento CBIR USING FEATURES DERIVED BY DEEP LEARNING es sim 0.010931157507002354
La similitud 

## BERT

In [14]:
# USING TRANSFORMERS

# If we want to improve the similarity and use a word embeddings approach, we may use sentence transformers. This may take a while:
sbert_model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
sentence_embeddings = sbert_model.encode(list(papers.values()))
for i in range(len(sentence_embeddings)):
    for j in range(i + 1, len(sentence_embeddings)):
        sim = cosine(sentence_embeddings[i], sentence_embeddings[j])
        print(
            f"La similitud entre el documento {list(papers.keys())[i]} y el documento {list(papers.keys())[j]} es sim {sim}"
        )



La similitud entre el documento 11621ijccsa02 y el documento 1709.01907 es sim 0.18909263610839844
La similitud entre el documento 11621ijccsa02 y el documento 1802.05799 es sim 0.32246485352516174
La similitud entre el documento 11621ijccsa02 y el documento 2007.03051 es sim 0.38316184282302856
La similitud entre el documento 11621ijccsa02 y el documento 208 es sim 0.37224164605140686
La similitud entre el documento 11621ijccsa02 y el documento 269 An Insight into Cloud Computing Paradigm and Services es sim 0.6741066575050354
La similitud entre el documento 11621ijccsa02 y el documento 6114nsa03 es sim 0.7781059741973877
La similitud entre el documento 11621ijccsa02 y el documento 9-12 es sim 0.4558578133583069
La similitud entre el documento 11621ijccsa02 y el documento 907-Article Text-2692-1-10-20230720 es sim 0.33682113885879517
La similitud entre el documento 11621ijccsa02 y el documento CBIR USING FEATURES DERIVED BY DEEP LEARNING es sim 0.12774041295051575
La similitud entre e

## KNN

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(list(papers.values()))


In [16]:
cos_sim_matrix = cosine_similarity(X)

In [17]:
clustering = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='complete')
labels = clustering.fit_predict(cos_sim_matrix)

#kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300)
#labels = kmeans.fit_predict(cos_sim_matrix)

#dbscan = DBSCAN(eps=0.1, min_samples=2, metric='precomputed')
#labels = dbscan.fit_predict(cos_sim_matrix)

# print the clusters
df = pd.DataFrame({'document': list(papers.values()), 'cluster': labels})
print(df)

                                             document  cluster
0   With recent advances in technology, internet h...        0
1   Reliable uncertainty estimation for time serie...        1
2   Training modern deep learning models requires ...        0
3   Deep learning (DL) can achieve impressive resu...        0
4   Artificial Intelligence (AI), sometimes called...        0
5   Cloud computing is a computing model which pro...        0
6   Cloud computing has formed the conceptual and ...        0
7   Artificial Intelligence is making a machine be...        0
8   As artificial intelligence (AI) technology bec...        0
9   In a Content Based Image Retrieval (CBIR) Syst...        1
10  Deep learning is a form of machine learning th...        0
11  My goal in this paper is twofold: to study how...        1
12  Cloud computing has revolutionized the way bus...        0
13  In this paper, we introduce a new vision-langu...        1
14  Methods that combine local and global features...  



# TOPIC MODELLING

## LDA

In [18]:
# let's do a countvectorizer now. This is different from TF-IDF
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(list(papers.values()))
# we are only creating 2 topics
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X)

In [19]:
feature_names = count_vectorizer.get_feature_names_out()
for topic_id, topic in enumerate(lda.components_):
    print(f"Topic {topic_id}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-6:-1]]))

Topic 0:
the of and to on
Topic 1:
the and cloud computing of


In [20]:
#Now let's see the probability of one of the sentence to belong to each topic

for name, abstract in papers.items():
    new_doc_bow = count_vectorizer.transform([abstract])
# Compute the topic distribution for the new document
    topic_distribution = lda.transform(new_doc_bow)
    print(f"Topic distribution for {name}:")
    for topic_idx, topic_prob in enumerate(topic_distribution[0]):
        print(f"Topic {topic_idx}: {topic_prob:.4f}")

Topic distribution for 11621ijccsa02:
Topic 0: 0.0045
Topic 1: 0.9955
Topic distribution for 1709.01907:
Topic 0: 0.9952
Topic 1: 0.0048
Topic distribution for 1802.05799:
Topic 0: 0.9937
Topic 1: 0.0063
Topic distribution for 2007.03051:
Topic 0: 0.9608
Topic 1: 0.0392
Topic distribution for 208:
Topic 0: 0.0037
Topic 1: 0.9963
Topic distribution for 269 An Insight into Cloud Computing Paradigm and Services:
Topic 0: 0.0108
Topic 1: 0.9892
Topic distribution for 6114nsa03:
Topic 0: 0.0036
Topic 1: 0.9964
Topic distribution for 9-12:
Topic 0: 0.0041
Topic 1: 0.9959
Topic distribution for 907-Article Text-2692-1-10-20230720:
Topic 0: 0.0062
Topic 1: 0.9938
Topic distribution for CBIR USING FEATURES DERIVED BY DEEP LEARNING:
Topic 0: 0.9962
Topic 1: 0.0038
Topic distribution for hir-22-351:
Topic 0: 0.9913
Topic 1: 0.0087
Topic distribution for How good are deep models in understanding generated images:
Topic 0: 0.9964
Topic 1: 0.0036
Topic distribution for IJISRT23AUG773:
Topic 0: 0.003

In [21]:
len(feature_names)

956

In [22]:
topic_distribution[0]

array([0.99538326, 0.00461674])

## LDA 2

In [23]:
from gensim.models import LdaModel
from gensim.models import LdaMulticore
preprocessed_documents = []
for document in papers.values():
    tokens = vectorizer.get_feature_names_out()
    preprocessed_documents.append(tokens)

#print(tokens)

dictionary = gensim.corpora.Dictionary(preprocessed_documents)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

lda_model = gensim.models.LdaModel(corpus=corpus, num_topics=2, id2word=dictionary, passes=10)
coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_npmi')
coherence_score = coherence_model.get_coherence()
print(f"Coherence score: {coherence_score:.2f}")

Coherence score: -0.66


In [24]:
for topic_id, topic_words in lda_model.print_topics(num_words=10):
    print(f"Topic {topic_id}: {topic_words}")

Topic 0: 0.001*"becomes" + 0.001*"estimation" + 0.001*"recurrent" + 0.001*"fully" + 0.001*"background" + 0.001*"computations" + 0.001*"much" + 0.001*"instantly" + 0.001*"without" + 0.001*"conduct"
Topic 1: 0.002*"outperform" + 0.001*"resulted" + 0.001*"collected" + 0.001*"dependencies" + 0.001*"adoption" + 0.001*"taken" + 0.001*"digitalized" + 0.001*"similarity" + 0.001*"tasks" + 0.001*"testing"


## LDA 3

In [25]:
# TOPIC MODELLING

stop_words = get_stop_words("english")
keywords = [
    [
        word
        for word in resumen.lower().split()
        if word.isalpha() and word not in stop_words
    ]
    for resumen in papers.values()
]
dictionary = corpora.Dictionary(keywords)
doc_term_matrix = [dictionary.doc2bow(title) for title in keywords]

LDA = gensim.models.ldamodel.LdaModel

lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=7,
    random_state=100,
    chunksize=1000,
    passes=50,
)

temas = lda_model.print_topics(num_words=5)
for tema in temas:
    print(tema)

(0, '0.054*"cloud" + 0.048*"computing" + 0.020*"big" + 0.016*"based" + 0.016*"will"')
(1, '0.031*"computing" + 0.031*"cloud" + 0.022*"training" + 0.014*"significant" + 0.009*"may"')
(2, '0.040*"artificial" + 0.040*"intelligence" + 0.025*"cloud" + 0.020*"human" + 0.020*"computing"')
(3, '0.024*"series" + 0.020*"time" + 0.014*"deep" + 0.014*"layers" + 0.010*"models"')
(4, '0.025*"visual" + 0.019*"maps" + 0.013*"solution" + 0.013*"tokens" + 0.013*"supervised"')
(5, '0.029*"features" + 0.017*"images" + 0.012*"large" + 0.012*"propose" + 0.012*"results"')
(6, '0.028*"model" + 0.021*"generated" + 0.018*"image" + 0.018*"images" + 0.018*"object"')
