In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:

import numpy as np
import pandas as pd

In [5]:
data_path = os.path.join(module_path, 'Data', 'artigos.csv')
df = pd.read_csv(data_path)

df = pd.read_csv(data_path, delimiter=',')
df.columns = [col.strip().replace('"', '') for col in df.columns]

df.head()


Unnamed: 0,ID,Title,Abstract,Keywords,Authors,Year,ISD,DOI,Links,Publication Type,Search Engine,VALIDAÇÃO,PDF
0,1535,Reconstructing Missing EHRs Using Time-Aware W...,Real-world Electronic Health Records (EHRs) ar...,"Electronic Health Records(EHRs), EHRs Imputati...",G. Gao; F. Khoshnevisan; M. Chi,2022,2575-2634,10.1109/ICHI54592.2022.00034,https://ieeexplore.ieee.org/stamp/stamp.jsp?ar...,IEEE Conferences,IEEE,AC,2203.08245v2.pdf
1,16,3D-MICE: integration of cross-sectional and lo...,A key challenge in clinical data mining is tha...,"machine learning, imputation, missing data, el...",Baron JM,2018,1527-974X,10.1093/jamia/ocx133,https://pubmed.ncbi.nlm.nih.gov/29202205/,"Journal Article Research Support, Non-U.S. Gov't",PubMed,AC,3D-MICE.pdf
2,1385,Performance evaluation of a recurrent deep neu...,Atmospheric pollution refers to the presence o...,,Pedraza-Ortega JC,2022,2162-2906,10.1080/10962247.2022.2095057,https://pubmed.ncbi.nlm.nih.gov/35816429/,Journal Article,PubMed,AC,Performance evaluation of a recurrent deep neu...
3,1433,Predicting progression of Alzheimer's disease ...,"If left untreated, Alzheimer's disease (AD) is...","Alzheimer’s progression, MRI biomarker forecas...",Pant S,2022,1879-2782,S0893-6080(22)00094-6,https://pubmed.ncbi.nlm.nih.gov/35364417/,Journal Article,PubMed,AC,1-s2.0-S0893608022000946-main.pdf
4,369,Causes and Consequences of Missing Health-Rela...,Missing health-related quality of life (HRQOL)...,"longitudinal studies , quality of life , regis...",Spertus JA,2017,1941-7705,10.1161/CIRCOUTCOMES.116.003268,https://pubmed.ncbi.nlm.nih.gov/29246883/,Comparative Study Journal Article Multicenter ...,PubMed,AC,grady2017.pdf


In [6]:
abstracts = df[['ID', 'Abstract']].values


In [7]:
abstracts[0]

array([1535,
       'Real-world Electronic Health Records (EHRs) are often plagued by a high rate of missing data. In our EHRs, for example, the missing rates can be as high as 90% for some features, with an average missing rate of around 70% across all features. We propose a Time-Aware Dual-Cross-Visit missing value imputation method, named TA-DualCV, which spontaneously leverages multivariate dependencies across features and longitudinal dependencies both within- and cross-visit to maximize the information extracted from limited observable records in EHRs. Specifically, TA-DualCV captures the latent structure of missing patterns across measurements of different features and it also considers the time continuity and capture the latent temporal missing patterns based on both time-steps and irregular time-intervals. TA-DualCV is evaluated using three large real-world EHRs on two types of tasks: an unsupervised imputation task by varying mask rates up to 90% and a supervised 24-hour earl

In [8]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import string

# Baixar os stopwords em português
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('portuguese')

def preprocess_text(text):
    # Remover pontuação
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Converter para minúsculas
    text = text.lower()
    # Remover stop words
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Pré-processar todos os abstracts
df['cleaned_abstract'] = df['Abstract'].apply(preprocess_text)

# Ver os abstracts limpos
print(df['cleaned_abstract'].head())

0    realworld electronic health records ehrs are o...
1    key challenge in clinical data mining is that ...
2    atmospheric pollution refers to the presence o...
3    if left untreated alzheimers disease ad is lea...
4    missing healthrelated quality of life hrqol da...
Name: cleaned_abstract, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\igorc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:

vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(df['cleaned_abstract'])  # Document-Term Matrix


print(vectorizer.get_feature_names_out())

['0007' '001' '00130015' ... 'york' 'zealand' 'zone']


In [10]:
from sklearn.decomposition import LatentDirichletAllocation


K = 5 


lda_model = LatentDirichletAllocation(n_components=K, random_state=42)
lda_model.fit(dtm)


for index, topic in enumerate(lda_model.components_):
    print(f"Tópico {index + 1}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

Tópico 1:
['we', 'missing', 'imputation', 'with', 'in', 'data', 'of', 'to', 'and', 'the']
Tópico 2:
['an', 'results', 'over', 'are', 'analyses', 'family', 'only', 'area', 'value', 'outperforms']
Tópico 3:
['mnar', 'factors', 'were', 'at', 'with', 'of', 'scores', 'missing', 'hrqol', 'and']
Tópico 4:
['model', 'with', 'values', 'imputation', 'in', 'to', 'missing', 'of', 'and', 'the']
Tópico 5:
['with', 'gender', 'we', 'nursing', 'data', 'in', 'to', 'of', 'and', 'the']


In [19]:
# Distribuição de tópicos por documento
doc_topic_dist = lda_model.transform(dtm)


topic_df = pd.DataFrame(doc_topic_dist, columns=[f"Tópico {i+1}" for i in range(K)])
topic_df['Documento'] = df['ID']  # Adicionar identificação do documento






topic_df


Unnamed: 0,Tópico 1,Tópico 2,Tópico 3,Tópico 4,Tópico 5,Documento
0,0.001194,0.00117,0.001175,0.995276,0.001186,1535
1,0.996461,0.000878,0.000885,0.000889,0.000888,16
2,0.997831,0.000538,0.00054,0.000545,0.000545,1385
3,0.000823,0.000803,0.000809,0.000819,0.996745,1433
4,0.086448,0.000837,0.911007,0.000853,0.000856,369
5,0.995325,0.001157,0.001165,0.001181,0.001172,375
6,0.000833,0.000817,0.00082,0.996702,0.000828,447
7,0.995388,0.001144,0.00115,0.001161,0.001158,1656
8,0.996654,0.00083,0.000834,0.000841,0.000841,718
9,0.001603,0.001575,0.00158,0.993646,0.001596,1282


In [None]:
topic_df.to_csv('topicos.csv', index=False)

In [14]:
import pyLDAvis
import pyLDAvis.lda_model


# Visualizar os tópicos
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.lda_model.prepare(lda_model, dtm, vectorizer)
pyLDAvis.display(lda_vis)

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


In [15]:
pyLDAvis.save_html(lda_vis, 'lda_visualization.html')