### Imports

In [3]:
import nltk
from nltk.corpus import reuters
import pandas as pd
import re

from nltk.stem.porter import PorterStemmer

from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
from gensim.models.phrases import Phrases, Phraser

from nltk.corpus import stopwords
#import nltk
#nltk.download('stopwords')
import spacy

In [1]:
nltk.download('reuters')
files = reuters.fileids()
dados = []

for file_id in files:
    categorias = reuters.categories(file_id)
    texto = reuters.raw(file_id)
    dados.append({
        "file_id": file_id,
        "categorias": ", ".join(categorias),
        "texto": texto
    })

df = pd.DataFrame(dados)
df.to_csv("reuters.csv", index=False, encoding="utf-8")

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\decas\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [2]:
df_reuters = pd.read_csv('reuters.csv')
df_random_samples = pd.read_csv('random_reuters_samples.csv')

print(f"Number of rows in reuters.csv: {len(df_reuters)}")
print(f"Number of rows in random_reuters_samples.csv: {len(df_random_samples)}")

Number of rows in reuters.csv: 10788
Number of rows in random_reuters_samples.csv: 10


### Análise Exploratória

In [7]:
reuters = pd.read_csv('reuters.csv')

In [8]:
reuters.head()

Unnamed: 0,file_id,categorias,texto
0,test/14826,trade,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,grain,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"crude, nat-gas",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"corn, grain, rice, rubber, sugar, tin, trade",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"palm-oil, veg-oil",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...


In [9]:
reuters.describe()

Unnamed: 0,file_id,categorias,texto
count,10788,10788,10788
unique,10788,468,10657
top,test/14826,earn,26-FEB-1987\n 26-FEB-1987\n\n
freq,1,3923,7


In [10]:
categories = set()

for item in reuters["categorias"]:
    
    parts = item.split(",")
    
    for p in parts:
        p_clean = p.strip()
        if p_clean != "":
            categories.add(p_clean)
print(len(categories))

90


In [11]:
categories

{'acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee',
 'copper',
 'copra-cake',
 'corn',
 'cotton',
 'cotton-oil',
 'cpi',
 'cpu',
 'crude',
 'dfl',
 'dlr',
 'dmk',
 'earn',
 'fuel',
 'gas',
 'gnp',
 'gold',
 'grain',
 'groundnut',
 'groundnut-oil',
 'heat',
 'hog',
 'housing',
 'income',
 'instal-debt',
 'interest',
 'ipi',
 'iron-steel',
 'jet',
 'jobs',
 'l-cattle',
 'lead',
 'lei',
 'lin-oil',
 'livestock',
 'lumber',
 'meal-feed',
 'money-fx',
 'money-supply',
 'naphtha',
 'nat-gas',
 'nickel',
 'nkr',
 'nzdlr',
 'oat',
 'oilseed',
 'orange',
 'palladium',
 'palm-oil',
 'palmkernel',
 'pet-chem',
 'platinum',
 'potato',
 'propane',
 'rand',
 'rape-oil',
 'rapeseed',
 'reserves',
 'retail',
 'rice',
 'rubber',
 'rye',
 'ship',
 'silver',
 'sorghum',
 'soy-meal',
 'soy-oil',
 'soybean',
 'strategic-metal',
 'sugar',
 'sun-meal',
 'sun-oil',
 'sunseed',
 'tea',
 'tin',
 'trade',
 'veg-oil',
 'wheat',
 'wpi',
 'yen',
 'zinc'}

Pré-Processamento Simples

In [12]:
def preprocess_dataframe(df, text_column="texto"):
    df = df.copy()

    df[text_column] = (
        df[text_column]
        .astype(str)
        .str.lower()
        .apply(lambda x: re.sub(r"[^a-z\s]", " ", x))
        .apply(lambda x: re.sub(r"\s+", " ", x).strip())
    )

    return df

In [13]:
df_simple = preprocess_dataframe(reuters, text_column="texto")
df_simple.head()

Unnamed: 0,file_id,categorias,texto
0,test/14826,trade,asian exporters fear damage from u s japan rif...
1,test/14828,grain,china daily says vermin eat pct grain stocks a...
2,test/14829,"crude, nat-gas",japan to revise long term energy demand downwa...
3,test/14832,"corn, grain, rice, rubber, sugar, tin, trade",thai trade deficit widens in first quarter tha...
4,test/14833,"palm-oil, veg-oil",indonesia sees cpo price rising sharply indone...


Separação dos 10 Documentos

In [14]:
df_simple10 = df_simple.sample(10, random_state=42)
df_simple = df_simple.drop(df_simple10.index).reset_index(drop=True)

print("Linhas no df_simple (original):", len(df_simple))
print("Linhas no df_reserved (10 docs):", len(df_simple10))
df_simple10.head()

Linhas no df_simple (original): 10778
Linhas no df_reserved (10 docs): 10


Unnamed: 0,file_id,categorias,texto
4593,training/12421,earn,bayer world group pre tax profit billion marks...
8353,training/6220,earn,marcor lt maar expects fiscal year profit marc...
3614,training/10921,earn,computer microfilm corp lt comi year net shr c...
10382,training/9348,ship,iran says has better weapons than silkworm ira...
8048,training/5707,earn,transamerica income lt tai monthly dividend sh...


### String and tokenize

In [15]:
df_simple['texto'] = df_simple['texto'].astype(str)
documents = df_simple['texto'].apply(str.split).tolist()

### Dictionay and Bag-of-Words

In [16]:
dictionary = Dictionary(documents)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents]

### Defs para Modelos LSA

In [18]:
def LSA_model(doc_term_matrix, num_topics = 90, dictionary = dictionary):
    lsi_model = LsiModel(doc_term_matrix, num_topics, id2word=dictionary)
    return lsi_model

### Model 1

#### LSA

In [20]:
lsi_model_1 = LSA_model(doc_term_matrix)

In [21]:
for i, topic in lsi_model_1.print_topics(num_topics=10):
    print(f"Tópico {i}: {topic}")

Tópico 0: 0.658*"the" + 0.326*"to" + 0.304*"of" + 0.251*"in" + 0.218*"a" + 0.217*"and" + 0.211*"said" + 0.146*"s" + 0.107*"for" + 0.081*"that"
Tópico 1: 0.581*"mln" + 0.512*"vs" + 0.280*"dlrs" + 0.221*"net" + 0.219*"cts" + -0.201*"the" + 0.173*"loss" + 0.137*"shr" + 0.102*"lt" + 0.100*"profit"
Tópico 2: 0.583*"in" + 0.387*"pct" + -0.309*"vs" + -0.301*"the" + 0.178*"billion" + 0.155*"from" + -0.150*"cts" + 0.131*"year" + 0.122*"a" + 0.118*"january"
Tópico 3: 0.459*"the" + -0.264*"to" + -0.260*"a" + -0.259*"said" + 0.240*"in" + -0.226*"it" + -0.218*"of" + -0.190*"dlrs" + 0.189*"vs" + -0.163*"its"
Tópico 4: -0.848*"nil" + -0.170*"o" + -0.162*"prev" + 0.162*"of" + -0.158*"wk" + -0.138*"e" + -0.127*"a" + 0.099*"dlrs" + -0.090*"s" + -0.083*"vs"
Tópico 5: 0.545*"to" + -0.401*"of" + 0.349*"s" + -0.267*"the" + 0.237*"u" + -0.215*"nil" + 0.167*"trade" + 0.114*"in" + -0.097*"a" + 0.096*"japan"
Tópico 6: 0.624*"mln" + -0.287*"vs" + -0.283*"cts" + -0.241*"pct" + -0.232*"a" + -0.194*"loss" + 0.161*"

In [22]:
coherence_model_lsa = CoherenceModel(model=lsi_model_1, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

Coherence Score:  0.2400080221907281


#### LDA

### Modelo 1.1 com TF-IDF

In [23]:
tfidf = TfidfModel(doc_term_matrix)
doc_term_tfidf = tfidf[doc_term_matrix]

#### LSA

In [26]:
lsi_model_1_1 = LSA_model(doc_term_tfidf)

In [27]:
for i, topic in lsi_model_1_1.print_topics(num_topics=10):
    print(f"Tópico {i}: {topic}")

Tópico 0: 0.609*"vs" + 0.308*"loss" + 0.304*"cts" + 0.281*"net" + 0.248*"mln" + 0.221*"shr" + 0.178*"profit" + 0.177*"revs" + 0.141*"qtr" + 0.121*"dlrs"
Tópico 1: 0.335*"the" + 0.171*"to" + 0.160*"pct" + 0.160*"in" + 0.141*"billion" + 0.140*"a" + 0.134*"s" + -0.133*"vs" + 0.128*"of" + 0.125*"said"
Tópico 2: 0.752*"loss" + 0.299*"profit" + -0.206*"vs" + -0.205*"cts" + -0.175*"div" + -0.174*"qtly" + -0.141*"record" + -0.127*"pay" + -0.123*"prior" + -0.122*"april"
Tópico 3: 0.329*"qtly" + 0.323*"loss" + 0.311*"div" + 0.269*"record" + 0.246*"prior" + 0.246*"pay" + 0.238*"cts" + 0.230*"april" + -0.209*"billion" + -0.197*"mln"
Tópico 4: -0.478*"billion" + -0.332*"stg" + -0.208*"bank" + -0.174*"money" + -0.146*"loss" + -0.144*"pct" + -0.126*"january" + -0.126*"february" + 0.125*"oper" + -0.119*"profit"
Tópico 5: -0.479*"oper" + 0.372*"stg" + -0.339*"billion" + -0.234*"dlrs" + 0.151*"bank" + -0.136*"excludes" + -0.131*"february" + 0.126*"market" + -0.126*"january" + 0.116*"money"
Tópico 6: 0.5

In [28]:
coherence_model_lsa = CoherenceModel(model=lsi_model_1_1, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

Coherence Score:  0.32556757125540015


#### LDA

### Modelo 2

Removing stopwords, and stemming

In [31]:
def preprocess_data_LSA(doc_set):
    en_stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    texts = []
    for tokens in doc_set:
        stopped_tokens = [i for i in tokens if not i in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    return texts

In [32]:
documents_stem = preprocess_data_LSA(documents)

In [33]:
dictionary_stem = Dictionary(documents_stem)
doc_term_stem = [dictionary_stem.doc2bow(doc) for doc in documents_stem]

#### LSA

In [34]:
lsi_model_2 = LSA_model(doc_term_stem)

In [35]:
for i, topic in lsi_model_2.print_topics(num_topics=10):
    print(f"Tópico {i}: {topic}")

Tópico 0: 0.529*"official" + 0.382*"japan" + 0.298*"cost" + 0.201*"many" + 0.190*"ready" + 0.185*"serves" + 0.134*"reuter" + 0.130*"australian" + 0.115*"have" + 0.110*"serious"
Tópico 1: 0.529*"ready" + 0.442*"japan" + -0.384*"official" + 0.226*"star" + 0.213*"century" + 0.189*"majority" + 0.176*"has" + 0.143*"cost" + -0.131*"many" + -0.116*"reuter"
Tópico 2: 0.903*"lyng" + 0.172*"attached" + 0.168*"ceilings" + 0.147*"replaces" + -0.097*"cost" + 0.088*"handling" + 0.085*"container" + 0.076*"director" + 0.063*"ita" + 0.063*"largest"
Tópico 3: 0.458*"cost" + -0.412*"ready" + 0.337*"japan" + -0.249*"star" + -0.204*"reuter" + -0.174*"majority" + -0.154*"restraining" + 0.152*"australian" + 0.149*"outlined" + -0.137*"has"
Tópico 4: -0.535*"many" + 0.261*"official" + -0.252*"australian" + 0.238*"outlined" + 0.196*"everything" + 0.186*"cost" + 0.182*"have" + -0.181*"vermin" + -0.178*"confident" + -0.174*"serves"
Tópico 5: 0.569*"japan" + -0.340*"many" + -0.282*"cost" + 0.199*"the" + -0.189*"st

In [36]:
coherence_model_lsa = CoherenceModel(model=lsi_model_2, texts=documents_stem, dictionary=dictionary_stem, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

Coherence Score:  0.2633238143421834


#### LDA

### Modelo 2.1 com TF-IDF

In [42]:
tfidf_2 = TfidfModel(doc_term_stem)
doc_term_stem_tfidf = tfidf_2[doc_term_stem]

#### LSA

In [43]:
lsi_model_2_1 = LSA_model(doc_term_stem_tfidf)

In [44]:
for i, topic in lsi_model_2_1.print_topics(num_topics=10):
    print(f"Tópico {i}: {topic}")

Tópico 0: 0.609*"ready" + 0.311*"star" + 0.300*"has" + 0.283*"century" + 0.281*"majority" + 0.248*"japan" + 0.179*"comprehensive" + 0.168*"totalled" + 0.142*"spotlight" + 0.123*"cost"
Tópico 1: -0.227*"australian" + -0.221*"many" + -0.206*"vermin" + -0.146*"reuter" + -0.144*"confident" + -0.141*"official" + -0.122*"restraining" + 0.122*"ready" + -0.113*"import" + -0.113*"cost"
Tópico 2: 0.758*"has" + 0.285*"totalled" + -0.209*"star" + -0.200*"ready" + -0.181*"stubborn" + -0.181*"believe" + -0.139*"mounting" + -0.128*"economics" + -0.126*"smithson" + -0.124*"among"
Tópico 3: 0.340*"has" + 0.322*"stubborn" + 0.303*"believe" + 0.256*"mounting" + 0.242*"star" + 0.240*"smithson" + 0.236*"economics" + -0.231*"australian" + 0.228*"among" + -0.196*"japan"
Tópico 4: -0.362*"members" + -0.335*"australian" + 0.302*"the" + -0.274*"vermin" + -0.191*"cases" + 0.138*"director" + 0.134*"nsw" + -0.123*"has" + 0.115*"who" + -0.104*"introduction"
Tópico 5: 0.531*"australian" + -0.434*"members" + -0.199*"

In [45]:
coherence_model_lsa = CoherenceModel(model=lsi_model_2_1, texts=documents_stem, dictionary=dictionary_stem, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

Coherence Score:  0.33477781645232574


#### LDA

### Modelo 3

Removing stopwords, and lemmatization and relevant words (Pos tagging)

In [None]:
def preprocess_data_LSA_2(doc_set):
    nlp = spacy.load("en_core_web_sm")
    en_stop = set(stopwords.words('english'))
    texts = []
    for tokens in doc_set:
        doc = nlp(" ".join(tokens))
        lemmas = [
            token.lemma_.lower()
            for token in doc
            if token.lemma_.lower() not in en_stop  
            and token.pos_ in ("NOUN", "PROPN", "ADJ")  
        ] 
        texts.append(lemmas)
    return texts

In [None]:
nlp = spacy.load("en_core_web_sm")
en_stop = set(stopwords.words('english'))

def preprocess_data_LSA_2_1(doc_set, bigram_flag, min_len=0):
    texts = []
    for tokens in doc_set:
        doc = nlp(" ".join(tokens))
        lemmas = [
            token.lemma_.lower()
            for token in doc
            if token.lemma_.lower() not in en_stop  
            and token.pos_ in ("NOUN", "PROPN", "ADJ")  
        ] 
        if len(lemmas) >= min_len:
            texts.append(lemmas)
    
    if bigram_flag:
        bigram = Phrases(texts, min_count=5, threshold=15)
        bigram_mod = Phraser(bigram)
        texts = [bigram_mod[doc] for doc in texts]
        
    return texts

In [None]:
documents_lemma = preprocess_data_LSA_2_1(documents, 1, 10)

In [None]:
dictionary_lemma = Dictionary(documents_lemma)
doc_term_lemma = [dictionary_lemma.doc2bow(doc) for doc in documents_lemma]

#### LSA

In [None]:
lsi_model_3 = LSA_model(doc_term_lemma, 30)

In [None]:
for i, topic in lsi_model_3.print_topics(num_topics=10):
    print(f"Tópico {i}: {topic}")

In [None]:
coherence_model_lsa = CoherenceModel(model=lsi_model_3, texts=documents_lemma, dictionary=dictionary_lemma, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

#### LDA

### Modelo 3.1 com TF-IDF

In [None]:
tfidf_3 = TfidfModel(doc_term_lemma)
doc_term_lemma_tfidf = tfidf_2[doc_term_lemma]

#### LSA

In [None]:
lsi_model_3_1 = LSA_model(doc_term_lemma_tfidf)

In [None]:
for i, topic in lsi_model_3_1.print_topics(num_topics=10):
    print(f"Tópico {i}: {topic}")

In [None]:
coherence_model_lsa = CoherenceModel(model=lsi_model_3_1, texts=documents_lemma, dictionary=dictionary_lemma, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

#### LDA

### Word cloud