In [38]:
import nltk
from nltk.corpus import reuters
import pandas as pd
import re

# Fetch the Reuters corpus through NLTK's downloader.
nltk.download('reuters')

files = reuters.fileids()

# One record per document: its id, its category labels joined into a single
# ", "-separated string, and its raw text.
dados = [
    {
        "file_id": doc_id,
        "categorias": ", ".join(reuters.categories(doc_id)),
        "texto": reuters.raw(doc_id),
    }
    for doc_id in files
]

df = pd.DataFrame(dados)

# Persist the table as a UTF-8 CSV, leaving the DataFrame index out of the file.
df.to_csv("reuters.csv", index=False, encoding="utf-8")



[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\strik\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [62]:
# NOTE(review): no cell visible before this one writes random_reuters_samples.csv;
# confirm the file exists before this cell runs on a fresh kernel.
df_reuters = pd.read_csv('reuters.csv')
df_random_samples = pd.read_csv('random_reuters_samples.csv')

# Report how many rows each file holds.
n_reuters_rows = df_reuters.shape[0]
n_sample_rows = df_random_samples.shape[0]
print(f"Number of rows in reuters.csv: {n_reuters_rows}")
print(f"Number of rows in random_reuters_samples.csv: {n_sample_rows}")

Number of rows in reuters.csv: 10788
Number of rows in random_reuters_samples.csv: 10


### Análise Exploratória

In [44]:
import pandas as pd

In [45]:
# Load the exported corpus CSV into a DataFrame.
# Gotcha: this rebinds `reuters`, which an earlier cell imported from nltk.corpus,
# so from this point on the name refers to the DataFrame, not the corpus reader.
reuters = pd.read_csv('reuters.csv')

In [46]:
# Preview the first rows of the loaded frame (file_id, categorias, texto).
reuters.head()

Unnamed: 0,file_id,categorias,texto
0,test/14826,trade,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,grain,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"crude, nat-gas",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"corn, grain, rice, rubber, sugar, tin, trade",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"palm-oil, veg-oil",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...


In [47]:
# Per-column summary; for these text columns it reports count, unique, top and freq.
reuters.describe()

Unnamed: 0,file_id,categorias,texto
count,10788,10788,10788
unique,10788,468,10657
top,test/14826,earn,26-FEB-1987\n 26-FEB-1987\n\n
freq,1,3923,7


In [48]:
# Collect every distinct category label. Each row stores its labels as one
# comma-separated string, so split it, trim the whitespace around each piece,
# and skip pieces that end up empty.
categories = {
    label.strip()
    for labels_field in reuters["categorias"]
    for label in labels_field.split(",")
    if label.strip() != ""
}
print(len(categories))

90


In [49]:
# Display the set of distinct category labels.
categories

{'acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee',
 'copper',
 'copra-cake',
 'corn',
 'cotton',
 'cotton-oil',
 'cpi',
 'cpu',
 'crude',
 'dfl',
 'dlr',
 'dmk',
 'earn',
 'fuel',
 'gas',
 'gnp',
 'gold',
 'grain',
 'groundnut',
 'groundnut-oil',
 'heat',
 'hog',
 'housing',
 'income',
 'instal-debt',
 'interest',
 'ipi',
 'iron-steel',
 'jet',
 'jobs',
 'l-cattle',
 'lead',
 'lei',
 'lin-oil',
 'livestock',
 'lumber',
 'meal-feed',
 'money-fx',
 'money-supply',
 'naphtha',
 'nat-gas',
 'nickel',
 'nkr',
 'nzdlr',
 'oat',
 'oilseed',
 'orange',
 'palladium',
 'palm-oil',
 'palmkernel',
 'pet-chem',
 'platinum',
 'potato',
 'propane',
 'rand',
 'rape-oil',
 'rapeseed',
 'reserves',
 'retail',
 'rice',
 'rubber',
 'rye',
 'ship',
 'silver',
 'sorghum',
 'soy-meal',
 'soy-oil',
 'soybean',
 'strategic-metal',
 'sugar',
 'sun-meal',
 'sun-oil',
 'sunseed',
 'tea',
 'tin',
 'trade',
 'veg-oil',
 'wheat',
 'wpi',
 'yen',
 'zinc'}

### Pré-Processamento Simples

In [50]:
def preprocess_dataframe(df, text_column="texto"):
    """Return a copy of ``df`` whose ``text_column`` is reduced to lowercase words.

    Each value is converted to ``str`` and lowercased, every character other
    than ``a``-``z`` or whitespace is replaced by a space, runs of whitespace
    are collapsed into a single space, and leading/trailing whitespace is
    removed. Missing values (``None``/``NaN``) become the empty string instead
    of being stringified into spurious tokens such as ``"nan"``.

    Args:
        df: Input DataFrame. It is not modified; the work is done on a copy.
        text_column: Name of the column to clean.

    Returns:
        A new DataFrame equal to ``df`` except for the cleaned ``text_column``.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    cleaned = df.copy()

    cleaned[text_column] = (
        cleaned[text_column]
        # Fill missing entries first so astype(str) cannot turn them into
        # the literal words "nan"/"None".
        .fillna("")
        .astype(str)
        .str.lower()
        # Anything that is not a-z or whitespace (digits, punctuation, and
        # also accented/non-ASCII letters) becomes a space.
        .str.replace(r"[^a-z\s]", " ", regex=True)
        # Collapse the whitespace runs left behind by the previous step.
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

    return cleaned

In [63]:
# Clean the "texto" column of the loaded corpus. The `reuters` frame itself is
# left untouched because preprocess_dataframe works on a copy.
df_simple = preprocess_dataframe(reuters, text_column="texto")
df_simple.head()

Unnamed: 0,file_id,categorias,texto
0,test/14826,trade,asian exporters fear damage from u s japan rif...
1,test/14828,grain,china daily says vermin eat pct grain stocks a...
2,test/14829,"crude, nat-gas",japan to revise long term energy demand downwa...
3,test/14832,"corn, grain, rice, rubber, sugar, tin, trade",thai trade deficit widens in first quarter tha...
4,test/14833,"palm-oil, veg-oil",indonesia sees cpo price rising sharply indone...


### Separação dos 10 Documentos

In [64]:
# Hold out 10 documents and keep the remaining ones in df_simple.
# The split is derived from a freshly preprocessed frame rather than from
# df_simple itself: the previous form reassigned df_simple from its own value,
# so every re-run of the cell removed 10 more rows and selected different
# documents. Built this way, re-running always yields the same hold-out and
# the same remainder.
df_simple_full = preprocess_dataframe(reuters, text_column="texto")
df_simple10 = df_simple_full.sample(10, random_state=42)
df_simple = df_simple_full.drop(df_simple10.index).reset_index(drop=True)

# The first count is the remainder after the hold-out; the second is df_simple10.
print("Linhas no df_simple (original):", len(df_simple))
print("Linhas no df_reserved (10 docs):", len(df_simple10))
df_simple10.head()

Linhas no df_simple (original): 10778
Linhas no df_reserved (10 docs): 10


Unnamed: 0,file_id,categorias,texto
4593,training/12421,earn,bayer world group pre tax profit billion marks...
8353,training/6220,earn,marcor lt maar expects fiscal year profit marc...
3614,training/10921,earn,computer microfilm corp lt comi year net shr c...
10382,training/9348,ship,iran says has better weapons than silkworm ira...
8048,training/5707,earn,transamerica income lt tai monthly dividend sh...
