# Notebook 2 – Treino e Previsão

Treina um novo modelo (TF‑IDF + RandomForest), salva o pipeline treinado e rotula uma amostra aleatória de 5 000 documentos, gravando o resultado no MongoDB e no Elasticsearch.

In [None]:
import pandas as pd, numpy as np, joblib
from pymongo import MongoClient
from elasticsearch import Elasticsearch, helpers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# --- Load the labelled corpus -------------------------------------------------
# Only the text and its numeric label are needed for training; `_id` is
# excluded so the DataFrame holds plain Python values.
mongo = MongoClient('mongodb://localhost:27017')['lab_ml']
raw_docs = list(mongo.tweets_raw.find({}, {'clean_text': 1, 'Label': 1, '_id': 0}))
df = pd.DataFrame(raw_docs)

# Numeric class ids -> human-readable class names.
label_map = {0: 'Politics', 1: 'Sport', 2: 'Technology', 3: 'Entertainment', 4: 'Business'}
df['Label'] = df['Label'].map(label_map)

# `.map` turns any label outside `label_map` into NaN, and documents without
# `clean_text` also show up as NaN. Either would make the stratified split or
# the vectorizer raise, so incomplete rows are discarded up front.
df = df.dropna(subset=['clean_text', 'Label'])

# --- Train / evaluate ---------------------------------------------------------
# 70/30 split, stratified so every class keeps its proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['Label'], test_size=0.3, random_state=42, stratify=df['Label']
)

# Bag-of-words counts -> TF-IDF weighting -> random forest classifier.
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=300, random_state=42)),
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
print('Acurácia:', accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# Persist the whole fitted pipeline (vectorizer + classifier) in one file.
joblib.dump(pipeline, 'rf_tfidf_pipeline.joblib')

# --- Label a random sample ----------------------------------------------------
# `$sample` returns full documents, including the Mongo `_id` (ObjectId).
sample = list(mongo.tweets_raw.aggregate([{'$sample': {'size': 5000}}]))
# Documents without text cannot be vectorized; drop them and renumber the
# index so the Elasticsearch ids below stay contiguous.
sample_df = pd.DataFrame(sample).dropna(subset=['clean_text']).reset_index(drop=True)
sample_df['predicted_label'] = pipeline.predict(sample_df['clean_text'])

# Replace the previous run's predictions in Mongo. The original `_id` is kept
# here so each prediction can be traced back to its source document.
mongo.tweets_predicted.delete_many({})
mongo.tweets_predicted.insert_many(sample_df.to_dict('records'))

# Elasticsearch treats `_id` as a metadata field and rejects it inside the
# document body, and an ObjectId is not JSON-serializable anyway, so the
# column is removed from the indexed payload. The row position is used as the
# Elasticsearch document id, which makes re-runs overwrite earlier documents.
# NOTE(review): any other column holding NaN or BSON-specific types would
# also fail JSON serialization — confirm against the collection's schema.
es_df = sample_df.drop(columns=['_id'], errors='ignore')
es = Elasticsearch('http://localhost:9200')
helpers.bulk(es, ({
    '_index': 'tweets_predicted',
    '_id': str(i),
    '_source': rec
} for i, rec in es_df.to_dict('index').items()))

print('✅  Amostra rotulada e salva:', len(sample_df))
