# AC

Este notebook implementa um pipeline completo de **Análise de Sentimentos** para o corpus **B2W-Reviews01**.
O objetivo é classificar avaliações em três categorias: 
- **positiva**
- **negativa**
- **neutra**

## Imports

In [None]:
import pandas as pd
import numpy as np
import re, string, html
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import joblib
import nltk, spacy, warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings the
# models below may emit — confirm that is intended.
warnings.filterwarnings('ignore')

# Fetch the NLTK stop-word corpus that stopwords.words() reads below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# spaCy Portuguese model: try to load it; if spaCy raises OSError, download
# the package and load again.
try:
    nlp = spacy.load('pt_core_news_sm')
except OSError:
    # `!` is an IPython shell escape, so this cell only runs inside a notebook.
    !python -m spacy download pt_core_news_sm
    nlp = spacy.load('pt_core_news_sm')

# Stored as a set because clean_text does a membership test per token.
stop_words = set(stopwords.words('portuguese'))

## Data Ingestion

In [None]:
# Path to the review corpus CSV; being relative, it is resolved against the
# current working directory.
DATA_PATH = 'B2W-Reviews01.csv'

df = pd.read_csv(DATA_PATH)
print(f"Shape before dropna: {df.shape}")

# Discard rows whose review text is missing; every later step needs the text.
df = df.dropna(subset=['review_text'])
print(f"Shape after dropna: {df.shape}")

# Last expression of the cell: displays the first rows in the notebook.
df.head()

## Label Engineering

In [None]:
def rating_to_sentiment(r):
    """Map a numeric rating to a three-way sentiment label.

    Args:
        r: Rating value; it is compared numerically against 2 and 3.

    Returns:
        'neg' when ``r <= 2``, 'neu' when ``r == 3`` and 'pos' for any other
        value. Both comparisons are False for NaN, so a missing rating also
        ends up as 'pos'.
    """
    if r <= 2:
        return 'neg'
    return 'neu' if r == 3 else 'pos'

# Derive the `sentiment` target column.
# Ratings are mapped through rating_to_sentiment. `rating` stays the first
# choice so data that already worked behaves the same; `overall_rating` is
# accepted as a fallback because that is the column name used by the original
# B2W-Reviews01 release. When no rating column exists, a ready-made `polarity`
# column is copied unchanged.
rating_col = next(
    (col for col in ('rating', 'overall_rating') if col in df.columns),
    None,
)

if rating_col is not None:
    df['sentiment'] = df[rating_col].apply(rating_to_sentiment)
elif 'polarity' in df.columns:
    df['sentiment'] = df['polarity']
else:
    raise KeyError('Coluna de rating/polarity não encontrada.')

# Last expression of the cell: displays the label distribution.
df['sentiment'].value_counts()

## Text Cleaning & Lemmatization

In [None]:
def clean_text(text):
    """Normalize one review into a space-joined string of lemmas.

    Steps, in order: decode HTML entities; replace tag-like ``<...>`` spans,
    ASCII punctuation and digit runs with a space; lowercase; drop stop words
    and tokens of two characters or fewer; run the remaining words through
    the spaCy pipeline ``nlp`` and keep each token's lemma.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The lemmas joined by single spaces; an empty string when nothing
        survives the filtering.
    """
    # NOTE(review): '-PRON-' looks like the pronoun placeholder lemma of
    # older spaCy releases — confirm it is still needed with the installed
    # version.
    # NOTE(review): stop words are removed before lemmatization; if the NLTK
    # list contains negations (e.g. "não"), they are lost for sentiment —
    # verify this is intended.
    noise_patterns = (
        '<.*?>',
        f'[{re.escape(string.punctuation)}]',
        '\\d+',
    )

    cleaned = html.unescape(text)
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    kept_words = []
    for word in cleaned.lower().split():
        if len(word) > 2 and word not in stop_words:
            kept_words.append(word)

    parsed = nlp(' '.join(kept_words))
    return ' '.join(
        token.lemma_ for token in parsed if token.lemma_ != '-PRON-'
    )

# Build the model input column; astype(str) ensures clean_text always
# receives a string.
df['clean_review'] = df['review_text'].astype(str).apply(clean_text)
# Last expression of the cell: raw and cleaned text side by side.
df[['review_text', 'clean_review']].head()

## Split

In [None]:
# Features are the cleaned texts; targets are the sentiment labels.
X = df['clean_review']
y = df['sentiment']
# Hold out 20% for testing, stratified on the label so both parts keep the
# class proportions; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

print(X_train.shape, X_test.shape)

## Define Pipelines

In [None]:
def _tfidf_bigrams():
    """Return a fresh TF-IDF vectorizer over unigrams and bigrams (min_df=5)."""
    return TfidfVectorizer(min_df=5, ngram_range=(1, 2))


# Baseline candidates keyed by a short name. Each pipeline owns its own
# vectorizer, so fitting one candidate never affects another. Insertion order
# is the order in which the benchmark loop trains them.
pipelines = {
    'NB_BoW': Pipeline(steps=[
        ('vect', CountVectorizer(min_df=5)),
        ('clf', MultinomialNB()),
    ]),
    'LR_TFIDF': Pipeline(steps=[
        ('tfidf', _tfidf_bigrams()),
        ('clf', LogisticRegression(max_iter=200, n_jobs=-1, C=5.0)),
    ]),
    'SVM_TFIDF': Pipeline(steps=[
        ('tfidf', _tfidf_bigrams()),
        ('clf', LinearSVC(C=1.0)),
    ]),
}

## Train & Evaluate Benchmarks

In [None]:
# Fit every candidate on the training split and score it on the test split.
# `results` maps model name -> macro-averaged F1, which the selection cell
# uses to pick a winner.
results = {}
for name in pipelines:
    pipe = pipelines[name].fit(X_train, y_train)
    preds = pipe.predict(X_test)

    # Dict form of the report, used only to extract the macro F1.
    report = classification_report(y_test, preds, output_dict=True)
    macro_avg = report['macro avg']
    results[name] = macro_avg['f1-score']

    print(f"\nModel: {name}")
    print(classification_report(y_test, preds))

## Select Best

In [None]:
# Winner = entry of `results` with the highest macro-F1; on a tie the model
# that was inserted first is kept.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
best_pipeline = pipelines[best_model_name]

print(f"Best baseline model: {best_model_name} | F1-macro = {results[best_model_name]:.3f}")

## Modelos (LightGBM + Word2Vec)

In [None]:
# Notebook-only install step: a bare `pip install` is not Python syntax and
# depends on IPython treating the line as a magic command.
pip install gensim lightgbm optuna

In [None]:
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

In [None]:
# Whitespace-tokenize the training texts only, so the embeddings are learned
# without ever seeing the test split.
tok_corpus = list(map(str.split, X_train))
w2v = Word2Vec(
    sentences=tok_corpus,
    vector_size=100,
    window=5,
    min_count=3,
    workers=4,
    epochs=10,
)

def avg_w2v(sentence):
    """Embed a sentence as the mean of its in-vocabulary Word2Vec vectors.

    Args:
        sentence: Text whose tokens are obtained with ``str.split``.

    Returns:
        A 1-D array with one entry per embedding dimension: the element-wise
        mean of the vectors of the tokens present in ``w2v.wv``, or an
        all-zero vector when none of the tokens is in the vocabulary.
    """
    vecs = [w2v.wv[token] for token in sentence.split() if token in w2v.wv]
    if vecs:
        return np.mean(vecs, axis=0)
    # Take the width from the trained vectors instead of repeating the
    # literal 100, so the fallback cannot drift from the `vector_size` that
    # `w2v` was trained with.
    return np.zeros(w2v.wv.vector_size)

# One averaged embedding per review, stacked into a 2-D feature matrix.
X_train_w2v = np.vstack([avg_w2v(text) for text in X_train])
X_test_w2v = np.vstack([avg_w2v(text) for text in X_test])

# Encode the string labels as integers; the encoder learns the classes from
# the training labels and is reused unchanged on the test labels.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# This model is only reported here; it is not added to `results`, so it does
# not take part in the best-model selection.
lgbm = LGBMClassifier(n_estimators=300, learning_rate=0.1, num_leaves=31)
lgbm.fit(X_train_w2v, y_train_enc)
preds = lgbm.predict(X_test_w2v)
print("\nLightGBM+Word2Vec")
print(classification_report(y_test_enc, preds, target_names=le.classes_))

## Persist Best Model

In [None]:
# Serialize the selected pipeline (vectorizer + classifier) to disk.
# clean_text is not a pipeline step, so whoever loads this file must apply
# the same cleaning to raw text before calling predict.
MODEL_PATH = 'sentiment_pipeline.joblib'
joblib.dump(best_pipeline, MODEL_PATH)

print(f"Pipeline salvo em {MODEL_PATH}")

## Quick Inference

In [None]:
# Smoke test on one raw sentence: clean it the same way as the training
# data, then predict with the selected pipeline.
sample_text = "Entrega super rápida, produto de excelente qualidade. Recomendo!"
clean_sample = clean_text(sample_text)
# predict takes a collection of documents, hence the one-element list.
pred = best_pipeline.predict([clean_sample])[0]
print(f"Texto: {sample_text}\nSentimento previsto: {pred}")