In [3]:
import pandas as pd
import re

# Read the labelled reviews straight from disk, one list entry per line
# (replace the path with the correct location of the file on your machine).
with open('data/test.ft.txt', mode='r', encoding='utf-8') as review_file:
    raw_data = list(review_file)

# Split one raw line into its label and its review text
def process_review_line(line):
    """Parse a line of the form ``__label__<digit> <review text>``.

    Args:
        line: A single line of the input file; the caller is expected to have
            stripped the trailing newline already.

    Returns:
        A ``(label, review)`` tuple of strings when the line starts with
        ``__label__`` followed by exactly one digit, a single space and at
        least one character of text; ``(None, None)`` otherwise.
    """
    parsed = re.match(r'(__label__\d) (.+)', line)
    if parsed is None:
        # Line does not follow the expected layout
        return None, None
    return parsed.group(1), parsed.group(2)

# Parse every line and keep only those that yielded both a label and a review
parsed_lines = (process_review_line(raw_line.strip()) for raw_line in raw_data)
valid_pairs = [(lbl, txt) for lbl, txt in parsed_lines if lbl and txt]

labels = [lbl for lbl, _ in valid_pairs]
reviews = [txt for _, txt in valid_pairs]

# Assemble the tidy DataFrame
df_test = pd.DataFrame({'label': labels, 'review': reviews})

# Show the first rows of the DataFrame
print(df_test.head())

        label                                             review
0  __label__2  Great CD: My lovely Pat has one of the GREAT v...
1  __label__2  One of the best game music soundtracks - for a...
2  __label__1  Batteries died within a year ...: I bought thi...
3  __label__2  works fine, but Maha Energy is better: Check o...
4  __label__2  Great for the non-audiophile: Reviewed quite a...


In [4]:
# Map the string labels to numeric values.
# The mapping overwrites the 'label' column in place, so it is only applied
# while the column is still non-numeric: running .map() a second time on the
# already-converted column would match none of the dictionary keys and turn
# every label into NaN.
if not pd.api.types.is_numeric_dtype(df_test['label']):
    df_test['label'] = df_test['label'].map({'__label__1': 1, '__label__2': 2})
df_test.head()

Unnamed: 0,label,review
0,2,Great CD: My lovely Pat has one of the GREAT v...
1,2,One of the best game music soundtracks - for a...
2,1,Batteries died within a year ...: I bought thi...
3,2,"works fine, but Maha Energy is better: Check o..."
4,2,Great for the non-audiophile: Reviewed quite a...


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this notebook needs
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# English stop-word set and lemmatizer used by the tokenizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Tokenization and lemmatization helper
def tokenize(text):
    """Turn a piece of text into a list of lowercased, lemmatized tokens.

    Every run of word characters (``\\w+``) is taken as a candidate token and
    lowercased. Candidates found in the module-level ``stop_words`` set are
    dropped; the remaining ones are passed through the module-level
    ``lemmatizer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in their order of appearance.
    """
    tokens = []
    for word in re.findall(r"\w+", text):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(lowered))
    return tokens

# Tokenize every review and store the result in a new 'tokens' column
df_test['tokens'] = df_test['review'].map(tokenize)
print(df_test['tokens'].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joaom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joaom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\joaom\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0    [great, cd, lovely, pat, one, great, voice, ge...
1    [one, best, game, music, soundtrack, game, rea...
2    [battery, died, within, year, bought, charger,...
3    [work, fine, maha, energy, better, check, maha...
4    [great, non, audiophile, reviewed, quite, bit,...
Name: tokens, dtype: object


In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data into training and hold-out sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(df_test['review'], df_test['label'], test_size=0.2, random_state=42)

# Pipeline: Bag-of-Words features followed by a Naive Bayes classifier.
# token_pattern=None because a custom tokenizer is supplied: the default regex
# pattern would be ignored anyway, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" warning on fit.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('clf', MultinomialNB())
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model on the hold-out set
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           1       0.84      0.85      0.84     39896
           2       0.85      0.83      0.84     40104

    accuracy                           0.84     80000
   macro avg       0.84      0.84      0.84     80000
weighted avg       0.84      0.84      0.84     80000



In [11]:
from sklearn.metrics import classification_report, balanced_accuracy_score

# Evaluate the model with balanced_accuracy_score.
# y_pred is reused from the evaluation cell that ran pipeline.predict(X_test)
# right after fitting; predicting again would re-tokenize and re-lemmatize the
# whole hold-out set only to produce the same predictions.
print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))

# Show the classification report
print(classification_report(y_test, y_pred))


Balanced Accuracy Score: 0.8427116242305798
              precision    recall  f1-score   support

           1       0.84      0.85      0.84     39896
           2       0.85      0.83      0.84     40104

    accuracy                           0.84     80000
   macro avg       0.84      0.84      0.84     80000
weighted avg       0.84      0.84      0.84     80000



In [13]:
# Pull the fitted steps out of the pipeline
vectorizer = pipeline.named_steps['vect']
classifier = pipeline.named_steps['clf']

# Vocabulary terms and the per-class log-probability of each term
feature_names = vectorizer.get_feature_names_out()
class_log_prob = classifier.feature_log_prob_

# Indices of the 10 terms with the highest log-probability in each class,
# ordered from highest to lowest. Rows 0 and 1 are taken to be __label__1
# (negative) and __label__2 (positive) respectively.
top_words_class_1 = class_log_prob[0].argsort()[::-1][:10]
top_words_class_2 = class_log_prob[1].argsort()[::-1][:10]

# Show the selected terms for each class
print("Top palavras para __label__1 (negativas):", list(feature_names[top_words_class_1]))
print("Top palavras para __label__2 (positivas):", list(feature_names[top_words_class_2]))


Top palavras para __label__1 (negativas): ['book', 'one', 'like', 'would', 'time', 'get', 'good', 'movie', 'even', 'product']
Top palavras para __label__2 (positivas): ['book', 'great', 'one', 'good', 'like', 'love', 'read', 'time', 'well', 'get']
