In [None]:
# Install libraries if needed
# pip install pandas scikit-learn nltk tqdm

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm
import nltk
import re


In [None]:
# Read the Olist order-reviews CSV into a DataFrame.
reviews_csv_path = "/content/olist_order_reviews_dataset.csv"
df = pd.read_csv(reviews_csv_path)


In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
review_id,0
order_id,0
review_score,0
review_comment_title,87656
review_comment_message,58247
review_creation_date,0
review_answer_timestamp,0


In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence is not counted).
df.duplicated(keep="first").sum()

np.int64(0)

In [None]:
# Download the NLTK stopword corpus and build the Portuguese stopword list.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Negation words flip polarity ("não gostei" vs. "gostei"), so they must survive
# stopword filtering in a sentiment model. Any of these that appear in the NLTK
# list are removed from it; words not in the list are simply ignored.
NEGATION_WORDS = {"não", "nem", "sem"}
stop_words = [
    word for word in stopwords.words('portuguese') if word not in NEGATION_WORDS
]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:


# Discard reviews that have no comment text, then renumber the rows from 0.
df = df.dropna(subset=["review_comment_message"]).reset_index(drop=True)

# 🧹 Clean text
def clean_text(text):
    """Normalize a raw review comment for bag-of-words vectorization.

    Steps, in order:
      1. Coerce to ``str`` and lowercase.
      2. Replace URLs (tokens starting with ``http`` or ``www``) with a space.
      3. Replace every character that is not a Portuguese letter or whitespace
         with a space, so words separated only by punctuation or digits stay
         separate (``"travando...pelo"`` -> ``"travando pelo"``).
      4. Collapse runs of whitespace and strip the ends.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned, lowercase string (possibly empty).
    """
    text = str(text).lower()
    # `http\S+` already covers https URLs.
    text = re.sub(r"http\S+|www\S+", " ", text)
    # Text is lowercase here, so only lowercase letters need whitelisting.
    # Includes "à" and "ü", which are valid Portuguese letters.
    text = re.sub(r"[^a-záàâãéêíîóôõúûüç\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Store the normalized version of every comment in a new column.
df["clean_text"] = df["review_comment_message"].map(clean_text)



In [None]:
# -----------------------------------------
# ⚙️ Create sentiment labels from the review score
# -----------------------------------------
# Randomly assigned labels give the classifier nothing to learn, so its
# predictions would be noise. The data already has a `review_score` column with
# no missing values, so the label is derived from it instead:
# scores at or above POSITIVE_MIN_SCORE -> "Positive", everything else -> "Negative".
# NOTE(review): assumes review_score is a 1-5 star rating and that a neutral 3
# should count as "Negative" — confirm the threshold, or replace this with
# manual labels later.

import random  # no longer used in this cell; kept in case other cells rely on it

POSITIVE_MIN_SCORE = 4
df["label"] = (
    df["review_score"]
    .ge(POSITIVE_MIN_SCORE)
    .map({True: "Positive", False: "Negative"})
)



In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)



In [None]:
# Build TF-IDF features: the vocabulary (capped at 5000 terms, stopwords excluded)
# is learned from the training texts only and then reused for the test texts.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words=stop_words,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [None]:
# Train a logistic-regression classifier on the TF-IDF training matrix.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out split before the model is used to label anything,
# so per-class precision/recall are visible in the notebook.
print(classification_report(y_test, model.predict(X_test_tfidf)))



In [None]:
# Vectorize every cleaned comment with the fitted TF-IDF vocabulary,
# then store the model's predicted label for each row.
all_reviews_tfidf = tfidf.transform(df["clean_text"])
df["sentiment"] = model.predict(all_reviews_tfidf)




In [None]:
# Write the original comment and its predicted sentiment to CSV (no index column).
df.to_csv(
    "sentiment_results_fast.csv",
    columns=["review_comment_message", "sentiment"],
    index=False,
)



In [None]:
# Confirm completion and preview the first few predictions.
result_columns = ["review_comment_message", "sentiment"]
print("✅ Sentiment analysis complete! Saved as sentiment_results_fast.csv")
print(df.loc[:, result_columns].head())

✅ Sentiment analysis complete! Saved as sentiment_results_fast.csv
                              review_comment_message sentiment
0              Recebi bem antes do prazo estipulado.  Negative
1  Parabéns lojas lannister adorei comprar pela I...  Negative
2  aparelho eficiente. no site a marca do aparelh...  Negative
3    Mas um pouco ,travando...pelo valor ta Boa.\r\n  Positive
4  Vendedor confiável, produto ok e entrega antes...  Negative


In [None]:
# Display the predicted sentiment column.
df.loc[:, "sentiment"]

Unnamed: 0,sentiment
0,Negative
1,Negative
2,Negative
3,Positive
4,Negative
...,...
40972,Negative
40973,Positive
40974,Positive
40975,Positive


In [None]:
# Re-check for fully duplicated rows now that the derived columns exist.
df.duplicated(keep="first").sum()

np.int64(0)

In [None]:
# Count rows whose order_id already appeared in an earlier row.
df.duplicated(subset="order_id").sum()

np.int64(141)