In [None]:
!pip install nltk
!pip install spacy
!python -m spacy download pt_core_news_sm
nltk.download('punkt')

In [14]:
"""
João Assalim
https://www.kaggle.com/datasets/augustop/portuguese-tweets-for-sentiment-analysis?select=NoThemeTweets.csv
"""

'\nJoão Assalim\nhttps://www.kaggle.com/datasets/augustop/portuguese-tweets-for-sentiment-analysis?select=NoThemeTweets.csv\n'

In [55]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

import string

In [53]:
df = pd.read_csv('./olist.csv')
df = df.drop(["review_text", "review_text_tokenized", "polarity", "kfold_rating", "kfold_polarity", "original_index"], axis=1)
df.head()

Unnamed: 0,review_text_processed,rating
0,perfeito....chegou antes do prazo.....,5
1,foi uma otima compra! chegou antes mesmo do pr...,5
2,recebi muito rapido e um otimo custo beneficio,5
3,recomendo,5
4,so veio uma capa comprei 3 ai paguei. mais de ...,1


In [56]:
df["rating"] = np.where(df["rating"] < 3, 0, 1)

In [59]:
df.shape

(41744, 2)

In [60]:
df.isnull().sum()

review_text_processed    1
rating                   0
dtype: int64

In [61]:
df = df.dropna()

In [62]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [63]:
stop_words = nltk.corpus.stopwords.words('portuguese')

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41743 entries, 0 to 41743
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   review_text_processed  41743 non-null  object
 1   rating                 41743 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 978.4+ KB


In [65]:
tweets = df["review_text_processed"]
tokenization = [word_tokenize(text.lower()) for text in tweets]

In [66]:
lemmatizer = WordNetLemmatizer()

new_tweets = []

for phrase in tokenization:
  new_phrase = ""
  for token in phrase:
    if not str(token) in stop_words and not token in string.punctuation and "@" not in token and "http" not in token and len(token) > 1 and not token.isdigit():
      new_phrase += lemmatizer.lemmatize(str(token)) + " "
  new_tweets.append(new_phrase[:-1])

In [67]:
df["review_text_processed"] = new_tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["review_text_processed"] = new_tweets


In [70]:
vect_uni_cv = CountVectorizer(ngram_range=(1,1), stop_words=stop_words)
text_vect_uni_cv = vect_uni_cv.fit_transform(df["review_text_processed"])

X_trainUCV, X_testUCV, y_trainUCV, y_testUCV = train_test_split(text_vect_uni_cv, df["rating"], test_size=0.2, random_state=42)

vect_uni_idf = TfidfVectorizer(ngram_range=(1,1), use_idf=True, norm='l2', stop_words=stop_words)
text_vect_uni_idf = vect_uni_idf.fit_transform(df["review_text_processed"])

X_trainUIDF, X_testUIDF, y_trainUIDF, y_testUIDF = train_test_split(text_vect_uni_idf, df["rating"], test_size=0.2, random_state=42)

In [71]:
rfcUCV = RandomForestClassifier()

rfcUCV.fit(X_trainUCV, y_trainUCV)
y_predUCV = rfcUCV.predict(X_testUCV)

acUCV = accuracy_score(y_testUCV, y_predUCV)

print(f'Score Count Vectorizer Random Forest: {acUCV*100:.2f}%')

Score Count Vectorizer Random Forest: 88.81%


In [72]:
rfcidf = RandomForestClassifier()

rfcidf.fit(X_trainUIDF, y_trainUIDF)
y_predUIDF = rfcidf.predict(X_testUIDF)

acUIDF = accuracy_score(y_testUIDF, y_predUIDF)

print(f'Score TFIDF Random Forest: {acUIDF*100:.2f}%')

Score TFIDF Random Forest: 89.18%


In [73]:
dtrUVC = DecisionTreeClassifier()

dtrUVC.fit(X_trainUCV, y_trainUCV)

acUCV = dtrUVC.score(X_testUCV, y_testUCV)

print(f'Score Count Vectorizer Decision Tree Classifier: {acUCV*100:.2f}%')

Score Count Vectorizer Decision Tree Classifier: 84.88%


In [74]:
dtridf = DecisionTreeClassifier()

dtridf.fit(X_trainUIDF, y_trainUIDF)

acidf = dtridf.score(X_testUCV, y_testUCV)

print(f'Score TFIDF Decision Tree Classifier: {acidf*100:.2f}%')

Score TFIDF Decision Tree Classifier: 71.10%


In [75]:
import joblib
model_filename = 'sentiment_analyser.pkl'
joblib.dump((rfcidf, vect_uni_idf), model_filename)

['sentiment_analyser.pkl']