# Sentiment analysis

In [457]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

In [458]:
total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")
total_data.to_csv("/workspaces/ml-proyect-naive-bayes/data/raw/playstore_reviews.csv")

In [459]:
total_data

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [460]:
import contractions

def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

In [461]:
total_data["review"] = total_data["review"].apply(expand_contractions)

In [462]:
total_data.drop("package_name", axis=1, inplace=True)

In [463]:
total_data["review"] = total_data["review"].str.strip().str.lower()

In [464]:
total_data["review"].head(10)

0    privacy at least put some option appear offlin...
1    messenger issues ever since the last update, i...
2    profile any time my wife or anybody has more t...
3    the new features suck for those of us who do n...
4    forced reload on uploading pic on replying com...
5    i do not know i cannot edit my posts? things s...
6    major flaws constant updates and always gettin...
7    video issues since i was forced into this upda...
8    this update completely destroyed my facebook. ...
9    posting issues for the last week, there is bee...
Name: review, dtype: object

In [465]:
total_data["review"] = total_data["review"].replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)

In [466]:
total_data["review"].head(10)

0    privacy at least put some option appear offlin...
1    messenger issues ever since the last update  i...
2    profile any time my wife or anybody has more t...
3    the new features suck for those of us who do n...
4    forced reload on uploading pic on replying com...
5    i do not know i cannot edit my posts  things s...
6    major flaws constant updates and always gettin...
7    video issues since i was forced into this upda...
8    this update completely destroyed my facebook  ...
9    posting issues for the last week  there is bee...
Name: review, dtype: object

In [467]:
X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 24, stratify= y)

X_train.head()

287    not letting me log in  i try to log into my ac...
815    thanks you so much for adding sanshkar fm kala...
241    kill pages i will enter a url or open a new li...
697    its really lovely apps   i am allready using a...
37     used to be good everything was great  until i ...
Name: review, dtype: object

In [468]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [469]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [470]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [471]:
y_pred = model.predict(X_train)
y_pred

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,

In [472]:
accuracy_score(y_train, y_pred)

0.9536516853932584

In [473]:
y_pred = model.predict(X_test)
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0])

In [474]:
accuracy_score(y_test, y_pred)

0.8491620111731844

In [475]:
hyperparams = {"alpha": [1, 1.5, 1.8, 1.9, 2, 3, 4],
"force_alpha": [True, False],
"fit_prior": [True, False],
"class_prior": [[0.3, 0.7], [0.4, 0.6], [0.2, 0.8], None]}

In [476]:
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 5)
grid

In [477]:
grid.fit(X_train, y_train)

In [478]:
grid.best_params_

{'alpha': 3, 'class_prior': [0.3, 0.7], 'fit_prior': True, 'force_alpha': True}

In [479]:
best_model = grid.best_estimator_

In [480]:
y_pred = best_model.predict(X_train)
accuracy_score(y_train, y_pred)

0.9283707865168539

In [481]:
y_pred = best_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8603351955307262

In [482]:
accs = []
for combination in ParameterGrid(hyperparams):
    model = MultinomialNB(**combination)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accs.append(accuracy_score(y_train, y_pred))

best_acc = max(accs)
best_combination = list(ParameterGrid(hyperparams))[accs.index(best_acc)]
best_model = MultinomialNB(**best_combination)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_train)
accuracy_score(y_train, y_pred)

0.9606741573033708

In [483]:
best_combination

{'alpha': 1, 'class_prior': None, 'fit_prior': False, 'force_alpha': True}

In [484]:
y_pred = best_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8435754189944135