 # 0. Import libraries

In [113]:
# Basics Imports
import pandas as pd 
import numpy as np
import datetime
from pickle import dump


# Sklearn Imports
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

import warnings
def warn(*args, **kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_columns', None)

---

# 1. Problem statement and data collection

## 1.1 Description of the problem

In this case, we have only 3 variables: 2 predictors and a dichotomous label. Of the two predictors, we are really only interested in the comment part, since the fact of classifying a comment as positive or negative will depend on its content, not on the application from which it was written. Therefore, the package_name variable should be removed.

## 1.2 Data collection

In [114]:
path = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

df_download = pd.read_csv(path)
df_download.to_csv("../data/raw/playstore_reviews.csv", index=False)

df = pd.read_csv("/workspace/naive_bayes/data/raw/playstore_reviews.csv")
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


## 1.3 Understanding the features

In [115]:
df.columns

Index(['package_name', 'review', 'polarity'], dtype='object')

- ``package_name`` -> Name of the mobile application (categorical)
- ``review`` -> Comment about the mobile application (categorical)
- ``polarity`` -> Class variable (0 or 1), being 0 a negative comment and 1, positive (numeric)

## 1.4 Data exploration

In [116]:
print(f'Our dataframe contains {len(df)} rows and it has {df.shape[1]} features.')

Our dataframe contains 891 rows and it has 3 features.


In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [118]:
df.drop(columns="package_name", inplace=True)
df["review"] = df["review"].str.lower().str.strip()
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


---

# 3. Train and Test Split

## 3.1 Split df

In [119]:
X = df["review"]
y = df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head(3)

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
Name: review, dtype: object

## 3.2 Vectorizer

In [120]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

---

# 4. Machine Learning

## 4.1 Model Naive Bayes

In [121]:
model = MultinomialNB()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f'Model: {model} / accuracy: {round(accuracy*100,2)}%')

Model: MultinomialNB() / accuracy: 81.56%


In [122]:
models = [GaussianNB(), BernoulliNB()]

for i in models:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print(f'Model: {model} / accuracy: {round(accuracy*100,2)}%')


Model: MultinomialNB() / accuracy: 80.45%
Model: MultinomialNB() / accuracy: 77.09%


## 4.2 Model Optimization

In [123]:
hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),
    "fit_prior": [True, False]
}

random_search = RandomizedSearchCV(model, hyperparams, n_iter=50, scoring="accuracy", cv=5, random_state=42)
random_search

In [124]:
random_search.fit(X_train, y_train)
print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': False, 'alpha': np.float64(1.917638190954774)}


In [125]:
model = MultinomialNB(fit_prior=False, alpha=1.917638190954774)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f'Model: {model} / accuracy: {round(accuracy*100,2)}%')

Model: MultinomialNB(alpha=1.917638190954774, fit_prior=False) / accuracy: 82.12%


# 5. Save model

In [126]:
dump(model, open("/workspace/naive_bayes/models/naive_bayes_fit_prior_False_alpha_1-917638190954774.sav", "wb"))