# Playstore reviews - Análisis de sentimientos

In [44]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from pickle import dump
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV


# 1. Read the data collection

In [35]:
# Load the Play Store reviews dataset directly from the course repository.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
total_data = pd.read_csv(DATA_URL)

# Preview the first rows to check the columns that were read.
total_data.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


# 2. Exploration and data cleaning
# 2.1 Understanding the features:

- **Package_name**. Nombre de la aplicación móvil (c)
- **Review**. Comentario sobre la aplicación móvil (c)
- **Polarity**. Variable de clase (0 o 1), siendo 0 un comentario negativo y 1, positivo (n)

c -- categórico  n -- numérico

In [4]:
# Report the number of rows and the overall (rows, columns) shape.
n_rows = len(total_data)
frame_shape = total_data.shape
print('Our dataframe contains {} rows, and has a shape of {}.'.format(n_rows, frame_shape))

Our dataframe contains 891 rows, and has a shape of (891, 3).


In [5]:
# Show each column's dtype and non-null count
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


# 2.2 Processing the text

In [6]:
# Normalize the review text: trim surrounding whitespace and lowercase it.
# `package_name` is not used as a model feature, so it is dropped directly
# instead of being cleaned first and then discarded.
# Building a new frame (no `inplace=True`) together with errors="ignore" keeps
# this cell safe to re-run after the column has already been removed.
total_data = (
    total_data
    .drop(columns="package_name", errors="ignore")
    .assign(review=lambda frame: frame["review"].str.strip().str.lower())
)
total_data.head(5)

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


# 3. Splitting the dataset into training and test sets

In [7]:
# Feature: the review text. Target: the polarity label.
X, y = total_data["review"], total_data["polarity"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.head(5)

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

# 4. Transforming the text into a word-count matrix

In [8]:
# Fit the vocabulary on the training reviews only, then reuse that same
# vocabulary to encode the test reviews.
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Replace the raw text with arrays of word counts; every Naive Bayes model
# below is fitted on these arrays.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 5. Naive Bayes Model

In [9]:
# Baseline: multinomial Naive Bayes with default settings, scored on the test set.
model = MultinomialNB().fit(X_train, y_train)
y_test_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8156424581005587

In [10]:
# Gaussian Naive Bayes on the same matrices, for comparison with the baseline.
model_Gau = GaussianNB().fit(X_train, y_train)
y_test_pred = model_Gau.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8044692737430168

In [11]:
# Bernoulli Naive Bayes on the same matrices, for comparison with the baseline.
model_Ber = BernoulliNB().fit(X_train, y_train)
y_test_pred = model_Ber.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.770949720670391

# 6. Optimization of the Model

In [41]:
# Search space for MultinomialNB: the `alpha` and `fit_prior` constructor options.
hyperparams = {
    "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
    "fit_prior": [True, False],
}

# The space is a small discrete grid (6 x 2 = 12 combinations). Requesting more
# iterations than that makes scikit-learn emit a UserWarning and fall back to
# evaluating every combination, so ask for exactly the grid size instead.
n_candidates = len(hyperparams["alpha"]) * len(hyperparams["fit_prior"])

random_search = RandomizedSearchCV(
    MultinomialNB(),
    hyperparams,
    n_iter=n_candidates,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

# The search fits clones of the estimator it is given, so the object passed in
# is never fitted itself. Keep the refitted winner in `model` so later cells
# that use `model` get the tuned, fitted estimator.
model = random_search.best_estimator_

# `best_score_` is the mean cross-validated accuracy on the training data,
# not the accuracy on the held-out test set.
print("The best hyperparameters are:", random_search.best_params_)
print("The best accuracy is:", random_search.best_score_)



The best hyperparameters are: {'fit_prior': False, 'alpha': 1}
The best accuracy is: 0.8229460093896714


In [46]:
# Persist the tuned estimator found by the search. `random_search` clones the
# estimator it is given, so `best_estimator_` (refitted on the full training
# data by default) is the object that matches the hyperparameters in the
# filename. The context manager guarantees the file is flushed and closed.
# NOTE(review): the fitted `vec_model` is not saved here; it is needed to
# encode new text for this model. Confirm whether it is stored elsewhere.
with open("../models/naive_bayes_alpha_1_fit_prior_False_42.sav", "wb") as model_file:
    dump(random_search.best_estimator_, model_file)