# Explore here

In [2]:
# Librerías

import pandas as pd

In [3]:
# Load the Play Store reviews dataset from the course repository into a DataFrame
reviews_csv_url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'

df = pd.read_csv(reviews_csv_url)
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


Nota: Debido a que nuestra única variable predictora es un texto, no es necesario realizar un EDA

In [4]:
# Drop package_name (the app name): it is not relevant to this problem.
# errors='ignore' keeps the cell idempotent, so re-running it after the column
# has already been removed does not raise a KeyError.

df = df.drop(columns='package_name', errors='ignore')
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


In [6]:
# Normalise the review text: trim surrounding whitespace, then lowercase it

stripped_reviews = df["review"].str.strip()
df["review"] = stripped_reviews.str.lower()
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


In [19]:
# Split into training and test sets (20% held out, fixed seed)

from sklearn.model_selection import train_test_split

X, y = df['review'], df['polarity']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
# Turn each review into a row of word counts

from sklearn.feature_extraction.text import CountVectorizer

# stop_words="english" applies scikit-learn's built-in English stop-word list
vec_model = CountVectorizer(stop_words="english")

# The vocabulary is fitted on the training split only and reused for the test split
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)
X_train_vec = train_counts.toarray()
X_test_vec = test_counts.toarray()

X_train_vec.shape[1]  # number of feature columns, i.e. size of the fitted vocabulary


3310

Nota: Esta es una forma de obtener características numéricas a partir del texto

In [26]:
# Create a multinomial Naive Bayes classifier and train it on the word counts

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)


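Nota: Elegimos el modelo MultinomialNB porque nuestras características son recuentos discretos de palabras, que es el tipo de dato que modela la distribución multinomial. Para llegar a la predicción, el modelo aplica el teorema de Bayes combinando las probabilidades de cada palabra condicionadas a la clase, asumiendo independencia entre palabras (no utiliza decisiones anidadas; eso corresponde a los árboles de decisión).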

In [27]:
# Predict the polarity of the test reviews with the multinomial model
y_pred = model.predict(X=X_test_vec)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [32]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# Accuracy is symmetric, so the printed value is the same as before; the order
# matters as soon as an asymmetric metric is used in the same pattern.
print(f'Multinomial: {accuracy_score(y_test, y_pred)}')

Multinomial: 0.8156424581005587


In [34]:
# Try the Gaussian and Bernoulli Naive Bayes variants and compare their test
# accuracy with the multinomial model

from sklearn.naive_bayes import GaussianNB,BernoulliNB

model2 = GaussianNB()
model3 = BernoulliNB()

model2.fit(X_train_vec,y_train)
model3.fit(X_train_vec,y_train)

# accuracy_score expects (y_true, y_pred), so the ground truth goes first.
gaussian_acc = accuracy_score(y_test, model2.predict(X_test_vec))
bernoulli_acc = accuracy_score(y_test, model3.predict(X_test_vec))
# y_pred holds the multinomial model's predictions from an earlier cell
multinomial_acc = accuracy_score(y_test, y_pred)

print(f'Gaussian: {gaussian_acc} \nBernoulli: {bernoulli_acc} \nMultinomial: {multinomial_acc}')

Gaussian: 0.8044692737430168 
Bernoulli: 0.770949720670391 
Multinomial: 0.8156424581005587


In [36]:
# Try a Random Forest on the same word-count features (fixed seed)

from sklearn.ensemble import RandomForestClassifier

model4 = RandomForestClassifier(random_state=42)

model4.fit(X_train_vec,y_train)

# accuracy_score expects (y_true, y_pred), so the ground truth goes first.
print(f'Random Forest: {accuracy_score(y_test, model4.predict(X_test_vec))}')

Random Forest: 0.7988826815642458


In [47]:
# Use a grid search to try to improve the Random Forest

from sklearn.model_selection import GridSearchCV

grid = dict(
    n_estimators=[10,20,50,100,110,80,90,150,200],
    bootstrap=[True,False],
    max_depth=[1,2,3,4,5,6,None],
)

# cv=5: five-fold cross-validation, candidates ranked by accuracy
grid_search = GridSearchCV(estimator=model4, param_grid=grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_vec,y_train)

# One-row table with the best parameter combination found
best_params_df = pd.DataFrame([grid_search.best_params_])
best_params_df

Unnamed: 0,bootstrap,max_depth,n_estimators
0,False,,100


In [49]:
# Compare the default Random Forest with the best estimator from the grid search

best_rf_model = grid_search.best_estimator_

# accuracy_score expects (y_true, y_pred), so the ground truth goes first.
default_rf_acc = accuracy_score(y_test, model4.predict(X_test_vec))
best_rf_acc = accuracy_score(y_test, best_rf_model.predict(X_test_vec))

print(f'Random Forest: {default_rf_acc}\nBest Random Forest: {best_rf_acc} ')

Random Forest: 0.7988826815642458
Best Random Forest: 0.8100558659217877 


In [50]:
# Run a grid search on our best model so far (MultinomialNB).
# The stray bare `MultinomialNB()` expression that used to open this cell was
# removed: it built an estimator and immediately discarded it.

grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
    'fit_prior': [True, False]
}

# NOTE(review): `grid`, `grid_search` and `best_params_df` overwrite the Random
# Forest search results stored under the same names by an earlier cell.
grid_search = GridSearchCV(model,grid,scoring='accuracy',cv=5)

grid_search.fit(X_train_vec,y_train)

best_params_df = pd.DataFrame([grid_search.best_params_])
best_params_df

Unnamed: 0,alpha,fit_prior
0,2.0,False
