# Naive Bayes: Sentiment Analysis

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Classification Metrics
from sklearn.metrics import accuracy_score  # importing metrics

# Naive Bayes
import sklearn.naive_bayes  # Scikit-learn Naive Bayes package

# Naive Bayes Models
from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes
from sklearn.naive_bayes import MultinomialNB  # Multinomial Naive Bayes
from sklearn.model_selection import train_test_split  # dataset splitting utility

## Step 1: Loading the dataset

In [5]:
import pandas as pd

total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

total_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


## Step 2: Study of variables and their content

### Removing spaces and converting the text to lowercase and drop nonuseful

In [6]:
def apply_preprocess(df):
    df = df.drop("package_name", axis=1) #drop 
    df["review"] = df["review"].str.strip().str.lower()

    return df

total_data = apply_preprocess(total_data)

total_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


### Divide the dataset into train and test

In [7]:
from sklearn.model_selection import train_test_split

X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

### Transform the text into a word count matrix

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Step 3: Build a naive bayes model

I select the MultinomialNB because just the target is binary while the predictors are categorical numbers.

In [9]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [11]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8156424581005587

In [12]:
# Create a Multinomial Naive Bayes classifier
classifier = MultinomialNB()

# Train the model
classifier.fit(X_train, y_train)

# Predict on the training set
y_pred_train = classifier.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Model accuracy on Train Set:", accuracy_train)

# Make predictions on the test set
y_pred_test = classifier.predict(X_test)
print('Predictions on test set:', y_pred_test)

# Calculate the model accuracy on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Model accuracy on Test Set:", accuracy_test)


Model accuracy on Train Set: 0.9606741573033708
Predictions on test set: [0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0
 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1
 0 1 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0
 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0]
Model accuracy on Test Set: 0.8156424581005587


I will test the other sklearn Naive Bayes models:

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Create a Gaussian Naive Bayes classifier
classifier = GaussianNB()

# Train the model
classifier.fit(X_train, y_train)

# Predict on the training set
y_pred_train = classifier.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Model accuracy on Train Set:", accuracy_train)

# Make predictions on the test set
y_pred_test = classifier.predict(X_test)
print('Predictions on test set:', y_pred_test)

# Calculate the model accuracy on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Model accuracy on Test Set:", accuracy_test)


Model accuracy on Train Set: 0.9859550561797753
Predictions on test set: [0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0
 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0
 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0]
Model accuracy on Test Set: 0.8044692737430168


In [14]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

# Create a Bernoulli Naive Bayes classifier
classifier = BernoulliNB()

# Train the model
classifier.fit(X_train, y_train)

# Predict on the training set
y_pred_train = classifier.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Model accuracy on Train Set:", accuracy_train)

# Make predictions on the test set
y_pred_test = classifier.predict(X_test)
print('Predictions on test set:', y_pred_test)

# Calculate the model accuracy on the test set
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Model accuracy on Test Set:", accuracy_test)


Model accuracy on Train Set: 0.9199438202247191
Predictions on test set: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0]
Model accuracy on Test Set: 0.770949720670391


In [15]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

for model_aux in [GaussianNB(), BernoulliNB()]:
    model_aux.fit(X_train, y_train)
    y_pred_aux = model_aux.predict(X_test)
    print(f"{model_aux} with accuracy: {accuracy_score(y_test, y_pred_aux)}")

GaussianNB() with accuracy: 0.8044692737430168
BernoulliNB() with accuracy: 0.770949720670391


I can confirm that the best model is the one I have chosen based on its theoretical foundation.

## Step 4: Optimize the previous model

In [16]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),
    "fit_prior": [True, False]
}

# We initialize the random search
random_search = RandomizedSearchCV(model, hyperparams, n_iter = 50, scoring = "accuracy", cv = 5, random_state = 42)
random_search

In [17]:
random_search.fit(X_train, y_train)

print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': False, 'alpha': 1.917638190954774}


After identifying the best hyperparameters, we re-trained the model

In [18]:
model = MultinomialNB(alpha = 1.917638190954774, fit_prior = False)
model.fit(X_train, y_train)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8212290502793296

## Step 5: Save the model

In [19]:
from pickle import dump

dump(model, open("/workspaces/Naive_Bayes_Al/models_fit_prior_False_alpha_42.sav", "wb"))



## Step 6: Explore other alternatives

**Modelo alternativo 1 Random Forest Classifier**

Random Forest Classifier:
Los bosques aleatorios son un conjunto de árboles de decisión entrenados con muestras bootstrap de los datos y promediados para mejorar la precisión y controlar el sobreajuste.
Argumento: Los bosques aleatorios pueden capturar relaciones no lineales y complejas entre las características y la variable objetivo, lo que puede mejorar el rendimiento sobre un modelo Naive Bayes.

In [20]:
total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

total_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [21]:
def apply_preprocess(df):
    df = df.drop("package_name", axis=1) #drop 
    df["review"] = df["review"].str.strip().str.lower()

    return df

total_data = apply_preprocess(total_data)

total_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [22]:
X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [25]:
y_pred_rf = rf_model.predict(X_test)
y_pred_rf

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0])

In [26]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8212290502793296

**optimizar**

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

# Definir los hiperparámetros a ajustar
param_dist = {
    'n_estimators': randint(10, 200),
    'max_depth': [None] + list(randint.rvs(3, 20, size=5)),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Crear una instancia del modelo RandomForestClassifier
rf_model = RandomForestClassifier()

# Realizar la búsqueda aleatoria
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)

# Ajustar el modelo a los datos de entrenamiento
random_search.fit(X_train, y_train)

# Mostrar los mejores hiperparámetros encontrados
print("Best Parameters:", random_search.best_params_)

# Obtener la precisión del mejor modelo
best_rf_model = random_search.best_estimator_
accuracy = best_rf_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)



Best Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 12, 'n_estimators': 110}
Model Accuracy: 0.8268156424581006


**Modelo alternativo 2 Regresssion Logistica**

Regresión Logística:
La regresión logística es un modelo lineal que se utiliza comúnmente para problemas de clasificación binaria, pero también se puede extender para problemas de clasificación multiclase.
Argumento: La regresión logística puede ser más flexible en términos de cómo modela la relación entre las características y las clases, especialmente si hay interacciones no lineales entre las características y la variable objetivo.

In [28]:
total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")

total_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [29]:
def apply_preprocess(df):
    df = df.drop("package_name", axis=1) #drop 
    df["review"] = df["review"].str.strip().str.lower()

    return df

total_data = apply_preprocess(total_data)

total_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [30]:
X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
from sklearn.linear_model import LogisticRegression
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

In [33]:
y_pred_log = logreg_model.predict(X_test)# cambiar nombre para evitar
y_pred_log

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0])

In [34]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8212290502793296

**Optimizar**

In [35]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

# Definir los hiperparámetros a ajustar
param_dist = {
    'C': uniform(loc=0, scale=4),  # Parámetro de regularización
    'penalty': ['l1', 'l2'],  # Tipo de penalización
    'solver': ['liblinear', 'saga']  # Algoritmo de optimización
}

# Crear una instancia del modelo LogisticRegression
logreg_model = LogisticRegression()

# Realizar la búsqueda aleatoria
random_search = RandomizedSearchCV(estimator=logreg_model, param_distributions=param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)

# Ajustar el modelo a los datos de entrenamiento
random_search.fit(X_train, y_train)

# Mostrar los mejores hiperparámetros encontrados
print("Best Parameters:", random_search.best_params_)

# Obtener la precisión del mejor modelo
best_logreg_model = random_search.best_estimator_
accuracy = best_logreg_model.score(X_test, y_test)
print("Model Accuracy:", accuracy)




Best Parameters: {'C': 2.7369321060486276, 'penalty': 'l1', 'solver': 'saga'}
Model Accuracy: 0.8044692737430168


