# Explore here

In [1]:
# Load the Play Store reviews dataset and preview it.

import pandas as pd

# Remote CSV location; written as two adjacent literals only to keep lines short
# (Python concatenates them into the same single URL string).
url = (
    "https://raw.githubusercontent.com/4GeeksAcademy/"
    "naive-bayes-project-tutorial/main/playstore_reviews.csv"
)

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows (last expression, so the notebook renders it).
df.head(n=5)


Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [2]:
# Drop the package_name column. errors='ignore' keeps this cell re-runnable:
# without it, running the cell a second time raises KeyError because the
# column has already been removed from df.
df = df.drop(columns=['package_name'], errors='ignore')

# Normalize the review text: trim leading/trailing whitespace and lowercase it.
# Whitespace inside the review is left untouched.
df['review'] = df['review'].str.strip().str.lower()

# Check the dataset after processing
df.head()


Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [3]:
from sklearn.model_selection import train_test_split

# Features are the review strings; the target is the polarity label.
X, y = df['review'], df['polarity']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-count vectorizer configured with scikit-learn's 'english' stop-word list.
vec_model = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training reviews only, then apply that same
# vocabulary to the test reviews.
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Convert both results to dense arrays.
# NOTE: X_train / X_test are overwritten here (text in, count matrices out), so
# re-run the train/test split cell before re-running this one.
X_train, X_test = train_counts.toarray(), test_counts.toarray()

# Show the shapes of the two matrices.
(X_train.shape, X_test.shape)


((712, 3310), (179, 3310))

In [5]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# Build and train each Naive Bayes variant on the same training data.
# fit() returns the fitted estimator, so construction and training are chained.
gnb = GaussianNB().fit(X_train, y_train)
mnb = MultinomialNB().fit(X_train, y_train)
bnb = BernoulliNB().fit(X_train, y_train)

# Predict the test-set labels with every model.
y_pred_gnb, y_pred_mnb, y_pred_bnb = (
    clf.predict(X_test) for clf in (gnb, mnb, bnb)
)

# Score each set of predictions against the true test labels.
accuracy_gnb, accuracy_mnb, accuracy_bnb = (
    accuracy_score(y_test, preds)
    for preds in (y_pred_gnb, y_pred_mnb, y_pred_bnb)
)

# Report the three accuracies, one per line.
print(f"GaussianNB Accuracy: {accuracy_gnb}")
print(f"MultinomialNB Accuracy: {accuracy_mnb}")
print(f"BernoulliNB Accuracy: {accuracy_bnb}")


GaussianNB Accuracy: 0.8044692737430168
MultinomialNB Accuracy: 0.8156424581005587
BernoulliNB Accuracy: 0.770949720670391


In [6]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with a fixed seed; fit() hands back the trained estimator.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the test set and measure accuracy against the true labels.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf}")


Random Forest Accuracy: 0.7988826815642458


In [7]:
import joblib

# Persist the fitted random forest; the relative filename is resolved against
# the notebook's current working directory. joblib.dump returns the list of
# written filenames, which the notebook displays as the cell output.
joblib.dump(value=rf_model, filename='random_forest_model.pkl')


['random_forest_model.pkl']

In [8]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression with a fixed seed and a 1000-iteration cap on the
# solver; fit() returns the trained estimator.
log_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict the test set and measure accuracy against the true labels.
y_pred_log = log_model.predict(X_test)
accuracy_log = accuracy_score(y_true=y_test, y_pred=y_pred_log)

print(f"Logistic Regression Accuracy: {accuracy_log}")


Logistic Regression Accuracy: 0.8324022346368715


In [9]:
from pickle import dump


In [11]:
# Pickle every trained model into ../models, one .sav file per model.
from pathlib import Path

# Create the output directory if it is missing so open() cannot fail with
# FileNotFoundError on a fresh checkout.
MODELS_DIR = Path("../models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Output filename -> fitted estimator to serialize.
models_to_save = {
    "gaussian_nb_model.sav": gnb,                # GaussianNB
    "multinomial_nb_model.sav": mnb,             # MultinomialNB
    "bernoulli_nb_model.sav": bnb,               # BernoulliNB
    "random_forest_model.sav": rf_model,         # Random Forest
    "logistic_regression_model.sav": log_model,  # Logistic Regression
}

for filename, model in models_to_save.items():
    # The with-block closes (and flushes) each file handle deterministically;
    # passing a bare open(...) to dump() leaves the handle open.
    with open(MODELS_DIR / filename, "wb") as model_file:
        dump(model, model_file)
