# Machine Learning: Naive Bayes

In [None]:
import numpy as np # Import NumPy library to include multi-dimensional arrays and matrices
import pandas as pd # Import Pandas library to allow for data manipulation and analysis

# Library to split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split
# CountVectorizer converts a collection of text documents into a matrix of token (word) counts
from sklearn.feature_extraction.text import CountVectorizer
# MultinomialNB (from sklearn.naive_bayes) is the multinomial Naive Bayes classifier, suited to count features
from sklearn.naive_bayes import MultinomialNB
# The classification report, confusion matrix, accuracy score are imported from sklearn.metrics to print performance
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Path of the CSV file holding the reviews and their ratings
fname = '/content/dataset_elec_4000.csv'
# read_csv returns a DataFrame directly, so no further conversion is needed
dataset = pd.read_csv(fname, delimiter=',')
# Show the (truncated) dataset followed by a blank separator
print(dataset)
print('\n')

# Features are the review texts ("X"); labels are the ratings ("y")
X = dataset.review
y = dataset.rating

# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123)

# Bag-of-words encoding: the vocabulary is learned from the training reviews only ...
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
# ... and that same vocabulary is reused to encode the test reviews
X_test = vec.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the training counts
nbc = MultinomialNB()
nbc.fit(X_train, y_train)

# Predict once on the held-out reviews and derive every metric from that
# single prediction (nbc.score would run a second, identical prediction pass)
prediction = nbc.predict(X_test)
accuracy = accuracy_score(y_test, prediction)

# Report: algorithm name, confusion matrix, classification report
# (precision, recall, f1-score) and the accuracy as a percentage
print('Algorithm Name: Naive Bayes \n')
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))
print('Accuracy Score: ', accuracy * 100, '%', sep='')

                                                 review  rating
0     This case is just beautiful. I can't think of ...     1.0
1     My husband purchased these because he likes mo...     1.0
2     Very disappointed.  This item worked a time or...     0.0
3     ...first of all, this Lightning cable does exa...     1.0
4     Very bad, slow, flakey software. Very slow. I ...     0.0
...                                                 ...     ...
3995  I had this thing connected to my radio for qui...     0.0
3996  This unique internet radio was easy to set up,...     1.0
3997  we're pretty confident this is a bootlegged de...     0.0
3998  Wish I could say that this keyboard works for ...     0.0
3999  The unit simply will not allow us to install i...     0.0

[4000 rows x 2 columns]


Algorithm Name: Naive Bayes 

[[447  63]
 [ 70 420]]
              precision    recall  f1-score   support

         0.0       0.86      0.88      0.87       510
         1.0       0.87      0.86      0.86

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Path of the CSV file holding the reviews and their ratings
fname = '/content/dataset_elec_4000.csv'
# read_csv returns a DataFrame directly, so no further conversion is needed
dataset = pd.read_csv(fname, delimiter=',')
print(dataset)

                                                 review  rating
0     This case is just beautiful. I can't think of ...     1.0
1     My husband purchased these because he likes mo...     1.0
2     Very disappointed.  This item worked a time or...     0.0
3     ...first of all, this Lightning cable does exa...     1.0
4     Very bad, slow, flakey software. Very slow. I ...     0.0
...                                                 ...     ...
3995  I had this thing connected to my radio for qui...     0.0
3996  This unique internet radio was easy to set up,...     1.0
3997  we're pretty confident this is a bootlegged de...     0.0
3998  Wish I could say that this keyboard works for ...     0.0
3999  The unit simply will not allow us to install i...     0.0

[4000 rows x 2 columns]


In [None]:
# Feature column: the review text
X = dataset['review']
# Target column: the rating attached to each review
y = dataset['rating']

In [None]:
from sklearn.model_selection import train_test_split

# 25% of the rows are held out for testing; the fixed random_state makes
# the shuffle repeatable from run to run
split_options = dict(test_size=0.25, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Learn the vocabulary from the training reviews and encode them as counts
train_counts = vectorizer.fit_transform(X_train)
# Encode the test reviews with the vocabulary learned above (no refit)
test_counts = vectorizer.transform(X_test)
# Downstream cells read the encoded matrices under the original names
X_train, X_test = train_counts, test_counts

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create and train the classifier here so this cell does not depend on an
# `nbc` object left behind by some other cell of the notebook
nbc = MultinomialNB()
nbc.fit(X_train, y_train)

# Predicted labels for the held-out reviews
prediction = nbc.predict(X_test)
# Mean accuracy of the classifier on the held-out reviews
accuracy = nbc.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print('Algorithm Name: Naive Bayes \n')
# Confusion matrix first, then the per-class precision / recall / f1 table
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, prediction))
# Accuracy expressed as a percentage of correctly predicted test reviews
accuracy_pct = accuracy_score(y_test, prediction) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')

Algorithm Name: Naive Bayes 

[[441  68]
 [ 78 413]]
              precision    recall  f1-score   support

         0.0       0.85      0.87      0.86       509
         1.0       0.86      0.84      0.85       491

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000

Accuracy Score: 85.39999999999999%
