In [1]:
# imports necessary libraries/packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# path of the CSV file that holds the review data
FILENAME = 'fake reviews dataset.csv'

# loads the CSV into a pandas DataFrame and shows it as the cell output
df = pd.read_csv(filepath_or_buffer=FILENAME)
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


In [3]:
# creates a binary target column: 1 if the review is real (label 'OR'),
# 0 if the review is fake (label 'CG'). Any other or missing label also maps to 0.
# The comparison is vectorised over the whole column instead of looping in Python;
# astype('int64') keeps the same integer dtype a list of Python ints would produce.
df['labels'] = (df['label'] == 'OR').astype('int64')
df

Unnamed: 0,category,rating,label,text_,labels
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",0
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",0
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,0
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",0
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,0
...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...,1
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...,0
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ...",1
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...,0


In [4]:
# holds out 20% of the reviews for testing; the fixed random_state makes the
# shuffle (and therefore the split) the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    df['text_'],
    df['labels'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# turns the review text into token-count features; the vocabulary is fitted on
# the training reviews only and then reused unchanged for the test reviews
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(raw_documents=X_train)
test_features = vectorizer.transform(raw_documents=X_test)

# shows the sparse-matrix summary of the encoded test set as the cell output
test_features

<8087x36975 sparse matrix of type '<class 'numpy.int64'>'
	with 331065 stored elements in Compressed Sparse Row format>

In [6]:
# fits a multinomial Naive Bayes model on the training token counts and labels
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X=train_features, y=y_train)

In [7]:
# makes predictions on the test set
predictions = naive_bayes_classifier.predict(test_features)

# displays the full content of the predictions (no '...' truncation).
# np.printoptions applies the unlimited threshold only inside the with-block and
# restores the previous print settings afterwards, so later cells are not
# affected by a permanently changed global NumPy print configuration.
with np.printoptions(threshold=np.inf):
    print(predictions)

[0 0 1 0 1 1 0 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0
 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 0 0 1 0 0 1 1 0 1
 1 0 1 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0
 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0
 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0
 0 1 0 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1
 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0
 1 1 0 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1
 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0
 1 1 0 1 0 1 0 1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 1 1 1 1 0 1 0 0 1 0 1 0 0 0
 0 0 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1
 1 1 1 0 0 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1
 0 0 0 0 0 1 0 0 0 1 0 1 

In [8]:
# fraction of test reviews whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print('accuracy:', accuracy)

# counts of (true label, predicted label) pairs; each row is one true class
confusion_mat = confusion_matrix(y_true=y_test, y_pred=predictions)
print('confusion_matrix:', confusion_mat)

# per-class precision, recall, f1-score and support as a formatted text table
classification_rep = classification_report(y_true=y_test, y_pred=predictions)
print('classification_report:', classification_rep)

accuracy: 0.8587857054531964
confusion_matrix: [[3651  365]
 [ 777 3294]]
classification_report:               precision    recall  f1-score   support

           0       0.82      0.91      0.86      4016
           1       0.90      0.81      0.85      4071

    accuracy                           0.86      8087
   macro avg       0.86      0.86      0.86      8087
weighted avg       0.86      0.86      0.86      8087

