# Name: Madhav Kanjilimadom

# Student ID: 202203018

# Colab Link: https://colab.research.google.com/drive/1sNtvx6L0VhFJLzv1VFKNaej6PNizXiTi?usp=sharing

# 0. Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
from google.colab import files

# 1. Loading and reading the data

In [2]:
url = 'https://raw.githubusercontent.com/Maddy-git-3018/Project_WoC_7.0_Fake_Review_Detection/refs/heads/main/fakeReviewData.csv'
df = pd.read_csv(url)
df.sample(5)

Unnamed: 0,category,rating,label,text_
13466,Movies_and_TV_5,5.0,OR,Happy I found a trilogy of TDK on DVD for such...
27449,Kindle_Store_5,5.0,CG,"This book has a little more of the ""real"" side..."
35733,Toys_and_Games_5,5.0,CG,This is the set that I bought for my son. He l...
7486,Sports_and_Outdoors_5,5.0,CG,This is a quality survival/combat knife. The o...
13657,Movies_and_TV_5,5.0,OR,"The video of Murder She Wrote, the complete 11..."


# 2. Exploring the data

In [3]:
df.isnull().sum()

Unnamed: 0,0
category,0
rating,0
label,0
text_,0


In [4]:
df.shape

(40432, 4)

In [5]:
df.nunique()

Unnamed: 0,0
category,10
rating,5
label,2
text_,40412


# 3. Data Cleaning

## 3.1 Deleting duplicate reviews

In [6]:
df = df.drop_duplicates(subset=df.columns[3])

## 3.2 Converting all text to lowecase, removing puncuation, special characters and numbers

In [7]:
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'\d+', '', str(x)))
df.iloc[:, 3] = df.iloc[:, 3].str.lower()
df.sample(5)

Unnamed: 0,category,rating,label,text_
23942,Kindle_Store_5,5.0,CG,really enjoyed read and catching up with the c...
8132,Electronics_5,2.0,OR,its not so friendly but you get what you pay for
30092,Books_5,5.0,CG,i am a huge fan of the original series and thi...
27053,Kindle_Store_5,5.0,OR,thicker than water wow not just a saying bu...
31485,Books_5,5.0,CG,carl sagan is an astounding writer the story i...


## 3.3 Removing stopwords

In [8]:
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: [word for word in word_tokenize(str(x)) if word.lower() not in stop_words])
df.sample(5)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,category,rating,label,text_
2550,Home_and_Kitchen_5,4.0,CG,"[bought, four, daughter, loves, also, loves, s..."
5812,Sports_and_Outdoors_5,5.0,CG,"[love, little, organizer, ability, hold, two, ..."
22603,Pet_Supplies_5,5.0,CG,"[best, problem, kind, hard, put, bottom, top, ..."
26257,Kindle_Store_5,4.0,OR,"[absolutely, loved, book, lucky, might, needed..."
38782,Clothing_Shoes_and_Jewelry_5,5.0,OR,"[love, panties, perfect, fit, bought, beforewo..."


## 3.4 Lemmatizing words

In [9]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(str(text))
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

df.iloc[:, 3] = df.iloc[:, 3].apply(lemmatize_text)
df.sample(5)

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,category,rating,label,text_
27627,Kindle_Store_5,5.0,OR,"[ 'love ' , 'pp ' , 'continuance ' , 'stories ..."
5842,Sports_and_Outdoors_5,4.0,OR,"[ 'job ' , 'price ' , 'right ' , 'sure ' , 'hi..."
11335,Electronics_5,4.0,OR,"[ 'nice ' , 'micro ' , 'case ' , 'could ' , 'u..."
20670,Pet_Supplies_5,5.0,OR,"[ 'use ' , 'great ' , 'pyrenees ' , 'works ' ,..."
12383,Movies_and_TV_5,5.0,OR,"[ 'ive ' , 'watch ' , 'series ' , 'dont ' , 'k..."


# 4. Vectorization using TF-ID

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)

In [11]:
print(tfidf_matrix.shape)
print(df.shape)

(40412, 100)
(40412, 4)


# 5. Saving and downloading pre-processed data

In [39]:
df.to_csv('preprocessed_reviews.csv', index=False)
files.download('preprocessed_reviews.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [40]:
matrix = pd.DataFrame(tfidf_matrix)
matrix.to_csv('tfidf_matrix.csv', index=False)
files.download('tfidf_matrix.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#6. Data Preparation for Model Fitting

In [12]:
df1 = df.drop(columns=['text_'])

In [13]:
df1['label_encoded'] = df1['label'].map({'CG': 0, 'OR': 1})
df1 = df1.drop(columns=['label'])

In [14]:
df1.sample(10)

Unnamed: 0,category,rating,label_encoded
26361,Kindle_Store_5,2.0,1
10380,Electronics_5,5.0,0
24808,Kindle_Store_5,5.0,1
37144,Clothing_Shoes_and_Jewelry_5,2.0,0
6775,Sports_and_Outdoors_5,5.0,1
21263,Pet_Supplies_5,5.0,1
9187,Electronics_5,5.0,0
8234,Electronics_5,2.0,0
37328,Clothing_Shoes_and_Jewelry_5,5.0,1
12457,Movies_and_TV_5,5.0,0


In [15]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, drop=None)  # Use sparse_output=False for a dense array
category_encoded = encoder.fit_transform(df1[['category']])
encoded_column_names = encoder.get_feature_names_out(['category'])
category_encoded_df = pd.DataFrame(category_encoded, columns=encoded_column_names)
df1 = pd.concat([df1.reset_index(drop=True), category_encoded_df.reset_index(drop=True)], axis=1)

In [16]:
df1 = df1.drop(columns=['category'])
df1.sample(10)

Unnamed: 0,rating,label_encoded,category_Books_5,category_Clothing_Shoes_and_Jewelry_5,category_Electronics_5,category_Home_and_Kitchen_5,category_Kindle_Store_5,category_Movies_and_TV_5,category_Pet_Supplies_5,category_Sports_and_Outdoors_5,category_Tools_and_Home_Improvement_5,category_Toys_and_Games_5
28432,5.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29765,4.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16182,5.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
28672,5.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14519,1.0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12148,5.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
24439,1.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
39413,3.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2345,5.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
26812,2.0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [17]:
combined_df = pd.concat([df1, tfidf_df], axis=1)
print(f"Combined DataFrame shape: {combined_df.shape}")

Combined DataFrame shape: (40412, 112)


In [18]:
combined_df.sample(10)

Unnamed: 0,rating,label_encoded,category_Books_5,category_Clothing_Shoes_and_Jewelry_5,category_Electronics_5,category_Home_and_Kitchen_5,category_Kindle_Store_5,category_Movies_and_TV_5,category_Pet_Supplies_5,category_Sports_and_Outdoors_5,...,using,want,way,wear,wish,work,works,written,year,years
7511,4.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.408722,0.0,0.225886,0.0
13372,5.0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
286,4.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34213,5.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40090,5.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35644,5.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30107,2.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24690,4.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18453,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.31457,0.0,0.0,0.0
7240,5.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40412 entries, 0 to 40411
Columns: 112 entries, rating to years
dtypes: float64(111), int64(1)
memory usage: 34.5 MB


#7. Data splitting for Model Selection

In [20]:
from sklearn.model_selection import train_test_split

X = combined_df.drop(columns=['label_encoded'])
y = combined_df['label_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (32329, 111)
Test set shape: (8083, 111)


##7.1 Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print("Model Performance Metrics:")
print(f"Accuracy:  {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")

Model Performance Metrics:
Accuracy:  0.74
Precision: 0.73
Recall:    0.75
F1 Score:  0.74


###7.1.1 Saving the Random Forest Model

In [37]:
from joblib import dump, load
dump(rf_model, 'rf_model.joblib')

['rf_model.joblib']

##7.2 Support Vector Machines (SVM)

In [42]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
from sklearn.svm import SVC

svm_model = SVC(random_state=42)
svm_model.fit(X_train_scaled, y_train)
y_pred = svm_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print("Model Performance Metrics:")
print(f"Accuracy:  {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")

Model Performance Metrics:
Accuracy:  0.76
Precision: 0.74
Recall:    0.80
F1 Score:  0.77


###7.2.1 Saving the SVM Model

In [34]:
dump(svm_model, 'svm_model.joblib')

['svm_model.joblib']

##7.3 Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

logreg_model = LogisticRegression(random_state=42, max_iter=1000)
logreg_model.fit(X_train_scaled, y_train)
y_pred = logreg_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print("Model Performance Metrics:")
print(f"Accuracy:  {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")

Model Performance Metrics:
Accuracy:  0.70
Precision: 0.70
Recall:    0.70
F1 Score:  0.70


###7.3.1 Saving the Logistic Regression Model

In [35]:
dump(logreg_model, 'logreg_model.joblib')

['logreg_model.joblib']

##7.4 Downloading all the saved models

In [38]:
from google.colab import files

files.download('rf_model.joblib')
files.download('svm_model.joblib')
files.download('logreg_model.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>