# Name: Madhav Kanjilimadom

# Student ID: 202203018

# Colab Link: https://colab.research.google.com/drive/1sNtvx6L0VhFJLzv1VFKNaej6PNizXiTi?usp=sharing

# 0. Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
from google.colab import files

# 1. Loading and reading the data

In [2]:
url = 'https://raw.githubusercontent.com/Maddy-git-3018/Project_WoC_7.0_Fake_Review_Detection/refs/heads/main/fakeReviewData.csv'
df = pd.read_csv(url)
df.sample(5)

Unnamed: 0,category,rating,label,text_
8229,Electronics_5,5.0,OR,Gets very warm if you plug in multiple devices...
21191,Pet_Supplies_5,5.0,CG,I have this pod hanging on a wall with a latch...
37437,Clothing_Shoes_and_Jewelry_5,3.0,OR,Nice hat but runs very small. Wish I hadn't r...
16904,Tools_and_Home_Improvement_5,5.0,CG,This is the best shoulder strap I have ever ow...
13602,Movies_and_TV_5,5.0,OR,Apart from tearing down the government like a ...


# 2. Exploring the data

In [3]:
df.isnull().sum()

Unnamed: 0,0
category,0
rating,0
label,0
text_,0


In [4]:
df.shape

(40432, 4)

In [5]:
df.nunique()

Unnamed: 0,0
category,10
rating,5
label,2
text_,40412


# 3. Data Cleaning

## 3.1 Deleting duplicate reviews

In [6]:
df = df.drop_duplicates(subset=df.columns[3])

## 3.2 Converting all text to lowecase, removing puncuation, special characters and numbers

In [7]:
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'\d+', '', str(x)))
df.iloc[:, 3] = df.iloc[:, 3].str.lower()
df.sample(5)

Unnamed: 0,category,rating,label,text_
5596,Sports_and_Outdoors_5,5.0,OR,wish they were a bit cheaper but otherwise the...
31870,Books_5,4.0,OR,i really enjoy the multiple story lines in thi...
26269,Kindle_Store_5,4.0,CG,while its not the sort of book you want to put...
1545,Home_and_Kitchen_5,5.0,CG,again smiles on the faces of my daughter her ...
776,Home_and_Kitchen_5,5.0,OR,i gave it five stars you want me to leave a co...


## 3.3 Removing stopwords

In [8]:
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: [word for word in word_tokenize(str(x)) if word.lower() not in stop_words])
df.sample(5)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,category,rating,label,text_
5687,Sports_and_Outdoors_5,5.0,OR,"[love, case, motorola, razr, phone, sturdy, ca..."
22594,Pet_Supplies_5,5.0,OR,"[excellent, pet, tags, ive, ordered, three, do..."
39220,Clothing_Shoes_and_Jewelry_5,5.0,OR,"[terrific, slacks, phenomenal, price, tried, v..."
2420,Home_and_Kitchen_5,5.0,OR,"[love, bottles, make, coffee, morning, bring, ..."
18046,Tools_and_Home_Improvement_5,5.0,OR,"[needed, give, old, mini, maglite, new, life, ..."


## 3.4 Lemmatizing words

In [9]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(str(text))
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

df.iloc[:, 3] = df.iloc[:, 3].apply(lemmatize_text)
df.sample(5)

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,category,rating,label,text_
22387,Pet_Supplies_5,5.0,CG,"[ 'nice ' , 'collar ' , 'design ' , 'good ' , ..."
10422,Electronics_5,1.0,CG,"[ 'thing ' , 'super ' , 'low ' , 'cost ' , 'pr..."
40422,Clothing_Shoes_and_Jewelry_5,4.0,CG,"[ 'wore ' , 'pm ' , 'pm ' , 'perfect ' , 'reas..."
39233,Clothing_Shoes_and_Jewelry_5,5.0,CG,"[ 'beautiful ' , 'looks ' , 'nice ' , 'comfort..."
14159,Movies_and_TV_5,5.0,OR,"[ 'saw ' , 'ae ' , 'network ' , 'years ' , 'ag..."


# 4. Vectorization using TF-ID

In [10]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_'].fillna(''))
df['tfidf_vectors'] = list(tfidf_matrix.toarray())

In [11]:
print(tfidf_matrix.shape)
print(df.shape)

(40412, 50)
(40412, 5)


In [12]:
df.sample(5)

Unnamed: 0,category,rating,label,text_,tfidf_vectors
7763,Sports_and_Outdoors_5,5.0,OR,"[ 'never ' , 'able ' , 'skid ' , 'front ' , 't...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.291..."
35485,Toys_and_Games_5,3.0,OR,"[ 'sent ' , 'display ' , 'box ' , 'perfect ' ,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
11000,Electronics_5,5.0,CG,"[ 'love ' , 'keyboard ' , 'amazing ' , 'keyboa...","[0.0, 0.0, 0.0, 0.0, 0.4912028282123953, 0.0, ..."
4863,Sports_and_Outdoors_5,5.0,CG,"[ 'love ' , 'month ' , 'knee ' , 'pads ' , 'st...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13786,Movies_and_TV_5,4.0,OR,"[ 'hope ' , 'damion ' , 'dietz ' , 'director '...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
print(tfidf_matrix)

  (0, 24)	0.8085237742658159
  (0, 6)	0.5884635132673564
  (1, 24)	0.5224198668995236
  (1, 14)	0.4427557423851506
  (1, 17)	0.7287282314098023
  (2, 24)	1.0
  (3, 14)	0.3547190863667588
  (3, 45)	0.49941857328412614
  (3, 33)	0.5470976029730511
  (3, 32)	0.570473199411788
  (4, 29)	0.6121662638494876
  (4, 12)	0.4617677630039803
  (4, 34)	0.6418901763190175
  (6, 31)	0.7921271848030801
  (6, 20)	0.6103560625536105
  (7, 24)	0.5285708702053825
  (7, 14)	0.4479687754408458
  (7, 11)	0.7210664403509248
  (8, 14)	0.33448978048648365
  (8, 9)	0.49149903652471827
  (8, 35)	0.45604276757009576
  (8, 19)	0.4076581395949499
  (8, 40)	0.5219052780096878
  (9, 26)	1.0
  (10, 17)	0.768847917335908
  :	:
  (40410, 31)	0.09264521647017356
  (40410, 20)	0.21415716045606734
  (40410, 11)	0.27181311182781936
  (40410, 19)	0.13720339840369342
  (40410, 40)	0.4391374219763073
  (40410, 41)	0.16559966516346655
  (40410, 13)	0.09640151986551379
  (40410, 21)	0.09351502566966202
  (40410, 2)	0.170267312714

# 5. Saving and downloading pre-processed data

In [14]:
df.to_csv('preprocessed_reviews.csv', index=False)
files.download('preprocessed_reviews.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
matrix = pd.DataFrame(tfidf_matrix)
matrix.to_csv('tfidf_matrix.csv', index=False)
files.download('tfidf_matrix.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>