# Name: Madhav Kanjilimadom

# Student ID: 202203018

# Colab Link: https://colab.research.google.com/drive/1sNtvx6L0VhFJLzv1VFKNaej6PNizXiTi?usp=sharing

# 0. Importing necessary libraries

In [52]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
from google.colab import files

# 1. Loading and reading the data

In [53]:
url = 'https://raw.githubusercontent.com/Maddy-git-3018/Project_WoC_7.0_Fake_Review_Detection/refs/heads/main/fakeReviewData.csv'
df = pd.read_csv(url)
df.sample(5)

Unnamed: 0,category,rating,label,text_
38402,Clothing_Shoes_and_Jewelry_5,4.0,CG,"Love them, fit well, my daughter loves them.Ve..."
28764,Books_5,4.0,CG,Basic book about biology. Easy to read and und...
4591,Sports_and_Outdoors_5,5.0,CG,Great knife. I'm not used to carrying knives w...
26602,Kindle_Store_5,5.0,CG,"I wasn't sure if I would like the story, but I..."
2720,Home_and_Kitchen_5,5.0,CG,Only thing I don't like is the suction on the ...


# 2. Exploring the data

In [54]:
df.isnull().sum()

Unnamed: 0,0
category,0
rating,0
label,0
text_,0


In [55]:
df.shape

(40432, 4)

In [56]:
df.nunique()

Unnamed: 0,0
category,10
rating,5
label,2
text_,40412


# 3. Data Cleaning

## 3.1 Deleting duplicate reviews

In [57]:
df = df.drop_duplicates(subset=df.columns[3])

## 3.2 Removing puncuation, special characters and numbers

In [58]:
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'\d+', '', str(x)))
df.sample(5)

Unnamed: 0,category,rating,label,text_
4347,Sports_and_Outdoors_5,5.0,CG,Makes an amazing difference on my back and nec...
19924,Pet_Supplies_5,5.0,OR,I am now a onelitter box home All the cats use...
1276,Home_and_Kitchen_5,5.0,CG,Thick aluminum bowl with welldefined suction c...
21456,Pet_Supplies_5,5.0,OR,Excellent product It wraps around the dog prot...
17337,Tools_and_Home_Improvement_5,5.0,OR,this was a great product at a really good pric...


## 3.3 Removing stopwords

In [59]:
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: [word for word in word_tokenize(str(x)) if word.lower() not in stop_words])
df.sample(5)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,category,rating,label,text_
7403,Sports_and_Outdoors_5,5.0,CG,"[purchased, wear, Taurus, fit, perfect, comfor..."
31712,Books_5,5.0,CG,"[Full, disclosure, little, disappointed, endin..."
24279,Kindle_Store_5,5.0,CG,"[loved, want, characters, well, developed, kep..."
17771,Tools_and_Home_Improvement_5,5.0,CG,"[Perfect, cabinet, saw, problem, comes, small,..."
9228,Electronics_5,5.0,CG,"[Took, min, find, replacement, Ive, mine, week..."


## 3.4 Lemmatizing words

In [60]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(str(text))
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

df.iloc[:, 3] = df.iloc[:, 3].apply(lemmatize_text)
df.sample(5)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,category,rating,label,text_
21254,Pet_Supplies_5,5.0,CG,"[ 'bought ' , 'dog ' , 'loves ' , 'also ' , 'l..."
23833,Kindle_Store_5,5.0,CG,"[ 'Iv ' , 'read ' , 'book ' , 'characters ' , ..."
39697,Clothing_Shoes_and_Jewelry_5,1.0,OR,"[ 'low ' , 'quality ' , 'resemble ' , 'picture..."
36253,Toys_and_Games_5,4.0,CG,"[ 'purchased ' , 'toy ' , 'hoping ' , 'would '..."
2801,Home_and_Kitchen_5,2.0,OR,"[ 'title ' , 'states ' , 'job ' , 'decently ' ..."


# 4. Vectorization using TF-ID

In [61]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df.iloc[:, 3])

In [62]:
print(tfidf_matrix.shape)
print(df.shape)

(40412, 48312)
(40412, 4)


In [63]:
df.sample(5)

Unnamed: 0,category,rating,label,text_
24769,Kindle_Store_5,5.0,CG,"[ 'suspense ' , 'awesomecharacters ' , 'made '..."
16476,Tools_and_Home_Improvement_5,5.0,OR,"[ 'Thank ' , 'listing ' , 'buy ' , 'one ' , 'o..."
17991,Tools_and_Home_Improvement_5,5.0,CG,"[ 'Finally ' , 'Cree ' , 'makes ' , ' W ' , 'b..."
12295,Movies_and_TV_5,5.0,CG,"[ 'really ' , 'enjoyed ' , 'series ' , 'love '..."
6782,Sports_and_Outdoors_5,5.0,CG,"[ 'Best ' , 'sunglasses ' , 'Ive ' , 'ever ' ,..."


In [64]:
print(tfidf_matrix)

  (0, 24803)	0.4477909325702159
  (0, 46692)	0.22705209727075773
  (0, 25159)	0.2980193494348967
  (0, 40969)	0.3613839022446882
  (0, 8024)	0.32591326783039304
  (0, 21991)	0.5571060609520052
  (0, 32565)	0.33455963159996505
  (1, 24803)	0.203577355351295
  (1, 17783)	0.172533720121145
  (1, 45150)	0.5456949149082749
  (1, 29697)	0.4068277496130651
  (1, 22002)	0.28397190750168494
  (1, 26647)	0.43864426827126274
  (1, 9126)	0.3139572052159924
  (1, 47970)	0.3063962348870122
  (2, 24803)	0.16111205999842637
  (2, 31421)	0.7841161036855888
  (2, 36581)	0.4243881592881675
  (2, 2771)	0.24245245090229686
  (2, 24662)	0.23961250318417565
  (2, 14972)	0.250803085485961
  (3, 17783)	0.2098542228088051
  (3, 26847)	0.5917412421543853
  (3, 21124)	0.547579856381076
  (3, 45259)	0.2954594229092602
  :	:
  (40411, 41900)	0.08568946024408444
  (40411, 6389)	0.07743804598118159
  (40411, 15920)	0.17236127330623743
  (40411, 6660)	0.07023409542954767
  (40411, 12306)	0.05998428294605356
  (40411, 

# 5. Saving and downloading pre-processed data

In [65]:
df.to_csv('preprocessed_reviews.csv', index=False)
files.download('preprocessed_reviews.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [66]:
matrix = pd.DataFrame(tfidf_matrix)
df.to_csv('tfidf_matrix.csv', index=False)
files.download('tfidf_matrix.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>