# Name: Madhav Kanjilimadom

# Student ID: 202203018

# Colab Link: https://colab.research.google.com/drive/1sNtvx6L0VhFJLzv1VFKNaej6PNizXiTi?usp=sharing

# 0. Importing necessary libraries

In [48]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn
from google.colab import files

# 1. Loading and reading the data

In [35]:
url = 'https://raw.githubusercontent.com/Maddy-git-3018/Project_WoC_7.0_Fake_Review_Detection/refs/heads/main/fakeReviewData.csv'
df = pd.read_csv(url)
df.sample(5)

Unnamed: 0,category,rating,label,text_
32419,Books_5,5.0,CG,It is two days before Christmas and the holida...
18080,Tools_and_Home_Improvement_5,1.0,CG,I've tried a set of these on the market and th...
28762,Books_5,3.0,CG,I would have like to know more about the relat...
28665,Books_5,5.0,OR,Another great read by Ms. Steel. From start t...
19328,Tools_and_Home_Improvement_5,3.0,CG,"I had a partial success with this, as it worke..."


# 2. Exploring the data

In [36]:
df.isnull().sum()

Unnamed: 0,0
category,0
rating,0
label,0
text_,0


In [37]:
df.shape

(40432, 4)

In [38]:
df.nunique()

Unnamed: 0,0
category,10
rating,5
label,2
text_,40412


# 3. Data Cleaning

## 3.1 Deleting duplicate reviews

In [39]:
df = df.drop_duplicates(subset=df.columns[3])

## 3.2 Removing puncuation, special characters and numbers

In [40]:
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: re.sub(r'\d+', '', str(x)))
df.sample(5)

Unnamed: 0,category,rating,label,text_
24941,Kindle_Store_5,1.0,OR,Typically this author has a better end product...
30842,Books_5,5.0,CG,This was a good saga a good story a good story...
6236,Sports_and_Outdoors_5,3.0,CG,These are quite a bit more expensive than the ...
23148,Pet_Supplies_5,4.0,CG,div idvideoblockRFJIURUJI classasection aspaci...
31700,Books_5,5.0,CG,If you are interested in reading an academic w...


## 3.3 Removing stopwords

In [41]:
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: [word for word in word_tokenize(str(x)) if word.lower() not in stop_words])
df.sample(5)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,category,rating,label,text_
5008,Sports_and_Outdoors_5,1.0,OR,"[Ordered, x, thats, shirts, says, bigger, larg..."
8905,Electronics_5,5.0,CG,"[stated, Bluetooth, sync, issue, Pioneer, Pion..."
20664,Pet_Supplies_5,5.0,CG,"[best, tool, ever, used, continue, use, long]"
24134,Kindle_Store_5,5.0,OR,"[Another, great, story, master, story, teller,..."
21398,Pet_Supplies_5,5.0,CG,"[got, dog, good, size, dog, loves, best, small..."


## 3.4 Tokenization of words

In [42]:
df.iloc[:, 3] = df.iloc[:, 3].apply(lambda x: word_tokenize(str(x)))
df.sample(5)

Unnamed: 0,category,rating,label,text_
11276,Electronics_5,5.0,OR,"[[, 'dont, ', ,, 'much, ', ,, 'care, ', ,, 're..."
36608,Clothing_Shoes_and_Jewelry_5,3.0,CG,"[[, 'listened, ', ,, 'everyone, ', ,, 'else, '..."
3983,Home_and_Kitchen_5,5.0,OR,"[[, 'try, ', ,, 'eat, ', ,, 'lot, ', ,, 'less,..."
19884,Pet_Supplies_5,4.0,CG,"[[, 'Adaptable, ', ,, 'comfortable, ', ,, 'eas..."
23253,Pet_Supplies_5,4.0,CG,"[[, 'cats, ', ,, 'absolutely, ', ,, 'love, ', ..."


## 3.5 Lemmatizing words

In [43]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(str(text))
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

df.iloc[:, 3] = df.iloc[:, 3].apply(lemmatize_text)
df.sample(5)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,category,rating,label,text_
14229,Movies_and_TV_5,5.0,OR,"[ ' [ ' , `` 'item '' , `` ' '' , ' , ' , `` '..."
30259,Books_5,4.0,CG,"[ ' [ ' , `` 'good '' , `` ' '' , ' , ' , `` '..."
28371,Kindle_Store_5,4.0,OR,"[ ' [ ' , `` 'Wedding '' , `` ' '' , ' , ' , `..."
23374,Pet_Supplies_5,5.0,CG,"[ ' [ ' , `` 'collar '' , `` ' '' , ' , ' , ``..."
15956,Tools_and_Home_Improvement_5,5.0,CG,"[ ' [ ' , `` 'Quite '' , `` ' '' , ' , ' , `` ..."


# 4. Vectorization using TF-ID

In [44]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df.iloc[:, 3])

In [45]:
print(tfidf_matrix.shape)
print(df.shape)

(40412, 48312)
(40412, 4)


In [46]:
df.sample(5)

Unnamed: 0,category,rating,label,text_
5050,Sports_and_Outdoors_5,5.0,CG,"[ ' [ ' , `` 'truly '' , `` ' '' , ' , ' , `` ..."
25610,Kindle_Store_5,5.0,CG,"[ ' [ ' , `` 'loved '' , `` ' '' , ' , ' , `` ..."
39137,Clothing_Shoes_and_Jewelry_5,3.0,CG,"[ ' [ ' , `` 'hook '' , `` ' '' , ' , ' , `` '..."
29120,Books_5,5.0,OR,"[ ' [ ' , `` 'Loved '' , `` ' '' , ' , ' , `` ..."
34758,Toys_and_Games_5,5.0,CG,"[ ' [ ' , `` 'year '' , `` ' '' , ' , ' , `` '..."


In [47]:
print(tfidf_matrix)

  (0, 24803)	0.4477909325702159
  (0, 46692)	0.22705209727075773
  (0, 25159)	0.2980193494348967
  (0, 40969)	0.3613839022446882
  (0, 8024)	0.32591326783039304
  (0, 21991)	0.5571060609520052
  (0, 32565)	0.33455963159996505
  (1, 24803)	0.203577355351295
  (1, 17783)	0.172533720121145
  (1, 45150)	0.5456949149082749
  (1, 29697)	0.4068277496130651
  (1, 22002)	0.28397190750168494
  (1, 26647)	0.43864426827126274
  (1, 9126)	0.3139572052159924
  (1, 47970)	0.3063962348870122
  (2, 24803)	0.16111205999842637
  (2, 31421)	0.7841161036855888
  (2, 36581)	0.4243881592881675
  (2, 2771)	0.24245245090229686
  (2, 24662)	0.23961250318417565
  (2, 14972)	0.250803085485961
  (3, 17783)	0.2098542228088051
  (3, 26847)	0.5917412421543853
  (3, 21124)	0.547579856381076
  (3, 45259)	0.2954594229092602
  :	:
  (40411, 41900)	0.08568946024408444
  (40411, 6389)	0.07743804598118159
  (40411, 15920)	0.17236127330623743
  (40411, 6660)	0.07023409542954767
  (40411, 12306)	0.05998428294605356
  (40411, 

# 5. Saving and downloading pre-processed data

In [50]:
df.to_csv('preprocessed_reviews.csv', index=False)
files.download('preprocessed_reviews.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [51]:
matrix = pd.DataFrame(tfidf_matrix)
df.to_csv('tfidf_matrix.csv', index=False)
files.download('tfidf_matrix.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>