In [159]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources this notebook relies on. Each call reports
# "already up-to-date" and does nothing else when the package is present locally.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' package
# instead of the pickled 'punkt' one, so word_tokenize needs it there.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<h1>Load "fakeReviewData.csv" file</h1>

In [160]:
# Read the raw review dataset into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='/content/fakeReviewData.csv')

In [161]:
# Display the loaded DataFrame (the rendered output elides the middle rows).
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


<h1>Check missing values</h1>

In [162]:
# Count the missing entries in each column.
df.isna().sum()

Unnamed: 0,0
category,0
rating,0
label,0
text_,0


There are no missing values.

<h1>Check duplicate values</h1>

In [163]:
# Number of rows that repeat an earlier row across every column
# (the first occurrence of each group is not counted).
df.duplicated(subset=None, keep='first').sum()

12

In [164]:
# Show the rows flagged as repeats of an earlier row.
df.loc[df.duplicated()]

Unnamed: 0,category,rating,label,text_
6025,Sports_and_Outdoors_5,5.0,CG,"This is a really good starter kit, with lots o..."
6708,Sports_and_Outdoors_5,5.0,CG,"Great, no complaints. Comfortable, phone fits ..."
12548,Movies_and_TV_5,5.0,CG,One of the best movies of the year. Not for e...
19802,Pet_Supplies_5,5.0,CG,My dog loves these and it has kept her occupie...
20242,Pet_Supplies_5,5.0,CG,My dog loves it and it has kept her occupied f...
22305,Pet_Supplies_5,5.0,OR,Got these to give to my 8 mth old chihuahua wh...
26444,Kindle_Store_5,5.0,CG,I received this story as an ARC in exchange fo...
27233,Kindle_Store_5,5.0,CG,This is the first book in a series by the auth...
29203,Books_5,5.0,CG,I really enjoyed this book. The characters wer...
33602,Toys_and_Games_5,5.0,CG,I got this for my son for Christmas. He loved...


There are 12 rows flagged as duplicates, but they are not fully duplicated,
so we cannot remove them.

<h1>Normalization Part</h1>

In [165]:
# Normalise the review text in a single pass: lowercase first, then drop every
# character that is not an ASCII lowercase letter or whitespace (digits,
# punctuation and apostrophes are removed, so "wasn't" becomes "wasnt").
df['text_'] = (
    df['text_']
    .str.lower()
    .str.replace(r'[^a-z\s]', '', regex=True)
)

In [166]:
# Display the frame again to inspect `text_` after normalisation.
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,i had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,i wasnt sure exactly what it would be it is a ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,you can wear the hood by itself wear it with t...
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,i liked nothing about this dress the only reas...


<h1>Break down sentences into words</h1>

In [167]:
# Split each normalised review into a list of word tokens, stored in a new column.
df['tokens'] = df['text_'].map(word_tokenize)

In [168]:
# Display the frame, now including the `tokens` column.
df

Unnamed: 0,category,rating,label,text_,tokens
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"[love, this, well, made, sturdy, and, very, co..."
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"[love, it, a, great, upgrade, from, the, origi..."
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"[this, pillow, saved, my, back, i, love, the, ..."
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"[missing, information, on, how, to, use, it, b..."
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"[very, nice, set, good, quality, we, have, had..."
...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,i had read some reviews saying that this bra r...,"[i, had, read, some, reviews, saying, that, th..."
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,i wasnt sure exactly what it would be it is a ...,"[i, wasnt, sure, exactly, what, it, would, be,..."
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,you can wear the hood by itself wear it with t...,"[you, can, wear, the, hood, by, itself, wear, ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,i liked nothing about this dress the only reas...,"[i, liked, nothing, about, this, dress, the, o..."


<h1>Stopword Removal</h1>

In [169]:
# English stopword list as a set, so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stopwords, preserving their order.
df['tokens'] = df['tokens'].apply(
    lambda token_list: list(filter(lambda tok: tok not in stop_words, token_list))
)

<h1>Stemming/Lemmatization</h1>

In [170]:
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
df['tokens'] = df['tokens'].apply(
    lambda token_list: list(map(stemmer.stem, token_list))
)

<h1>Vectorization</h1>

In [171]:
# Rebuild one whitespace-separated string per review from the cleaned tokens so
# that TF-IDF is computed on the stopword-filtered, stemmed vocabulary. Fitting
# on df['text_'] would bypass both of those preprocessing steps.
processed_text = df['tokens'].str.join(' ')

vectorizer = TfidfVectorizer()

# X is a SciPy sparse matrix: one row per review, one column per vocabulary term.
X = vectorizer.fit_transform(processed_text)

# Keep the DataFrame view sparse too. X.toarray() would allocate a dense
# (n_reviews x n_terms) float array that is almost entirely zeros and can
# exhaust memory on a corpus of this size.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X, columns=vectorizer.get_feature_names_out()
)

<h1>Save the preprocessed data to a CSV file</h1>

In [172]:
# Write the preprocessed frame to disk without the row index.
# NOTE: `tokens` holds Python lists, which CSV stores as their string form.
df.to_csv(
    path_or_buf='/content/Fake_Review_Detection_Preprocessing_Review.csv',
    index=False,
)