In [3]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Monson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Monson\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
df = pd.read_csv('fakeReviewData.csv')
df.head(3)

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...


## Some basic EDA

In [5]:
df.shape

(40432, 4)

In [6]:
df.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [8]:
df.describe()

Unnamed: 0,rating
count,40432.0
mean,4.256579
std,1.144354
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


## Handling Missing Values (if any)

In [9]:
df.isnull().sum()

category    0
rating      0
label       0
text_       0
dtype: int64

### There are no missing values

## Lets check for duplicates

In [10]:
duplicated_rows = df[df.duplicated()]
duplicated_rows

Unnamed: 0,category,rating,label,text_
6025,Sports_and_Outdoors_5,5.0,CG,"This is a really good starter kit, with lots o..."
6708,Sports_and_Outdoors_5,5.0,CG,"Great, no complaints. Comfortable, phone fits ..."
12548,Movies_and_TV_5,5.0,CG,One of the best movies of the year. Not for e...
19802,Pet_Supplies_5,5.0,CG,My dog loves these and it has kept her occupie...
20242,Pet_Supplies_5,5.0,CG,My dog loves it and it has kept her occupied f...
22305,Pet_Supplies_5,5.0,OR,Got these to give to my 8 mth old chihuahua wh...
26444,Kindle_Store_5,5.0,CG,I received this story as an ARC in exchange fo...
27233,Kindle_Store_5,5.0,CG,This is the first book in a series by the auth...
29203,Books_5,5.0,CG,I really enjoyed this book. The characters wer...
33602,Toys_and_Games_5,5.0,CG,I got this for my son for Christmas. He loved...


In [11]:
duplicated_rows.shape[0]

12

### There are 12 duplicate rows

In [12]:
df = df.drop_duplicates()

In [13]:
df[df.duplicated()]

Unnamed: 0,category,rating,label,text_


In [14]:
df.shape

(40420, 4)

### Now we have successfully removed duplicate rows

## Normalizing the Data

### Converting to lowercase, removing special characters, punctuation and numbers

In [15]:
df['text_'] = df['text_'].str.lower() 
df['text_'] = df['text_'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))
df['text_'].head(5)

0    love this  well made sturdy and very comfortab...
1    love it a great upgrade from the original  ive...
2    this pillow saved my back i love the look and ...
3    missing information on how to use it but it is...
4    very nice set good quality we have had the set...
Name: text_, dtype: object

## Tokenization

In [16]:
nltk.download('punkt_tab')
df['text_'] = df['text_'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Monson\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [17]:
df['text_']

0        [love, this, well, made, sturdy, and, very, co...
1        [love, it, a, great, upgrade, from, the, origi...
2        [this, pillow, saved, my, back, i, love, the, ...
3        [missing, information, on, how, to, use, it, b...
4        [very, nice, set, good, quality, we, have, had...
                               ...                        
40427    [i, had, read, some, reviews, saying, that, th...
40428    [i, wasnt, sure, exactly, what, it, would, be,...
40429    [you, can, wear, the, hood, by, itself, wear, ...
40430    [i, liked, nothing, about, this, dress, the, o...
40431    [i, work, in, the, wedding, industry, and, hav...
Name: text_, Length: 40420, dtype: object

## Removing stop-words

### These are words that do not contribute much to the overall meaning of the sentence, we tend to remove those

In [228]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['text_'] = df['text_'].apply(lambda x: [word for word in x if word not in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Monson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [229]:
df['text_'].size, df['text_']

(40420,
 0        [love, well, made, sturdy, comfortable, love, ...
 1        [love, great, upgrade, original, ive, mine, co...
 2          [pillow, saved, back, love, look, feel, pillow]
 3        [missing, information, use, great, product, pr...
 4             [nice, set, good, quality, set, two, months]
                                ...                        
 40427    [read, reviews, saying, bra, ran, small, order...
 40428    [wasnt, sure, exactly, would, little, large, s...
 40429    [wear, hood, wear, hood, wear, jacket, without...
 40430    [liked, nothing, dress, reason, gave, stars, o...
 40431    [work, wedding, industry, work, long, days, fe...
 Name: text_, Length: 40420, dtype: object)

## Stemming and Lemmetization

### eg. "running" -> "run"

In [230]:
lemmatizer = WordNetLemmatizer()

df['text_'] = df['text_'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

## Tokenization using TF-IDF

In [231]:
df['cleaned_reviews'] = df['text_'].apply(lambda x: ' '.join(x))

vectorizer = TfidfVectorizer(max_features=5000)  
X = vectorizer.fit_transform(df['cleaned_reviews'])

In [232]:
df['tokenised_reviews'] = list(X.toarray())
df['tokenised_reviews']

0        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                               ...                        
40427    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08901854...
40428    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
40429    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
40430    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
40431    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04440132...
Name: tokenised_reviews, Length: 40420, dtype: object

In [233]:
df.sample(5)

Unnamed: 0,category,rating,label,text_,cleaned_reviews,tokenised_reviews
24922,Kindle_Store_5,4.0,OR,"[book, l, short, erotic, read, enjoy, short, r...",book l short erotic read enjoy short reading p...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9107,Electronics_5,5.0,CG,"[used, canon, eos, rebel, ti, canon, rebel, ti...",used canon eos rebel ti canon rebel ti ii replace,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
878,Home_and_Kitchen_5,5.0,CG,"[actually, use, makeshift, suction, cup, also,...",actually use makeshift suction cup also love r...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5782,Sports_and_Outdoors_5,5.0,OR,"[fit, like, glove, year, old, fishing, boat, t...",fit like glove year old fishing boat took five...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6659,Sports_and_Outdoors_5,5.0,OR,"[easy, set, easy, play, played, year, old, eve...",easy set easy play played year old everyone pl...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [235]:
df.to_csv('preprocessed_dataset.csv', index=False)