### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import random
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

### Loading the data

In [2]:
# Each line of the file is parsed as one JSON record (lines=True).
df = pd.read_json(path_or_buf="e-reviews.json", lines=True)

In [3]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [4]:
# Full text of the review stored under index label 0.
df.loc[0, 'reviewText']

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [5]:
#shape of the data as a (number of rows, number of columns) tuple
df.shape

(194439, 9)

In [6]:
#counting the missing values in every column of the raw data
df.isna().sum()

reviewerID           0
asin                 0
reviewerName      3519
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
dtype: int64

In [8]:
#pulling out the user reviews into a new single-column dataframe
reviews = df['reviewText'].to_frame()

In [9]:
reviews

Unnamed: 0,reviewText
0,They look good and stick good! I just don't li...
1,These stickers work like the review says they ...
2,These are awesome and make my phone look so st...
3,Item arrived in great time and was in perfect ...
4,"awesome! stays on, and looks great. can be use..."
...,...
194434,Works great just like my original one. I reall...
194435,Great product. Great packaging. High quality a...
194436,"This is a great cable, just as good as the mor..."
194437,I really like it becasue it works well with my...


### Data Pre-processing

In [7]:
#removing non-alphabetic characters, stopwords and short words
def clean_text(text, stop_words=None):
    """Turn a raw review string into a list of cleaned, lower-cased tokens.

    Steps, in order:
      1. drop every character that is not an ASCII letter or whitespace,
      2. split on whitespace and lower-case each token,
      3. drop tokens found in ``stop_words``,
      4. drop tokens of two characters or fewer.

    Note that step 1 runs before step 3, so contractions lose their
    apostrophe first ("don't" becomes "dont") and are then compared with the
    stopword list in that apostrophe-less form.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. ``None`` or ``NaN`` for a
        missing review) yields an empty list.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, the NLTK English stopword
        list is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Missing or non-text values have no tokens; avoids a TypeError in re.sub.
    if not isinstance(text, str):
        return []

    if stop_words is None:
        # Build the NLTK stopword set once and cache it on the function so
        # that it is not rebuilt for every row when used with Series.apply.
        stop_words = getattr(clean_text, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words("english"))
            clean_text._english_stop_words = stop_words

    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Lower-case each token, then remove stopwords and short words
    return [
        word
        for word in (raw.lower() for raw in text.split())
        if len(word) > 2 and word not in stop_words
    ]

In [10]:
#replacing each raw review with its list of cleaned tokens
reviews["reviewText"] = reviews["reviewText"].apply(clean_text)

In [11]:
reviews

Unnamed: 0,reviewText
0,"[look, good, stick, good, dont, like, rounded,..."
1,"[stickers, work, like, review, says, stick, gr..."
2,"[awesome, make, phone, look, stylish, used, on..."
3,"[item, arrived, great, time, perfect, conditio..."
4,"[awesome, stays, looks, great, used, multiple,..."
...,...
194434,"[works, great, like, original, one, really, ne..."
194435,"[great, product, great, packaging, high, quali..."
194436,"[great, cable, good, expensive, apple, one, hu..."
194437,"[really, like, becasue, works, well, life, pro..."


In [13]:
#lemmatizing every token and joining each review back into one space-separated string
lemmatizer = WordNetLemmatizer()
reviews['reviewText'] = reviews['reviewText'].apply(
    lambda tokens: " ".join(map(lemmatizer.lemmatize, tokens))
)

In [14]:
reviews

Unnamed: 0,reviewText
0,look good stick good dont like rounded shape a...
1,sticker work like review say stick great stay ...
2,awesome make phone look stylish used one far a...
3,item arrived great time perfect condition howe...
4,awesome stay look great used multiple apple pr...
...,...
194434,work great like original one really need extra...
194435,great product great packaging high quality app...
194436,great cable good expensive apple one husband d...
194437,really like becasue work well life proof case ...


In [15]:
#creating the VADER sentiment analyser; the scores themselves are computed in the next cell
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [16]:
#adding positive and negative score for each row
# Score every review once and read both values from the same result dict,
# instead of running the analyser twice per row.
vader_scores = reviews['reviewText'].apply(analyser.polarity_scores)
reviews['pos_score'] = vader_scores.apply(lambda scores: scores["pos"])
reviews['neg_score'] = vader_scores.apply(lambda scores: scores["neg"])

In [17]:
reviews

Unnamed: 0,reviewText,pos_score,neg_score
0,look good stick good dont like rounded shape a...,0.223,0.277
1,sticker work like review say stick great stay ...,0.585,0.000
2,awesome make phone look stylish used one far a...,0.387,0.000
3,item arrived great time perfect condition howe...,0.439,0.000
4,awesome stay look great used multiple apple pr...,0.528,0.000
...,...,...,...
194434,work great like original one really need extra...,0.621,0.000
194435,great product great packaging high quality app...,0.552,0.049
194436,great cable good expensive apple one husband d...,0.365,0.076
194437,really like becasue work well life proof case ...,0.329,0.000


In [18]:
#calculating the final sentiment: 0 --> positive (pos_score >= neg_score), 1 --> negative
# Vectorised comparison on the named columns. This avoids a row-wise apply
# and the positional x[0] / x[1] lookups on a string-labelled Series, which
# pandas has deprecated. Negating ">=" (rather than using "<") keeps the
# original outcome of 1 for rows where the comparison is False.
reviews["sentiment"] = (~(reviews["pos_score"] >= reviews["neg_score"])).astype("int64")

In [19]:
reviews

Unnamed: 0,reviewText,pos_score,neg_score,sentiment
0,look good stick good dont like rounded shape a...,0.223,0.277,1
1,sticker work like review say stick great stay ...,0.585,0.000,0
2,awesome make phone look stylish used one far a...,0.387,0.000,0
3,item arrived great time perfect condition howe...,0.439,0.000,0
4,awesome stay look great used multiple apple pr...,0.528,0.000,0
...,...,...,...,...
194434,work great like original one really need extra...,0.621,0.000,0
194435,great product great packaging high quality app...,0.552,0.049,0
194436,great cable good expensive apple one husband d...,0.365,0.076,0
194437,really like becasue work well life proof case ...,0.329,0.000,0


In [20]:
#number of reviews per sentiment label: 0 --> positive, 1 --> negative
reviews['sentiment'].value_counts()

0    173868
1     20571
Name: sentiment, dtype: int64

#### Creating a new CSV file with an equal number of rows from each sentiment category

In [21]:
#taking the first 5000 rows of each category (0 --> positive, 1 --> negative)
pos_df = reviews.loc[reviews['sentiment'].eq(0)].head(5000)
neg_df = reviews.loc[reviews['sentiment'].eq(1)].head(5000)

In [23]:
#stacking the negative rows on top of the positive rows with a fresh 0..n-1 index
df1 = pd.concat(
    [neg_df, pos_df],
    ignore_index=True,
)

In [24]:
df1

Unnamed: 0,reviewText,pos_score,neg_score,sentiment
0,look good stick good dont like rounded shape a...,0.223,0.277,1
1,worked first week charge phone waste money,0.000,0.318,1
2,performs exactly advertised sturdily builtand ...,0.228,0.278,1
3,worked great first couple week stopped complet...,0.240,0.291,1
4,nothing special sure nice tell powered led glo...,0.000,0.470,1
...,...,...,...,...
9995,prosyou turn outlet theory get usb charging po...,0.107,0.054,0
9996,think best thing use travel kid used road kid ...,0.487,0.000,0
9997,ive wilson fan since day manufacturing yagi lo...,0.407,0.041,0
9998,little dinky antenna verizon wireless network ...,0.178,0.077,0


In [25]:
#shuffling the rows
# A fixed random_state makes the shuffle, and therefore the saved file,
# identical on every run of the notebook.
df_new = df1.sample(frac=1, random_state=42).reset_index(drop=True)

In [26]:
# Preview the first five shuffled rows.
df_new.head(5)

Unnamed: 0,reviewText,pos_score,neg_score,sentiment
0,vital accessory player love retractable dont l...,0.591,0.0,0
1,case fit well side volume button dont work top...,0.205,0.259,1
2,iphone broke bought use iphone come whenever u...,0.137,0.236,1
3,needed transmitter iphone jobthings dont like ...,0.0,0.363,1
4,cheap knock dont buy expecting one like one ca...,0.185,0.0,0


In [56]:
#writing the balanced, shuffled data to disk (without the index column) for model training
df_new.to_csv(path_or_buf="cleaned.csv", index=False)