In [1]:
import pandas as pd 
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go

In [2]:
# Load the de-duplicated reviews produced by the previous notebook.
# Forward slashes keep this relative path valid on every OS and avoid the
# invalid "\D" / "\A" escape sequences that the backslash literal contained.
# NOTE: only unpickle files you trust -- loading a pickle can execute arbitrary code.
data = pd.read_pickle('../Data/After_dropping_duplicates.pkl')

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


## Fixing Date Column Type

In [4]:
# Parse strings such as "31-Jul-18" into datetimes; values that do not match
# the format are coerced to NaT instead of raising.
data['date'] = pd.to_datetime(
    data['date'],
    format='%d-%b-%y',
    errors='coerce',
)

In [5]:
# Derive the full month name (strftime '%B', e.g. "July") from each review date.
review_dates = data['date']
data['month'] = review_dates.dt.strftime('%B')

In [6]:
# Count the non-null entries of every column per month and turn the
# month index back into a regular column for plotting.
grouped_date = (
    data
    .groupby('month')
    .count()
    .reset_index()
)

In [7]:
# Bar chart of the number of feedback entries per month, with the months
# forced into calendar order instead of the alphabetical groupby order.
month_order = {'month': ["May", "June", "July"]}
fig = px.bar(
    grouped_date,
    x='month',
    y='feedback',
    title='Feedbacks per month',
    category_orders=month_order,
    color_discrete_sequence=['indianred'],
    text_auto=True,
)
fig.update_layout(bargap=0.3)

In [8]:
import string
# Show the ASCII punctuation characters that removing_panctuactions strips from the reviews.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
# Translation table that deletes every ASCII punctuation character; built once
# so each call is a single str.translate pass instead of a per-character scan.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def removing_panctuactions(text):
    """Return `text` with every character of `string.punctuation` removed.

    Parameters
    ----------
    text : str
        The review text to clean.

    Returns
    -------
    str
        `text` without ASCII punctuation; all other characters (letters,
        digits, whitespace, non-ASCII symbols) are kept in their original order.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [10]:
# Strip punctuation from every review text
reviews_with_punctuation = data['verified_reviews']
data['verified_reviews'] = reviews_with_punctuation.apply(removing_panctuactions)

In [11]:
import nltk # Natural Language tool kit 

# The lemmatization step later in this notebook uses WordNetLemmatizer, which
# needs the 'wordnet' corpus; fetch it here so a fresh environment can run the
# notebook end to end.
nltk.download('wordnet')
# Stop-word lists used by removing_stopwords; kept as the last expression so
# the cell still displays the result of this download.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Seif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
# Display the English stop-word list that removing_stopwords filters against.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
# Removing stopwords
# Load the English stop-word list once into a frozenset: the corpus is not
# re-read for every word, and each membership test is O(1) instead of a list scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def removing_stopwords(text):
    """Split `text` on whitespace and drop the English stop words.

    Parameters
    ----------
    text : str
        A review string (punctuation is expected to be removed already).

    Returns
    -------
    list of str
        The remaining words in their original order and original casing.
        The stop-word comparison is case-insensitive (each word is
        lower-cased only for the lookup).
    """
    return [word for word in text.split() if word.lower() not in _ENGLISH_STOPWORDS]

In [14]:
# Replace each review string with its list of non-stop-word tokens.
punctuation_free_reviews = data['verified_reviews']
data['verified_reviews'] = punctuation_free_reviews.apply(removing_stopwords)

In [15]:
# Drop empty-string tokens from each review
def removing_white_spaces(text):
    """Return a list of the items of `text` that are not the empty string.

    Only entries equal to '' are removed; tokens are not stripped or
    otherwise modified, and their order is preserved.

    NOTE(review): the token lists in this notebook come from `str.split()`
    with no separator, which does not produce empty strings, so this step
    looks like a no-op in the current pipeline -- confirm before removing.
    """
    kept_tokens = []
    for token in text:
        if token == '':
            continue
        kept_tokens.append(token)
    return kept_tokens

In [16]:
# Remove empty-string tokens from every review's token list.
tokenized_reviews = data['verified_reviews']
data['verified_reviews'] = tokenized_reviews.apply(removing_white_spaces)

## Dropping the unnecessary columns

In [17]:
# List the current columns to decide which ones to drop.
data.columns

Index(['rating', 'date', 'variation', 'verified_reviews', 'feedback', 'month'], dtype='object')

In [18]:
# Remove the columns that are dropped as unnecessary at this stage
# (modifies `data` in place).
data.drop(
    columns=['rating', 'date', 'month'],
    inplace=True,
)

## Encoding variations column

In [19]:
from category_encoders import BinaryEncoder

# Binary-encode the categorical `variation` column; `temp` holds the
# resulting frame of encoded `variation_*` columns.
encoder = BinaryEncoder()
temp = encoder.fit_transform(data['variation'])

In [20]:
# Inspect the binary-encoded variation columns.
temp

Unnamed: 0,variation_0,variation_1,variation_2,variation_3,variation_4
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,1,0
3,0,0,0,0,1
4,0,0,0,0,1
...,...,...,...,...,...
2430,0,1,1,1,1
2431,1,0,0,0,0
2432,0,1,1,1,1
2433,0,1,1,1,1


In [21]:
# The original categorical column is replaced by its encoded form,
# so remove it (modifies `data` in place).
data.drop(columns='variation', inplace=True)

In [22]:
# Append the encoded variation columns side by side with the remaining columns.
data = pd.concat(
    [data, temp],
    axis='columns',
)

In [23]:
# Inspect the frame after swapping `variation` for its encoded columns.
data

Unnamed: 0,verified_reviews,feedback,variation_0,variation_1,variation_2,variation_3,variation_4
0,"[Love, Echo]",1,0,0,0,0,1
1,[Loved],1,0,0,0,0,1
2,"[Sometimes, playing, game, answer, question, c...",1,0,0,0,1,0
3,"[lot, fun, thing, 4, yr, old, learns, dinosaur...",1,0,0,0,0,1
4,[Music],1,0,0,0,0,1
...,...,...,...,...,...,...,...
2430,"[love, things, running, entire, home, TV, ligh...",1,0,1,1,1,1
2431,"[complaint, sound, quality, isnt, great, mostl...",1,1,0,0,0,0
2432,[Good],1,0,1,1,1,1
2433,"[Nice, little, unit, issues]",1,0,1,1,1,1


## Lemmatizing the text

In [24]:
from nltk.stem import WordNetLemmatizer
def lemmatization(text):
    """Lower-case every token and lemmatize it as a verb.

    Parameters
    ----------
    text : iterable of str
        The tokens of one review.

    Returns
    -------
    list of str
        A new list with one lemmatized, lower-cased token per input token,
        in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    # Build a new list instead of lower-casing `text` in place, so the
    # caller's token list is left untouched.
    return [lemmatizer.lemmatize(word.lower(), pos='v') for word in text]


In [25]:
# Lemmatize every token list, then join the tokens back into one
# space-separated string per review.
lemmatized_reviews = data['verified_reviews'].apply(lemmatization)
data['verified_reviews'] = lemmatized_reviews.apply(' '.join)

In [26]:
# Inspect the final pre-processed frame before saving it.
data

Unnamed: 0,verified_reviews,feedback,variation_0,variation_1,variation_2,variation_3,variation_4
0,love echo,1,0,0,0,0,1
1,love,1,0,0,0,0,1
2,sometimes play game answer question correctly ...,1,0,0,0,1,0
3,lot fun thing 4 yr old learn dinosaurs control...,1,0,0,0,0,1
4,music,1,0,0,0,0,1
...,...,...,...,...,...,...,...
2430,love things run entire home tv light thermosta...,1,0,1,1,1,1
2431,complaint sound quality isnt great mostly use ...,1,1,0,0,0,0
2432,good,1,0,1,1,1,1
2433,nice little unit issue,1,0,1,1,1,1


In [27]:
# Persist the pre-processed frame for the next notebook.
# Forward slashes keep this relative path valid on every OS and avoid the
# invalid "\D" / "\A" escape sequences that the backslash literal contained.
pd.to_pickle(data, '../Data/After_preprocessing.pkl')