In [142]:
import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [143]:
data = pd.read_csv('amazon_alexa.tsv', sep='\t')

In [144]:
data.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [145]:
data['feedback']

0       1
1       1
2       1
3       1
4       1
       ..
3145    1
3146    1
3147    1
3148    1
3149    1
Name: feedback, Length: 3150, dtype: int64

In [146]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
rating              3150 non-null int64
date                3150 non-null object
variation           3150 non-null object
verified_reviews    3150 non-null object
feedback            3150 non-null int64
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [147]:
data.shape

(3150, 5)

In [148]:
data['feedback'].value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [149]:
import string
import re
from spacy.lang.en.stop_words import STOP_WORDS

In [150]:
nlp = spacy.load('en')
sw = STOP_WORDS
punctuation = string.punctuation

In [151]:
def cleaning(input):
    token = nlp(input)
    #token = [i.lemma_.lower() if i.lemma_ != '-PRON-' else i.lower() for i in token]
    token = [i for i in token if i.pos_!=['NOUN'] and len(i)>2]
    
    token = [i.lemma_.lower().strip() if i.lemma_ != '-PRON-' else i.lower_ for i in token]
    token = [i for i in token if i not in sw and i not in punctuation]
    
    token = ' '.join(token)
    token = re.sub('[^a-zA-Z]',' ',token)
    return token
    

In [152]:
data['verified_reviews'] = data['verified_reviews'].apply(cleaning)

In [153]:
data['verified_reviews'].head(10)

0                                            love echo
1                                                 love
2    play game answer question correctly alexa wron...
3    lot fun thing old learn dinosaur control light...
4                                                music
5    receive echo gift need bluetooth play music ea...
6    cellphone use feature ipad use great alarm dea...
7    think  th purchase work room house like featur...
8                                           look great
9    love listen song hear childhood news weather i...
Name: verified_reviews, dtype: object

In [154]:
tv = TfidfVectorizer(ngram_range=(1,1), lowercase=False)

In [155]:
tf=tv.fit_transform(data['verified_reviews'])

In [156]:
tf=pd.DataFrame(data = tf.toarray(), columns=tv.get_feature_names())
tf

Unnamed: 0,abay,abc,abd,ability,able,abode,absolutely,absolutly,accent,accept,...,you,young,youtube,yrs,yup,zero,zigbee,zonke,zzzz,zzzzzzz
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.229719,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3146,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3147,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3148,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,love echo,1
1,5,31-Jul-18,Charcoal Fabric,love,1
2,4,31-Jul-18,Walnut Finish,play game answer question correctly alexa wron...,1
3,5,31-Jul-18,Charcoal Fabric,lot fun thing old learn dinosaur control light...,1
4,5,31-Jul-18,Charcoal Fabric,music,1


In [158]:
data = pd.concat([data,tf],axis=1)

In [159]:
data.drop(['verified_reviews'],axis=1,inplace=True)

In [160]:
data.head()

Unnamed: 0,rating,date,variation,feedback,abay,abc,abd,ability,able,abode,...,you,young,youtube,yrs,yup,zero,zigbee,zonke,zzzz,zzzzzzz
0,5,31-Jul-18,Charcoal Fabric,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,31-Jul-18,Charcoal Fabric,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,31-Jul-18,Walnut Finish,1,0.0,0.0,0.0,0.0,0.229719,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,31-Jul-18,Charcoal Fabric,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,31-Jul-18,Charcoal Fabric,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
X = data.iloc[:,4:].values

In [162]:
y=data.iloc[:,3]

In [163]:
y

0       1
1       1
2       1
3       1
4       1
       ..
3145    1
3146    1
3147    1
3148    1
3149    1
Name: feedback, Length: 3150, dtype: int64

In [164]:
from sklearn.model_selection import train_test_split

In [165]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [166]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [167]:
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [168]:
predict = model.predict(X_test)

In [169]:
from sklearn import metrics

In [170]:
metrics.accuracy_score(y_test,predict)

0.9259259259259259

In [171]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

In [172]:
model.fit(X_train,y_train)
predict = model.predict(X_test)
metrics.accuracy_score(y_test,predict)

0.9343915343915344

In [173]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50, random_state=0)

In [174]:
model.fit(X_train,y_train)
predict = model.predict(X_test)
metrics.accuracy_score(y_test,predict)

0.944973544973545

In [175]:
from xgboost import XGBClassifier
model = XGBClassifier()

In [176]:
model.fit(X_train,y_train)
predict = model.predict(X_test)
metrics.accuracy_score(y_test,predict)

0.9301587301587302