# EXPERIMENT 5

In [24]:
import pandas as pd
import re
from textblob import TextBlob
from autocorrect import Speller
from textblob import Word
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [25]:
data = pd.read_csv('amazon_alexa.csv', sep='\t')
data.head(5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [26]:
len(data)

3150

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [28]:
data.verified_reviews.isnull().sum()

0

In [29]:
data.feedback.isnull().sum()

0

In [30]:
data.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [31]:
reviews = data.verified_reviews
reviews

0                                           Love my Echo!
1                                               Loved it!
2       Sometimes while playing a game, you can answer...
3       I have had a lot of fun with this thing. My 4 ...
4                                                   Music
                              ...                        
3145    Perfect for kids, adults and everyone in betwe...
3146    Listening to music, searching locations, check...
3147    I do love these things, i have them running my...
3148    Only complaint I have is that the sound qualit...
3149                                                 Good
Name: verified_reviews, Length: 3150, dtype: object

# Removing special characters and converting words to lower case

In [32]:
new_list = []
for i in reviews :
    new_string = re.sub(r"[^a-zA-Z0-9]"," ",i)
    new_string = new_string.lower()
    new_list.append(new_string)

df = pd.DataFrame(new_list)
df

Unnamed: 0,0
0,love my echo
1,loved it
2,sometimes while playing a game you can answer...
3,i have had a lot of fun with this thing my 4 ...
4,music
...,...
3145,perfect for kids adults and everyone in betwe...
3146,listening to music searching locations check...
3147,i do love these things i have them running my...
3148,only complaint i have is that the sound qualit...


# Removing extra blank spaces

In [33]:
lst = []
for i in new_list :
    i = re.sub('[\s]+', ' ', i)
    i = re.sub('[\n]+', ' ', i)
    lst.append(i)
df = pd.DataFrame(lst)
df

Unnamed: 0,0
0,love my echo
1,loved it
2,sometimes while playing a game you can answer ...
3,i have had a lot of fun with this thing my 4 y...
4,music
...,...
3145,perfect for kids adults and everyone in between
3146,listening to music searching locations checkin...
3147,i do love these things i have them running my ...
3148,only complaint i have is that the sound qualit...


# Lemmatization

In [34]:
new_lst=[]
for i in lst :
    sent =" ".join([Word(word).lemmatize() for word in i.split()])
    new_lst.append(sent)
reviews = pd.DataFrame(new_lst)
reviews

Unnamed: 0,0
0,love my echo
1,loved it
2,sometimes while playing a game you can answer ...
3,i have had a lot of fun with this thing my 4 y...
4,music
...,...
3145,perfect for kid adult and everyone in between
3146,listening to music searching location checking...
3147,i do love these thing i have them running my e...
3148,only complaint i have is that the sound qualit...


# Removing stopwords

In [35]:
stop_words = set(stopwords.words('english'))

In [36]:
temp=[]
for i in new_lst :
    s = " ".join([w for w in i.split() if w not in stop_words])
    temp.append(s)
df = pd.DataFrame(temp)
df

Unnamed: 0,0
0,love echo
1,loved
2,sometimes playing game answer question correct...
3,lot fun thing 4 yr old learns dinosaur control...
4,music
...,...
3145,perfect kid adult everyone
3146,listening music searching location checking ti...
3147,love thing running entire home tv light thermo...
3148,complaint sound quality great mostly use comma...


# Concatinating dataframes

In [37]:
newData = pd.concat([data,df],axis = 1)
newData.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,0
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,love echo
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,loved
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,sometimes playing game answer question correct...
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,lot fun thing 4 yr old learns dinosaur control...
4,5,31-Jul-18,Charcoal Fabric,Music,1,music


In [38]:
newData = newData.drop(['rating','date','variation','verified_reviews'],axis=1)
newData.rename(columns = {0:'review'}, inplace = True)
newData

Unnamed: 0,feedback,review
0,1,love echo
1,1,loved
2,1,sometimes playing game answer question correct...
3,1,lot fun thing 4 yr old learns dinosaur control...
4,1,music
...,...,...
3145,1,perfect kid adult everyone
3146,1,listening music searching location checking ti...
3147,1,love thing running entire home tv light thermo...
3148,1,complaint sound quality great mostly use comma...


# Data Splitting

In [16]:
newData = newData.sample(frac=1)
newData.head()

Unnamed: 0,feedback,review
2557,1,un excelente equipo inteligente al estar conec...
153,1,pleasantly surprised sound quality many featur...
2813,1,love thing work great kid love
1560,1,love echo show dot using last 6 month show muc...
252,1,enjoyed entire echo experience


In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train, Y_test = train_test_split(newData['review'], newData['feedback'], test_size=0.25, random_state=30)

print('X_train : ' , X_train.shape)
print('Y_train : ' , Y_train.shape)
print('X_test : ' , X_test.shape)
print('Y_test : ' , Y_test.shape)

X_train :  (2362,)
Y_train :  (2362,)
X_test :  (788,)
Y_test :  (788,)


In [18]:
X_train

2971                                             set easy
769                                         always listen
1424    second echo show home mexico one usa one u hom...
2204    love fire stick installing highly recommended ...
2593                                     alexa phenomenal
                              ...                        
588     work great control tv light various device lat...
3132                                           work great
887     really love amazon echo think sound quality gr...
3095                                         doe expected
2306                                    love amazon stick
Name: review, Length: 2362, dtype: object

# TF IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

In [21]:
tf_x_train.shape

(2362, 3156)

# Model

In [80]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)

In [81]:
clf.fit(tf_x_train,Y_train)

LinearSVC(random_state=0)

In [82]:
y_test_pred=clf.predict(tf_x_test)

In [83]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, y_test_pred)

0.9403553299492385

In [84]:
df = pd.DataFrame(Y_test)
df["Predicted_val"] = pd.DataFrame(y_test_pred)

In [86]:
Y_test.value_counts()

1    724
0     64
Name: feedback, dtype: int64

In [87]:
df['Predicted_val'].value_counts()

1.0    177
0.0      6
Name: Predicted_val, dtype: int64

# Prediction

In [22]:
def analyze_sentiment(review) : 
    
    analysis = TextBlob(review)
    
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity == 0:
        return 'Neutral'
    else:
        return 'Negative'

In [43]:
newData['Sentiment'] = newData['review'].apply(lambda x: analyze_sentiment(x))

newData[['review', 'Sentiment']].head(5)

Unnamed: 0,review,Sentiment
0,love echo,Positive
1,loved,Positive
2,sometimes playing game answer question correct...,Neutral
3,lot fun thing 4 yr old learns dinosaur control...,Positive
4,music,Neutral
