In [1]:
import numpy as np 
import pandas as pd 

In [2]:
#loading the dataset 
dump = pd.read_csv('AmazonAlexa_ReviewsDataset.tsv',sep='\t')
dump

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


### Data Pre-Processing 

In [3]:
#keeping comlumns of our interest 
dataset = dump[['verified_reviews','rating']]
dataset.columns=['Reviews','Sentiment']
dataset.head()

Unnamed: 0,Reviews,Sentiment
0,Love my Echo!,5
1,Loved it!,5
2,"Sometimes while playing a game, you can answer...",4
3,I have had a lot of fun with this thing. My 4 ...,5
4,Music,5


In [4]:
#creating a new column sentiment based on overall ratings 
def compute_sentiments(labels):
    sentiments = []
    for label in labels:
        if label>3.0:
            sentiment=1
        elif label<=3.0:
            sentiment=0
        sentiments.append(sentiment)
    return sentiments 

In [5]:
dataset['Sentiment']= compute_sentiments(dataset.Sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Sentiment']= compute_sentiments(dataset.Sentiment)


In [6]:
dataset

Unnamed: 0,Reviews,Sentiment
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1
...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1
3146,"Listening to music, searching locations, check...",1
3147,"I do love these things, i have them running my...",1
3148,Only complaint I have is that the sound qualit...,1


In [7]:
#check the distribution of sentiments 
dataset['Sentiment'].value_counts()

1    2741
0     409
Name: Sentiment, dtype: int64

Interpretation - there are more positive values 


In [8]:
#checl for null values
dataset.isnull().sum()

Reviews      0
Sentiment    0
dtype: int64

Interpretation - none null values 

### Data Transformation 

In [9]:
x= dataset['Reviews']
y= dataset['Sentiment']

In [10]:
import spacy.cli
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
from customize_tokenizer_function import CustomTokenizer

In [12]:
#testing 
token = CustomTokenizer()
token.text_data_cleaning("Those were the best Days of my life!")

['good', 'day', 'life']

So, here some components are dropped and we only get words that carry some meaning 

#### Now to convert these components into vector form 
### Feature Engineering (TF-IDF)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [14]:
#single transofrmer in the pipeline 
tfidf = TfidfVectorizer(tokenizer= token.text_data_cleaning)

## Train the model 
### Train/Test Split 

In [15]:
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, stratify = dataset.Sentiment, random_state=0)

In [16]:
x_train.shape, x_test.shape

((2520,), (630,))

2520 samples - training dataset, 630 - test dataset 

## Fit our sentiment analysis model pipeline to the training data 
### Fit x_train and y_train

In [17]:
from sklearn.svm import LinearSVC 
from sklearn.pipeline import Pipeline 

In [18]:
#classifier of the pipeline 
classifier = LinearSVC()

In [19]:
#it will first do vectorization then classification
pipeline = Pipeline([('tfidf',tfidf),('clf',classifier)])

In [20]:
pipeline.fit(x_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<bound method CustomTokenizer.text_data_cleaning of <customize_tokenizer_function.CustomTokenizer object at 0x0000020C9D6D31F0>>)),
                ('clf', LinearSVC())])

## Check Model Performance 

In [21]:
y_predic = pipeline.predict(x_test)

In [22]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [23]:
#confusion_matrix
confusion_matrix(y_test,y_predic)

array([[ 37,  45],
       [ 11, 537]], dtype=int64)

In [24]:
#classification_report
print(classification_report(y_test,y_predic))

              precision    recall  f1-score   support

           0       0.77      0.45      0.57        82
           1       0.92      0.98      0.95       548

    accuracy                           0.91       630
   macro avg       0.85      0.72      0.76       630
weighted avg       0.90      0.91      0.90       630



Interpretation - 91% accuracy 

In [25]:
round(accuracy_score(y_test,y_predic)*100,2)

91.11

## Model Serialization

In [26]:
import joblib 
joblib.dump(pipeline,'sentiment_model.pkl')

['sentiment_model.pkl']

## Predicting sentiments using Model 

Simple way 

In [27]:
#to print output in bold 
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [28]:
prediction = pipeline.predict(["alexa is very bad"])

if prediction ==1:
    print(color.BOLD+"POSITIVE REVIEW"+color.END)
else:
    print(color.BOLD+"NEGATIVE REVIEW"+color.END)

[1mNEGATIVE REVIEW[0m


Another sophisticated way 

In [30]:
new_review=[]
pred_sentiment=[]

while True:
    review = input("what is your review on Alexa\n")
    
    if review == 'skip':
        print("see you soon :) ")
        break 
    else:
        prediction = pipeline.predict([review])
        
    if prediction == 1:
        result = 'POSITIVE'
        print(color.BOLD+"The customer's review is POSITIVE\n\n"+color.END)
    else:
        result = 'NEGATIVE'
        print(color.BOLD+"The customer's review is NEGATIVE\n\n"+color.END)
        
    new_review.append(review)
    pred_sentiment.append(result)

what is your review on Alexa
alexa is stupid
[1mThe customer's review is NEGATIVE

[0m
what is your review on Alexa
alexa is bad
[1mThe customer's review is NEGATIVE

[0m
what is your review on Alexa
skip
see you soon :) 


In [None]:
Results_summary = pd.DataFrame({'New Review': new_review,
                               'Sentiment': pred_sentiment
                               })
Results_summary.to_csv("./alexa_reviews_dataset.tsv", sep='\t', encoding='UTF-8', index=False)
Results_summary

### Model Deployment - using Flask 

In [None]:
showing you in a few seconds !!!!!!!