In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
%matplotlib inline


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud


import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

In [3]:
from pylab import rcParams
import warnings
warnings.filterwarnings('ignore')
rcParams['figure.figsize']=14,6
plt.style.use('ggplot')

In [9]:
with open('picklefile/prepare_data_set.pickle', 'rb') as data:
    df = pickle.load(data)

In [10]:
df['sentiment']=df['sentiment'].astype('int')

In [11]:
df.head()

Unnamed: 0,content,rating,sentiment,text length
0,This kindle is light and easy to use especiall...,5.0,1,63
1,Didnt know how much i'd use a kindle so went f...,4.0,1,107
2,I am 100 happy with my purchase. I caught it o...,5.0,1,757
3,Solid entry level Kindle. Great for kids. Gift...,5.0,1,176
4,This make an excellent ebook reader. Don't exp...,5.0,1,158


In [12]:
df[df['sentiment']==0]

Unnamed: 0,content,rating,sentiment,text length
18,I was looking for a kindle whitepaper. I saw o...,1.0,0,511
67,Looking at the picture and seeing it was 8th g...,1.0,0,651
75,it would not load my books proper. took a doze...,2.0,0,96
86,"The screen is too dark, and cannot adjust the ...",2.0,0,56
136,I have to say it was a little confusing and fr...,2.0,0,420
...,...,...,...,...
8691,Poor quality. It has not been few months and i...,1.0,0,142
8692,Sound quality is not good at all. Muddy sound ...,1.0,0,124
8693,"flimsy product, dispite of minimal use, rubber...",1.0,0,127
8694,Bass effect very low. Spend some more bucks an...,2.0,0,75


In [13]:
df.shape

(8696, 4)

In [14]:
df.iloc[1208]['content']

'Best Buy associate explained product very well. Easy to use'

In [15]:
df.iloc[459]['sentiment']

1

## Splitting the data into train data and test data 

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['content'],df['sentiment'], test_size=0.25, random_state=101)

print('Loading %d training examples and %d validation examples. \n' %(X_train.shape[0],X_test.shape[0]))

Loading 6522 training examples and 2174 validation examples. 



## Function for Text Preprocessing

In [17]:
import re
# Creating a function to do text preprocessing
## 2. Remove non-character such as digits and symbols
## 3. Convert to lower case
## 4. Remove stop words such as "the" and "and" if needed
## 5. Convert to root words by stemming if needed

def cleanText(text, remove_stopwords=True, Lemmatization=True):
    
    # Removing non-character
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    
    # converting to lower case
    words = letters_only.lower().split()  
    
    
    #Lemmatization
    if Lemmatization==True: 
        Lemmatizer = WordNetLemmatizer() 
        words = [Lemmatizer.lemmatize(w) for w in words]
    
    # Removing stopword
    if remove_stopwords==True: 
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    # Stemming
    #if stemming==True: 
     #   stemmer = PorterStemmer('english') 
     #   words = [stemmer.stem(w) for w in words]
    
    
    return( " ".join(words))

In [18]:

# Preprocessing text data in training set and validation set

X_train_cleaned = []
X_test_cleaned = []

for d in X_train:
    X_train_cleaned.append(cleanText(d))
print('Showing a cleaned review in the training set : \n\n',  X_train_cleaned[344])
    
for d in X_test:
    X_test_cleaned.append(cleanText(d))

Showing a cleaned review in the training set : 

 love amazon echo plus bad sound get bed turn light fan even tv love alexa


In [19]:
# Fitting and transform the training data to a document-term matrix using CountVectorizer

countVect = CountVectorizer() 
X_train_countVect = countVect.fit_transform(X_train_cleaned)

#Showing what are the words which are given much importance by the model
print("Number of features : %d \n" %len(countVect.get_feature_names()))  
print("Show some feature names : \n", countVect.get_feature_names()[::500])



Number of features : 6395 

Show some feature names : 
 ['aa', 'batery', 'clip', 'detailed', 'exactly', 'grate', 'item', 'mickey', 'perchues', 'receptive', 'shes', 'surroundings', 'usual']


In [20]:
type(X_train_countVect)

scipy.sparse.csr.csr_matrix

## Building a Machine Learning Model 

In [21]:
from sklearn.pipeline import Pipeline
pipeline_lr = Pipeline([('lr_classifier',LogisticRegression(random_state=0))])

pipeline_dt = Pipeline([('dt_classifier',DecisionTreeClassifier())])

pipeline_rf = Pipeline([('rf_classifier',RandomForestClassifier())])

pipeline_mnb = Pipeline([('mnb_classifier',MultinomialNB())])

In [22]:
pipelines = [pipeline_lr,pipeline_dt,pipeline_rf,pipeline_mnb]
best_accuracy=0.0
best_classifier=0
best_pipeline=""
pipe_dict = {0:'Logistic Regression',1:'Decision Tree',2:'RandomForest',3:'MultiNomialNB'}

In [23]:
for pipe in pipelines:
    pipe.fit(X_train_countVect,y_train)

In [24]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(countVect.transform(X_test_cleaned),y_test)))

Logistic Regression Test Accuracy: 0.9415823367065317
Decision Tree Test Accuracy: 0.890984360625575
RandomForest Test Accuracy: 0.9443422263109476
MultiNomialNB Test Accuracy: 0.9314627414903404


In [25]:
for i,model in enumerate(pipelines):
    if model.score(countVect.transform(X_test_cleaned),y_test)>best_accuracy:
        best_accuracy = model.score(countVect.transform(X_test_cleaned),y_test)
        best_classifier = i
        best_pipeline = model
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:RandomForest


In [32]:
classifier = RandomForestClassifier()
classifier.fit(X_train_countVect,y_train)

RandomForestClassifier()

In [33]:
def modelEvaluation(predictions):
    
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [34]:
# Evaluating the model on validaton set

predictions = classifier.predict(countVect.transform(X_test_cleaned))
modelEvaluation(predictions)


Accuracy on validation set: 0.9430

Classification report : 
               precision    recall  f1-score   support

           0       0.95      0.93      0.94      1010
           1       0.94      0.96      0.95      1164

    accuracy                           0.94      2174
   macro avg       0.94      0.94      0.94      2174
weighted avg       0.94      0.94      0.94      2174


Confusion Matrix : 
 [[ 937   73]
 [  51 1113]]


## Sample Prediction 

In [56]:
text = """ Pretty much satisfied.
Super fast processing, awesome screen colors, robust build, good keyboard and touch pad.
Good for all the productivity stuff like video editing, animation softwares, graphics software, heavy coding environments and off course games"""
clean_text = cleanText(text)
clean_text1 = clean_text.split()
print(clean_text1)
pred = classifier.predict(countVect.transform(clean_text1))

pred


['pretty', 'much', 'satisfied', 'super', 'fast', 'processing', 'awesome', 'screen', 'color', 'robust', 'build', 'good', 'keyboard', 'touch', 'pad', 'good', 'productivity', 'stuff', 'like', 'video', 'editing', 'animation', 'software', 'graphic', 'software', 'heavy', 'coding', 'environment', 'course', 'game']


array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [57]:
End_prediction = np.bincount(pred)
End_prediction

array([29,  1], dtype=int64)

In [58]:
End_prediction = np.bincount(pred)
End_prediction_dict = {0:End_prediction[0],1:End_prediction[1]}
if End_prediction_dict[0] > End_prediction_dict[1]: 
    print(np.delete(End_prediction,1))
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0])
    print("Sorry for the inconvenience. Our support staff will contact you and sort out the problem.")
else:
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0])
    print("Thank You for your feedback!😊")

[29]
Positive words: 1
Negative words: 29
Sorry for the inconvenience. Our support staff will contact you and sort out the problem.


---

# TF-IDF Model

In [38]:
 #Fit and transform the training data to a document-term matrix using TfidfVectorizer 


tfidf = TfidfVectorizer(min_df=5) # Taking a minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)
print("Number of features : %d \n" %len(tfidf.get_feature_names()))
print("Show some feature names : \n", tfidf.get_feature_names()[::1000])

#  Instatiating and fitting Logistic Regression object

lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

Number of features : 2394 

Show some feature names : 
 ['00', 'here', 'station']


LogisticRegression()

In [39]:
# Looking at the top 10 features with smallest and the largest coefficients

feature_names = np.array(tfidf.get_feature_names())
sorted_coef_index = lr.coef_[0].argsort()
print('\nTop 10 features with smallest coefficients :\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Top 10 features with largest coefficients : \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))


Top 10 features with smallest coefficients :
['not' 'batteries' 'slow' 'returned' 'disappointed' 'after' 'don' 'poor'
 'return' 'charge']

Top 10 features with largest coefficients : 
['great' 'love' 'easy' 'echo' 'alexa' 'my' 'well' 'gift' 'loves' 'tablet']


In [40]:
estimators = [("tfidf", TfidfVectorizer()), ("lr", LogisticRegression())]
model = Pipeline(estimators)


# Grid search
params = {"lr__C":[0.1, 1, 10], #regularization param of logistic regression
          "tfidf__min_df": [1, 3], #min count of words 
          "tfidf__max_features": [1000, None], #max features
          "tfidf__ngram_range": [(1,1), (1,2)], #1-grams or 2-grams
          "tfidf__stop_words": [None, "english"]} #use stopwords or don't

grid = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", n_jobs=-1)
grid.fit(X_train_cleaned, y_train)
print("The best paramenter set is : \n", grid.best_params_)


# Evaluate on the validaton set
predictions = grid.predict(X_test_cleaned)
modelEvaluation(predictions)

The best paramenter set is : 
 {'lr__C': 10, 'tfidf__max_features': None, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}

Accuracy on validation set: 0.9512

Classification report : 
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      1010
           1       0.95      0.96      0.95      1164

    accuracy                           0.95      2174
   macro avg       0.95      0.95      0.95      2174
weighted avg       0.95      0.95      0.95      2174


Confusion Matrix : 
 [[ 947   63]
 [  43 1121]]


## Sample Prediction

In [59]:
text = """Pretty much satisfied.
Super fast processing, awesome screen colors, robust build, good keyboard and touch pad.
Good for all the productivity stuff like video editing, animation softwares, graphics software, heavy coding environments and off course games"""
clean_text = cleanText(text)
clean_text1 = clean_text.split()
print(clean_text1)
pred1 = grid.predict(clean_text1)

pred1

['pretty', 'much', 'satisfied', 'super', 'fast', 'processing', 'awesome', 'screen', 'color', 'robust', 'build', 'good', 'keyboard', 'touch', 'pad', 'good', 'productivity', 'stuff', 'like', 'video', 'editing', 'animation', 'software', 'graphic', 'software', 'heavy', 'coding', 'environment', 'course', 'game']


array([1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1])

In [60]:
End_prediction = np.bincount(pred1)
End_prediction

array([15, 15], dtype=int64)

In [61]:
End_prediction = np.bincount(pred1)
End_prediction_dict = {0:End_prediction[0],1:End_prediction[1]}
if End_prediction_dict[0] > End_prediction_dict[1]: 
    print(np.delete(End_prediction,1))
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0])
    print("it may be a negative review or compliant or both")
else:
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0])
    print("It is a positive review")

Positive words: 15
Negative words: 15
It is a positive review
