In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
from multiprocessing import Pool
%matplotlib inline


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud


import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

import logging
from gensim.models import word2vec
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
#from keras.layers.embeddings import Embedding



In [3]:
from pylab import rcParams
import warnings
warnings.filterwarnings('ignore')
rcParams['figure.figsize']=14,6
plt.style.use('ggplot')

In [4]:
with open('picklefile/prepare_data_set.pickle', 'rb') as data:
    df = pickle.load(data)

In [5]:
df['sentiment']=df['sentiment'].astype('int')

In [6]:
df.head()

Unnamed: 0,content,rating,sentiment,text length
0,This kindle is light and easy to use especiall...,5.0,1,63
1,Didnt know how much i'd use a kindle so went f...,4.0,1,107
2,I am 100 happy with my purchase. I caught it o...,5.0,1,757
3,Solid entry level Kindle. Great for kids. Gift...,5.0,1,176
4,This make an excellent ebook reader. Don't exp...,5.0,1,158


In [7]:
df[df['sentiment']==0]

Unnamed: 0,content,rating,sentiment,text length
18,I was looking for a kindle whitepaper. I saw o...,1.0,0,511
67,Looking at the picture and seeing it was 8th g...,1.0,0,651
75,it would not load my books proper. took a doze...,2.0,0,96
86,"The screen is too dark, and cannot adjust the ...",2.0,0,56
136,I have to say it was a little confusing and fr...,2.0,0,420
...,...,...,...,...
7191,This is exactly like any other usb power charg...,1.0,0,155
7192,Amazon should include this charger with the Ki...,1.0,0,255
7193,Love my Kindle Fire but I am really disappoint...,1.0,0,314
7194,I was surprised to find it did not come with a...,1.0,0,231


In [8]:
df.shape

(9372, 4)

In [9]:
df.iloc[7195]['content']

"to spite the fact that i have nothing but good things to say about amazon and anthing i've ever gotten from them. and that i love my fire. i find it greedy that the wall charger doesn't come with the kindle. not everyone, ok most people, but still not everyone has a usb port to plug into. i'm taking my charger back. i think amazon should make things right and let anyone who purchased a kindle without a charger have one for free, or credit those who had to buy one."

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['content'],df['sentiment'], test_size=0.25, random_state=101)

print('Loading %d training examples and %d validation examples. \n' %(X_train.shape[0],X_test.shape[0]))

Loading 7029 training examples and 2343 validation examples. 



In [12]:
import re
# Creating a function to do text preprocessing
## 2. Remove non-character such as digits and symbols
## 3. Convert to lower case
## 4. Remove stop words such as "the" and "and" if needed
## 5. Convert to root words by stemming if needed

def cleanText(text, remove_stopwords=False, Lemmatization=False,split_text=False):
    
    # Removing non-character
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    
    # converting to lower case
    words = letters_only.lower().split()  
    
    
    #Lemmatization
    if Lemmatization==True: 
        Lemmatizer = WordNetLemmatizer('english') 
        words = [Lemmatizer.lemmatize(w) for w in words]
    
    # Removing stopword
    if remove_stopwords==True: 
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    # Stemming
    #if stemming==True: 
     #   stemmer = PorterStemmer('english') 
     #   words = [stemmer.stem(w) for w in words]
    
    
    # Spliting text    
    if split_text==True:  
        return (words)
    
    return( " ".join(words))

In [13]:

# Preprocessing text data in training set and validation set

X_train_cleaned = []
X_test_cleaned = []

for d in X_train:
    X_train_cleaned.append(cleanText(d))
print('Showing a cleaned review in the training set : \n\n',  X_train_cleaned[344])
    
for d in X_test:
    X_test_cleaned.append(cleanText(d))

Showing a cleaned review in the training set : 

 it s not worth to buy i got issue with in one month from purchase date one ear sound stopped suddenly please don t waste your money on this product


In [15]:
# Fitting and transform the training data to a document-term matrix using CountVectorizer

countVect = CountVectorizer() 
X_train_countVect = countVect.fit_transform(X_train_cleaned)
print("Number of features : %d \n" %len(countVect.get_feature_names())) #6378 
print("Show some feature names : \n", countVect.get_feature_names()[::1000])


# Train MultinomialNB classifier

mnb = MultinomialNB()
mnb.fit(X_train_countVect, y_train)

Number of features : 7681 

Show some feature names : 
 ['aa', 'cansilation', 'dollars', 'grow', 'lowes', 'population', 'sheets', 'trusted']


MultinomialNB()

In [16]:
type(X_train_countVect)

scipy.sparse.csr.csr_matrix

In [17]:
from sklearn.pipeline import Pipeline
pipeline_lr = Pipeline([('lr_classifier',LogisticRegression(random_state=0))])

pipeline_dt = Pipeline([('dt_classifier',DecisionTreeClassifier())])

pipeline_rf = Pipeline([('rf_classifier',RandomForestClassifier())])

pipeline_mnb = Pipeline([('mnb_classifier',MultinomialNB())])

In [18]:
pipelines = [pipeline_lr,pipeline_dt,pipeline_rf,pipeline_mnb]
best_accuracy=0.0
best_classifier=0
best_pipeline=""
pipe_dict = {0:'Logistic Regression',1:'Decision Tree',2:'RandomForest',3:'MultiNomialNB'}

In [19]:
for pipe in pipelines:
    pipe.fit(X_train_countVect,y_train)

In [20]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(countVect.transform(X_test_cleaned),y_test)))

Logistic Regression Test Accuracy: 0.939820742637644
Decision Tree Test Accuracy: 0.830559112249253
RandomForest Test Accuracy: 0.9086641058472045
MultiNomialNB Test Accuracy: 0.9321382842509603


In [21]:
for i,model in enumerate(pipelines):
    if model.score(countVect.transform(X_test_cleaned),y_test)>best_accuracy:
        best_accuracy = model.score(countVect.transform(X_test_cleaned),y_test)
        best_classifier = i
        best_pipeline = model
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Logistic Regression


In [84]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train_countVect,y_train)

LogisticRegression(random_state=0)

In [85]:
def modelEvaluation(predictions):
    
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [86]:
# Evaluating the model on validaton set

predictions = classifier.predict(countVect.transform(X_test_cleaned))
modelEvaluation(predictions)


Accuracy on validation set: 0.9398

Classification report : 
               precision    recall  f1-score   support

           0       0.90      0.90      0.90       645
           1       0.95      0.96      0.95      1163
           2       0.97      0.95      0.96       535

    accuracy                           0.94      2343
   macro avg       0.94      0.93      0.94      2343
weighted avg       0.94      0.94      0.94      2343


Confusion Matrix : 
 [[ 579   54   12]
 [  43 1116    4]
 [  19    9  507]]


In [110]:
text = "Not Recommended Delivered defective product. IFB customer care executive are very unprofessional. Even after several requests they have not sent any technician to solve the issue.Very disappointed. Not a trust worthy brand.Please don't go for it."
clean_text = cleanText(text)
clean_text1 = clean_text.split()
clean_text1
pred = mnb.predict(countVect.transform(clean_text1))

pred


array([2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 0, 1, 2, 0, 1, 0,
       1, 2, 1, 2, 1, 0, 2, 1, 2, 2, 0, 2, 0, 1, 1, 1, 1])

In [111]:
End_prediction = np.bincount(pred)
End_prediction

array([ 6, 16, 17], dtype=int64)

In [112]:
End_prediction = np.bincount(pred)
End_prediction_dict = {0:End_prediction[0],1:End_prediction[1],2:End_prediction[2]}
if End_prediction_dict[0]+End_prediction_dict[2] > End_prediction_dict[1]: 
    print(np.delete(End_prediction,1))
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0]+End_prediction_dict[2])
    print("it may be a negative review or compliant or both")
else:
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0]+End_prediction_dict[2])
    print("It is a positive review")

[ 6 17]
Positive words: 16
Negative words: 23
it may be a negative review or compliant or both


---

In [61]:
 #Fit and transform the training data to a document-term matrix using TfidfVectorizer 


tfidf = TfidfVectorizer(min_df=5) # Taking a minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)
print("Number of features : %d \n" %len(tfidf.get_feature_names()))
print("Show some feature names : \n", tfidf.get_feature_names()[::1000])

#  Instatiating and fitting Logistic Regression object

lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

Number of features : 2459 

Show some feature names : 
 ['00', 'head', 'son']


LogisticRegression()

In [62]:
# Looking at the top 10 features with smallest and the largest coefficients

feature_names = np.array(tfidf.get_feature_names())
sorted_coef_index = lr.coef_[0].argsort()
print('\nTop 10 features with smallest coefficients :\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Top 10 features with largest coefficients : \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))


Top 10 features with smallest coefficients :
['sound' 'love' 'product' 'great' 'easy' 'earphones' 'is' 'ear' 'working'
 'earphone']

Top 10 features with largest coefficients : 
['batteries' 'slow' 'returned' 'not' 'these' 'last' 'amazon' 'dead'
 'terrible' 'remote']


In [63]:
estimators = [("tfidf", TfidfVectorizer()), ("lr", LogisticRegression())]
model = Pipeline(estimators)


# Grid search
params = {"lr__C":[0.1, 1, 10], #regularization param of logistic regression
          "tfidf__min_df": [1, 3], #min count of words 
          "tfidf__max_features": [1000, None], #max features
          "tfidf__ngram_range": [(1,1), (1,2)], #1-grams or 2-grams
          "tfidf__stop_words": [None, "english"]} #use stopwords or don't

grid = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", n_jobs=-1)
grid.fit(X_train_cleaned, y_train)
print("The best paramenter set is : \n", grid.best_params_)


# Evaluate on the validaton set
predictions = grid.predict(X_test_cleaned)
modelEvaluation(predictions)

The best paramenter set is : 
 {'lr__C': 10, 'tfidf__max_features': None, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}

Accuracy on validation set: 0.9518

Classification report : 
               precision    recall  f1-score   support

           0       0.95      0.90      0.92       645
           1       0.94      0.98      0.96      1163
           2       0.99      0.96      0.97       535

    accuracy                           0.95      2343
   macro avg       0.96      0.94      0.95      2343
weighted avg       0.95      0.95      0.95      2343


Confusion Matrix : 
 [[ 578   64    3]
 [  18 1141    4]
 [  11   13  511]]


In [107]:
text = "Not Recommended Delivered defective product. IFB customer care executive are very unprofessional. Even after several requests they have not sent any technician to solve the issue.Very disappointed. Not a trust worthy brand.Please don't go for it."
clean_text = cleanText(text)
clean_text1 = clean_text.split()
clean_text1
pred1 = grid.predict(clean_text1)

pred1

array([2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 0, 1, 2, 2, 2, 1,
       0, 1, 1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 0, 1, 2, 1, 1])

In [108]:
End_prediction = np.bincount(pred1)
End_prediction

array([ 5, 15, 19], dtype=int64)

In [109]:
End_prediction = np.bincount(pred1)
End_prediction_dict = {0:End_prediction[0],1:End_prediction[1],2:End_prediction[2]}
if End_prediction_dict[0]+End_prediction_dict[2]+4 > End_prediction_dict[1]: 
    print(np.delete(End_prediction,1))
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0]+End_prediction_dict[2])
    print("it may be a negative review or compliant or both")
else:
    print("Positive words:",End_prediction_dict[1])
    print("Negative words:",End_prediction_dict[0]+End_prediction_dict[2])
    print("It is a positive review")

[ 5 19]
Positive words: 15
Negative words: 24
it may be a negative review or compliant or both


In [64]:

df_cleaned_text= pd.Series(X_train_cleaned+X_test_cleaned)
df_cleaned_text.head()

0    the fire tablet is great we just bought our se...
1    unexpected sound quality too much treble uncom...
2    i was so excited to start streaming k with thi...
3    many features packed into a low price good as ...
4    i picked up the show on a black friday deal i ...
dtype: object

In [None]:
df_predicted= df[['rating', 'sentiment']]
df_predicted['cleaned Review']= df_cleaned_text
df_predicted= df_predicted[['cleaned Review', 'rating','sentiment']]
df_predicted.head()