In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import pickle
from multiprocessing import Pool
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [2]:
#NLTK Packages
import nltk
from nltk import word_tokenize
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords  

In [3]:
with open('picklefile/data_preprocessing.pickle', 'rb') as data:
    data_set = pickle.load(data)

In [4]:
data_set

Unnamed: 0,content_new,content_nostop,rating,sentiment
0,this kindle is light and easy to use especiall...,"[kindle, light, easy, use, especially, beach]",5.0,1.0
1,didnt know how much i d use a kindle so went f...,"[didnt, know, much, use, kindle, went, lower, ...",4.0,1.0
2,i am 100 happy with my purchase i caught it o...,"[100, happy, purchase, caught, sale, really, g...",5.0,1.0
3,solid entry level kindle great for kids gift...,"[solid, entry, level, kindle, great, kids, gif...",5.0,1.0
4,this make an excellent ebook reader don t exp...,"[make, excellent, ebook, reader, expect, much,...",5.0,1.0
...,...,...,...,...
10623,don t buy it it s horrible not connecting with...,"[buy, horrible, connecting, phone, connected, ...",1.0,2.0
10624,very bad,[bad],2.0,2.0
10625,very bad sound call quality very bad no bass,"[bad, sound, call, quality, bad, bass]",1.0,2.0
10626,sound,[sound],1.0,2.0


In [5]:
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10628 entries, 0 to 10627
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   content_new     10628 non-null  object 
 1   content_nostop  10628 non-null  object 
 2   rating          10628 non-null  float64
 3   sentiment       10628 non-null  float64
dtypes: float64(2), object(2)
memory usage: 332.2+ KB


In [6]:
noise_words=[]

In [7]:
### Creating a python object of the class CountVectorizer

CV = CountVectorizer(tokenizer= word_tokenize, # type of tokenization
                             stop_words=noise_words, # List of stopwords
                             ngram_range=(1,1)) # number of n-grams

transformed_data = CV.fit_transform(data_set['content_new'])

In [8]:
transformed_data

<10628x9598 sparse matrix of type '<class 'numpy.int64'>'
	with 270565 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report,confusion_matrix


In [45]:
X = transformed_data.toarray()
y = data_set['sentiment'].values

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

In [47]:
y_test

array([1., 2., 1., ..., 1., 1., 2.])

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
pipeline_lr = Pipeline([('lr_classifier',LogisticRegression(random_state=0))])

pipeline_dt = Pipeline([('dt_classifier',DecisionTreeClassifier())])

pipeline_rf = Pipeline([('rf_classifier',RandomForestClassifier())])

In [49]:
pipelines = [pipeline_lr,pipeline_dt,pipeline_rf]

In [50]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [51]:
pipe_dict = {0:'Logistic Regression',1:'Decision Tree',2:'RandomForest'}

In [52]:
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [53]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.9369708372530574
Decision Tree Test Accuracy: 0.8372530573847601
RandomForest Test Accuracy: 0.9167450611476952


In [54]:
for i,model in enumerate(pipelines):
    if model.score(X_test,y_test)>best_accuracy:
        best_accuracy = model.score(X_test,y_test)
        best_classifier = i
        best_pipeline = model
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Logistic Regression


In [55]:
classifier = LogisticRegression()
classifier.fit(X_train,y_train)

LogisticRegression()

In [56]:
y_pred = classifier.predict(X_test)

In [57]:
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1. 1.]
 [2. 2.]
 [1. 1.]
 ...
 [1. 1.]
 [1. 1.]
 [2. 2.]]


In [58]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.87       502
         1.0       0.94      0.96      0.95       920
         2.0       0.95      0.97      0.96       704

    accuracy                           0.94      2126
   macro avg       0.93      0.93      0.93      2126
weighted avg       0.94      0.94      0.94      2126



In [59]:
confusion_matrix(y_test,y_pred)

array([[425,  48,  29],
       [ 28, 884,   8],
       [ 17,   4, 683]], dtype=int64)

In [60]:
accuracy_score(y_test,y_pred)

0.9369708372530574

##  Sample review prediction

In [83]:
string_in = "It's beyond my expectation, and it can even show music score. Not fast turning though."

X_clean = re.sub('[^A-Za-z0-9]+',' ',string_in)

In [84]:
X_clean = word_tokenize(X_clean)

In [85]:
X_clean

['It',
 's',
 'beyond',
 'my',
 'expectation',
 'and',
 'it',
 'can',
 'even',
 'show',
 'music',
 'score',
 'Not',
 'fast',
 'turning',
 'though']

In [86]:
stop_words = set(stopwords.words('english'))
X_filtered = [w for w in X_clean if not w in stop_words]

In [87]:
#X_filtered=' '.join(map(str,X_filtered))

In [90]:
X_filtered

['It',
 'beyond',
 'expectation',
 'even',
 'show',
 'music',
 'score',
 'Not',
 'fast',
 'turning',
 'though']

In [93]:
cv=CountVectorizer()
X_cv = cv.fit_transform(X_filtered)

In [92]:
#X_pred = X_cv.to_array()

AttributeError: to_array not found

In [94]:
pred = classifier.predict(X_cv) 

ValueError: X has 11 features per sample; expecting 9598

In [74]:
print(CV.get_feature_names())

