# Group 7 Project





In [None]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , LSTM , Embedding
from keras.models import Sequential
from keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings('ignore')

import pickle

In [None]:
# Open a connection to the SQLite file that holds the raw Reviews table.
# NOTE(review): this is an absolute path (looks like a mounted Colab Drive) —
# adjust it when running the notebook anywhere else.
db_path = '/content/drive/MyDrive/ai/database (1).sqlite'
con = sqlite3.connect(db_path)

In [None]:
# Load at most 5000 reviews, leaving out the rows whose Score is exactly 3.
reviews_query = """ SELECT * FROM Reviews WHERE Score != 3 LIMIT 5000"""
filtered_data = pd.read_sql_query(reviews_query, con)

In [None]:
def partition(x):
    """Map a numeric review score to a sentiment label.

    Returns 'negative' when ``x < 3`` and 'positive' for every other value.
    The loading query already excludes rows with Score == 3, so in this
    notebook 'positive' corresponds to scores above 3.
    """
    return 'negative' if x < 3 else 'positive'

In [None]:
# Replace the numeric Score column with its sentiment label: scores below 3
# become 'negative', everything else becomes 'positive' (see partition).
# The guard makes the cell safe to re-run: once the column already holds the
# labels, mapping it through partition again would compare a str with 3 and
# raise TypeError.
already_labelled = filtered_data['Score'].isin(['negative', 'positive']).all()
if not already_labelled:
    actualScore = filtered_data['Score']
    positiveNegative = actualScore.map(partition)
    filtered_data['Score'] = positiveNegative
print("Number of data points in our data", filtered_data.shape)
filtered_data.head(10)

Number of data points in our data (5000, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,positive,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,positive,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,positive,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,positive,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,positive,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


# Data Deduplication

In [None]:
# Order the reviews by ProductId (ascending, missing ids last) ahead of
# de-duplication; every option spelled out originally was the pandas default.
sorted_data = filtered_data.sort_values(by='ProductId')

In [None]:
# Rows that share user, profile name, timestamp and text are treated as the
# same review; only the first occurrence (in ProductId order) is kept.
dedup_keys = ["UserId", "ProfileName", "Time", "Text"]
final = sorted_data.drop_duplicates(subset=dedup_keys, keep='first')
final.shape

(4986, 10)

In [None]:
# Share of the originally loaded rows that survived de-duplication, in percent.
rows_after = final['Id'].size
rows_before = filtered_data['Id'].size
rows_after / rows_before * 100

99.72

# Text Preprocessing

In [None]:
# The top-of-notebook imports only pull in members of nltk.corpus / nltk.stem;
# the package itself must be imported for the nltk.* calls below to resolve.
import nltk

# Make sure the stop-word corpus is available locally before reading it.
nltk.download('stopwords')
# English stop words, held in a set for fast membership tests while cleaning.
stop=set(stopwords.words('english'))
# Snowball stemmer used to reduce every kept token to its stem.
sno=nltk.SnowballStemmer('english')
def cleanhtml(s):
    """Strip HTML tags and a few stray symbols from a review text.

    Each tag such as ``<br />`` or ``</p>`` is replaced by a single space so
    the words on either side of it stay separate tokens. The characters
    ``| > . * ?`` are then deleted, which is what the previous pattern did
    (it was one character class and never matched a whole tag).

    Parameters
    ----------
    s : str
        Raw review text.

    Returns
    -------
    str
        The text without tags and without the characters listed above.
    """
    # A tag is '<', an optional '/', a letter, then anything up to the next '>'.
    # Requiring a letter keeps comparisons like "1 < 2" untouched.
    without_tags = re.sub(r'</?[A-Za-z][^>]*>', ' ', s)
    return re.sub(r'[|>.*?]', '', without_tags)
def cleanpunc(s):
    """Delete punctuation and special characters from a string.

    Removes ``. | , ! ) ( / ” ’ # @ $ - %``. The hyphen sits last in the
    character class so it is a literal; in the previous pattern ``|-|`` was
    parsed as a range and hyphens were never removed.

    Parameters
    ----------
    s : str
        Text (typically a single whitespace-delimited word).

    Returns
    -------
    str
        ``s`` with every listed character deleted (nothing is inserted).
    """
    # NOTE(review): only the curly quotes ” and ’ are listed, not the ASCII
    # quotes " and ' — confirm whether those were meant to be stripped too.
    a=re.sub(r'[.|,!)(/”’#@$%-]',"",s)
    return a

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Build the cleaned corpus. For every review: strip markup, split into words,
# strip punctuation, keep purely alphabetic tokens longer than two characters
# that are not stop words, lower-case and stem them.
final_string=[]   # one space-joined bytes string of stems per review
pos_words=[]      # every kept stem coming from a 'positive' review
neg_words=[]      # every kept stem coming from a 'negative' review

# Read the label array once instead of rebuilding it for every token.
review_scores = final['Score'].values

for text, sentiment in zip(final['Text'].values, review_scores):
    stems = []
    for word in cleanhtml(text).split():
        for token in cleanpunc(word).split():
            lowered = token.lower()
            if token.isalpha() and len(token) > 2 and lowered not in stop:
                # Stems are stored as UTF-8 bytes, which is why CleanedText
                # is displayed with a b'...' prefix.
                stem = sno.stem(lowered).encode('utf8')
                stems.append(stem)
                if sentiment == 'positive':
                    pos_words.append(stem)
                if sentiment == 'negative':
                    neg_words.append(stem)
    final_string.append(b" ".join(stems))

In [None]:
#Adding cleaned text in final dataframe
final['CleanedText']=final_string

# Persist the cleaned frame as table 'Reviews' in final.sqlite, replacing any
# table left by an earlier run. The connection is closed even if the write
# fails, so repeated runs do not leave open handles on the file.
conn=sqlite3.connect('final.sqlite')
try:
    conn.text_factory=str
    final.to_sql('Reviews',conn,schema=None,if_exists='replace')
finally:
    conn.close()

In [None]:
# Preview the first three rows to check the newly added CleanedText column.
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,positive,1282953600,thirty bucks?,Why is this $[...] when the same product is av...,b'product avail victor trap unreal cours total...
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,positive,1281052800,Flies Begone,We have used the Victor fly bait for 3 seasons...,b'use victor fli bait season beat great product'
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,positive,962236800,WOW Make your own 'slickers' !,I just received my shipment and could hardly w...,b'receiv shipment could hard wait tri product ...


# Train-Test Split

In [None]:
# No install step here: scikit-learn is already imported by the first cell,
# and the PyPI name `sklearn` is only a deprecated alias of `scikit-learn`.
from sklearn.model_selection import train_test_split

# Features are the cleaned review texts; the target is the sentiment label.
X = final['CleanedText']
y = final['Score']

# Hold out 20% of the reviews for testing. stratify=y keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1, stratify=y)



In [None]:
# Number of reviews that ended up in the training and test splits.
X_train.shape, X_test.shape

((3988,), (998,))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training targets, then apply
# that same mapping to both splits.
labelEncoder = LabelEncoder().fit(y_train)
y_train = labelEncoder.transform(y_train)
y_test = labelEncoder.transform(y_test)

# Position in this list is the integer code of the class name stored there.
labels = labelEncoder.classes_.tolist()
print(labels) # index-> class

['negative', 'positive']


# Bag Of Words

In [None]:
# Bag-of-words counts with the vocabulary capped at 10000 terms; the
# vocabulary is learned from the training split only.
bow_vectorizer = CountVectorizer(max_features=10000).fit(X_train)

# Vectorise both splits with that fitted vocabulary.
bow_X_train, bow_X_test = (
    bow_vectorizer.transform(split) for split in (X_train, X_test)
)

# Logistic Regression with BoW

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
def train_and_eval(model, trainX, trainY, testX, testY):
    """Fit ``model`` and print its accuracy on the train and test data.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    trainX, trainY
        Training features and their labels.
    testX, testY
        Held-out features and their labels.

    Returns
    -------
    None
        The model's repr and both accuracy scores are printed.
    """
    # training
    model.fit(trainX, trainY)

    # predictions
    y_preds_train = model.predict(trainX)
    y_preds_test = model.predict(testX)

    # evaluation — score against the labels that were passed in rather than
    # the notebook-level y_train / y_test, so any split can be evaluated.
    print()
    print(model)
    print(f"Train accuracy score : {accuracy_score(trainY, y_preds_train)}")
    print(f"Test accuracy score : {accuracy_score(testY, y_preds_test)}")
    print('\n',40*'-')

In [None]:
# Values of LogisticRegression's C parameter to compare on the BoW features.
C = [0.001, 0.01, 0.1, 1, 10]

for c_value in C:
    # A fresh model per setting; log_model keeps the last one after the loop.
    log_model = LogisticRegression(C=c_value, max_iter=500, random_state=1)
    train_and_eval(log_model, bow_X_train, y_train, bow_X_test, y_test)


LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Train accuracy score : 0.8385155466399198
Test accuracy score : 0.8376753507014028

 ----------------------------------------

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Train accuracy score : 0.8600802407221665
Test accuracy score : 0.8517034068136272

 ----------------------------------------

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   

# Tf-Idf

In [None]:
# TF-IDF weights with the vocabulary capped at 10000 terms; vocabulary and
# IDF statistics are learned from the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=10000).fit(X_train)

# Vectorise both splits with the fitted vectoriser.
tfidf_X_train, tfidf_X_test = (
    tfidf_vectorizer.transform(split) for split in (X_train, X_test)
)

# Logistic Regression with Tf-Idf

In [None]:
# Values of LogisticRegression's C parameter to compare on the TF-IDF features.
C = [0.001, 0.01, 0.1, 1, 10]

for c_value in C:
    # A fresh model per setting; log_model keeps the last one after the loop.
    log_model = LogisticRegression(C=c_value, max_iter=500, random_state=1)
    train_and_eval(log_model, tfidf_X_train, y_train, tfidf_X_test, y_test)


LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Train accuracy score : 0.8380140421263791
Test accuracy score : 0.8376753507014028

 ----------------------------------------

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Train accuracy score : 0.8380140421263791
Test accuracy score : 0.8376753507014028

 ----------------------------------------

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   

In [None]:
def plot_cm(y_true, y_pred):
    """Show a confusion matrix as an annotated heatmap.

    The matrix is computed with ``normalize='true'`` and annotated with two
    decimals. Tick labels come from the notebook-level ``labels`` list.

    Parameters
    ----------
    y_true
        Ground-truth class labels.
    y_pred
        Predicted class labels.

    Returns
    -------
    None
        Whatever ``plt.show()`` returns; the figure is displayed as a side effect.
    """
    plt.figure(figsize=(6,6))

    cm = confusion_matrix(y_true, y_pred, normalize='true')

    ax = sns.heatmap(
        cm, annot=True, cmap='Blues', cbar=False, fmt='.2f',
        xticklabels=labels, yticklabels=labels)

    # confusion_matrix puts the true classes on rows and the predictions on
    # columns; label the axes so the figure can be read on its own.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title('Confusion matrix (normalised per true class)')

    return plt.show()

In [None]:
# Refit logistic regression with C=1 on the TF-IDF training features; this is
# the model used for the predictions and the pickling further down.
bmodel = LogisticRegression(C=1, max_iter=500, random_state=1).fit(tfidf_X_train, y_train)
bmodel

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predicted labels of the refitted model for the training and test features.
y_preds_train, y_preds_test = (
    bmodel.predict(features) for features in (tfidf_X_train, tfidf_X_test)
)

In [None]:
# Accuracy of the refitted model on each split.
train_accuracy = accuracy_score(y_train, y_preds_train)
test_accuracy = accuracy_score(y_test, y_preds_test)
print(f"Train accuracy score : {train_accuracy}")
print(f"Test accuracy score : {test_accuracy}")

Train accuracy score : 0.8884152457372116
Test accuracy score : 0.8787575150300602


# Naive Bayes Classifier with BoW

In [None]:
# Values of MultinomialNB's alpha parameter to compare on the BoW features.
# NOTE(review): alpha=0 means no smoothing at all — confirm it is intended.
alphas = [0, 0.2, 0.6, 0.8, 1]

for alpha_value in alphas:
    # A fresh model per setting; nb_model keeps the last one after the loop.
    nb_model = MultinomialNB(alpha=alpha_value)
    train_and_eval(nb_model, bow_X_train, y_train, bow_X_test, y_test)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)
Train accuracy score : 0.9771815446339017
Test accuracy score : 0.8657314629258517

 ----------------------------------------

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)
Train accuracy score : 0.9646439317953862
Test accuracy score : 0.8877755511022044

 ----------------------------------------

MultinomialNB(alpha=0.6, class_prior=None, fit_prior=True)
Train accuracy score : 0.9523570712136409
Test accuracy score : 0.8817635270541082

 ----------------------------------------

MultinomialNB(alpha=0.8, class_prior=None, fit_prior=True)
Train accuracy score : 0.9485957873620863
Test accuracy score : 0.875751503006012

 ----------------------------------------

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)
Train accuracy score : 0.9418254764292878
Test accuracy score : 0.8767535070140281

 ----------------------------------------


# Naive Bayes classifier with Tf-Idf

In [None]:
# Values of MultinomialNB's alpha parameter to compare on the TF-IDF features.
# NOTE(review): alpha=0 means no smoothing at all — confirm it is intended.
alphas = [0, 0.2, 0.6, 0.8, 1]

for alpha_value in alphas:
    # A fresh model per setting; nb_model keeps the last one after the loop.
    nb_model = MultinomialNB(alpha=alpha_value)
    train_and_eval(nb_model, tfidf_X_train, y_train, tfidf_X_test, y_test)


MultinomialNB(alpha=0, class_prior=None, fit_prior=True)
Train accuracy score : 0.966900702106319
Test accuracy score : 0.8587174348697395

 ----------------------------------------

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)
Train accuracy score : 0.9069709127382146
Test accuracy score : 0.8557114228456913

 ----------------------------------------

MultinomialNB(alpha=0.6, class_prior=None, fit_prior=True)
Train accuracy score : 0.8505516549648947
Test accuracy score : 0.8386773547094188

 ----------------------------------------

MultinomialNB(alpha=0.8, class_prior=None, fit_prior=True)
Train accuracy score : 0.843530591775326
Test accuracy score : 0.8376753507014028

 ----------------------------------------

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)
Train accuracy score : 0.8405215646940822
Test accuracy score : 0.8376753507014028

 ----------------------------------------


# Deployment

In [None]:
# Serialise the fitted TF-IDF vectoriser and the refitted classifier so they
# can be loaded outside this notebook.
for filename, artefact in (("transformer.pkl", tfidf_vectorizer), ("model.pkl", bmodel)):
    with open(filename, "wb") as f:
        pickle.dump(artefact, f)

In [None]:
from sklearn import preprocessing
def get_sentiment(review):
    """Predict the sentiment label of a single raw review.

    The text goes through the same cleaning as the training corpus (markup
    and punctuation removal, alphabetic tokens longer than two characters,
    stop-word filtering, lower-casing, stemming) before it is vectorised, so
    the model sees tokens of the kind it was trained on.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The entry of the notebook-level ``labels`` list selected by the
        model's prediction.
    """
    # preprocessing — mirror the token filter used to build CleanedText
    stems = []
    for word in cleanhtml(review).split():
        for token in cleanpunc(word).split():
            lowered = token.lower()
            if token.isalpha() and len(token) > 2 and lowered not in stop:
                stems.append(sno.stem(lowered))
    #vectorization — a one-element list already yields a single-row matrix
    x = tfidf_vectorizer.transform([" ".join(stems)])
    #prediction — take the single element explicitly; calling int() on the
    # whole 1-element array is deprecated in recent NumPy
    y = int(bmodel.predict(x)[0])
    return labels[y]

# Predictions

In [None]:
# Example that is expected to be classified as positive.
review = "This product is great, I love it"
predicted_sentiment = get_sentiment(review)
print(f"This is a {predicted_sentiment} review!")

This is a positive review!


In [None]:
# Example that is expected to be classified as negative.
review = "This product is bad, its not worth it"
predicted_sentiment = get_sentiment(review)
print(f"This is a {predicted_sentiment} review!")

This is a negative review!
