In [5]:
#Importing all the needed packages to extract data, parse, filter, and develop the model
from importlib.resources import path
import pandas as pd
import random
import string
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk import classify
from nltk import NaiveBayesClassifier
import gzip
import json 

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#This function extracts the data from the json file
def parseData(path):
    """Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file that holds one JSON document per line.

    Yields:
        The object returned by ``json.loads`` for each non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened or read as gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager releases the file handle once the generator is
    # exhausted, closed, or garbage-collected; a bare gzip.open() call never
    # closed it.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            # Blank lines (for example a trailing newline) carry no record.
            if not line.strip():
                continue
            yield json.loads(line)

In [3]:
#This function creates the data frame from the extracted data
def getDF(path):
    """Collect every record yielded by parseData(path) into a DataFrame.

    Each record is keyed by its zero-based position in the stream, and those
    positions become the index of the returned frame (one row per record).

    Args:
        path: Passed straight through to parseData.

    Returns:
        A pandas DataFrame built with ``from_dict(..., orient='index')``.
    """
    records_by_position = dict(enumerate(parseData(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [6]:
#Sanity check for parseData(): gather the 'overall' rating of every review

ratings = [
    review['overall']
    for review in parseData('C:/Users/abdal/Desktop/Uni/20192023 - NLP/data/Grocery_and_Gourmet_Food_5.json.gz')
]

#Print the mean rating across all of the product reviews
print(sum(ratings) / len(ratings))

4.442208836745755


In [7]:
#Here is where the data frame is being created for the model
reviews = getDF('C:/Users/abdal/Desktop/Uni/20192023 - NLP/data/Grocery_and_Gourmet_Food_5.json.gz')

#Draw the same number of reviews for every star rating so the classes are balanced.
#GroupBy.sample() returns the sampled rows with the 'overall' column intact, whereas
#groupby().apply(...) followed by reset_index(drop=True) depends on the grouping
#column being handed to the applied function (deprecated in recent pandas releases).
#The fixed random_state makes the draw repeatable from run to run.
#As before, this raises ValueError when a rating has fewer than 10000 reviews.
sample_df = reviews.groupby('overall').sample(n=10000, random_state=42).reset_index(drop = True)

In [8]:
#Here is where the data frame is being divided into categories based on ratings
def _reviews_with_rating(rating):
    """Return (rows, texts) for the rows of sample_df whose 'overall' equals rating.

    rows is the matching slice of sample_df; texts is the list of its
    'reviewText' values with missing entries removed, so that a review
    without text does not later turn into the literal word "nan".
    """
    rows = sample_df.loc[sample_df['overall'] == rating]
    texts = rows['reviewText'].dropna().tolist()
    return rows, texts


def _join_reviews(texts):
    """Concatenate the review texts into one space-separated string."""
    # str() is kept so any remaining non-string value cannot break the join.
    return ' '.join(str(text) for text in texts)


five_df, five_list = _reviews_with_rating(5.0)
four_df, four_list = _reviews_with_rating(4.0)
three_df, three_list = _reviews_with_rating(3.0)
two_df, two_list = _reviews_with_rating(2.0)
one_df, one_list = _reviews_with_rating(1.0)

#Here is where each list of reviews is joined into a single string
five_list_to_string = _join_reviews(five_list)
four_list_to_string = _join_reviews(four_list)
three_list_to_string = _join_reviews(three_list)
two_list_to_string = _join_reviews(two_list)
one_list_to_string = _join_reviews(one_list)

#Here is where the strings are changed to lower case to avoid capitalization bias
five_list_lowered = five_list_to_string.lower()
four_list_lowered = four_list_to_string.lower()
three_list_lowered = three_list_to_string.lower()
two_list_lowered = two_list_to_string.lower()
one_list_lowered = one_list_to_string.lower()

#Tokens to discard: English stop words plus every single punctuation character
stop = set(stopwords.words('english') + list(string.punctuation))


In [9]:
#Here is where the tokenizer is being created
tokenizer = WhitespaceTokenizer()

#Here is where the lower-cased text is split on whitespace and stop words are dropped
filtered_five_list = [w for w in tokenizer.tokenize(five_list_lowered) if w not in stop]
filtered_four_list = [w for w in tokenizer.tokenize(four_list_lowered) if w not in stop]
filtered_three_list = [w for w in tokenizer.tokenize(three_list_lowered) if w not in stop]
filtered_two_list = [w for w in tokenizer.tokenize(two_list_lowered) if w not in stop]
filtered_one_list = [w for w in tokenizer.tokenize(one_list_lowered) if w not in stop]


def _strip_punctuation(tokens):
    """Strip leading/trailing punctuation from each token and re-filter.

    Stripping can expose a stop word ("the," becomes "the") or leave nothing
    at all ("..." becomes ""), so tokens that end up empty or in `stop` are
    dropped instead of being passed on as features.
    """
    stripped = (token.strip(string.punctuation) for token in tokens)
    return [token for token in stripped if token and token not in stop]


#Here is where the tokens are being stripped of punctuation
filtered_five_list2 = _strip_punctuation(filtered_five_list)
filtered_four_list2 = _strip_punctuation(filtered_four_list)
filtered_three_list2 = _strip_punctuation(filtered_three_list)
filtered_two_list2 = _strip_punctuation(filtered_two_list)
filtered_one_list2 = _strip_punctuation(filtered_one_list)


In [10]:
#Count how often each token occurs within each star rating
fd_five, fd_four, fd_three, fd_two, fd_one = (
    nltk.FreqDist(tokens)
    for tokens in (
        filtered_five_list2,
        filtered_four_list2,
        filtered_three_list2,
        filtered_two_list2,
        filtered_one_list2,
    )
)

In [11]:
#Here is a function that turns a string into a feature dict mapping each whitespace-separated word to True
def word_features(words):
    """Build a feature dict that marks every word of `words` as present.

    Args:
        words: A string; it is split on whitespace.

    Returns:
        A dict mapping each distinct token to True, in first-seen order.
        An empty or whitespace-only string gives an empty dict.
    """
    return dict.fromkeys(words.split(), True)

In [12]:
#Pair the feature dict of every token with the star label of the list it came from
def _label_tokens(tokens, label):
    """Return [(word_features(token), label), ...] for each entry of tokens."""
    return [(word_features(token), label) for token in tokens]

#NOTE(review): the filtered_*_list2 inputs are flat token lists, so every
#training example describes a single token rather than a whole review - confirm
#that this is the intended granularity.
five_features = _label_tokens(filtered_five_list2, '5 Stars')
four_features = _label_tokens(filtered_four_list2, '4 Stars')
three_features = _label_tokens(filtered_three_list2, '3 Stars')
two_features = _label_tokens(filtered_two_list2, '2 Stars')
one_features = _label_tokens(filtered_one_list2, '1 Stars')

#One combined list of labelled examples, used both to train and to validate the models
labeledwords = [
    *five_features,
    *four_features,
    *three_features,
    *two_features,
    *one_features,
]

In [13]:
#Here is where the data is getting shuffled to avoid bias: the list is built
#rating by rating, so without shuffling the split below would not mix the labels.
#A dedicated, seeded Random instance gives the same order for the same input,
#which keeps the train/test split and the reported accuracies repeatable.
random.Random(42).shuffle(labeledwords)

In [14]:
#Split the labelled examples 80:20 into training and test data
#count is the index of the first example that belongs to the test set
count = round(0.8 * len(labeledwords))
train_set = labeledwords[:count]
test_set = labeledwords[count:]

In [15]:
#Train the Naive Bayes model on the training examples
naiveBayesClassifier = NaiveBayesClassifier.train(train_set)

In [17]:
#Train the Decision Tree model on the training examples
#NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the run
#was stopped by hand - presumably because training took too long; confirm.
decisionTreeClassifier = classify.DecisionTreeClassifier.train(train_set)

KeyboardInterrupt: 

In [20]:
#Train the Maxent model on the training examples
#The training log printed by this call reports a 100-iteration run
maxentClassifier = classify.MaxentClassifier.train(train_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.60944        0.151
             2          -1.43133        0.321
             3          -1.41052        0.321
             4          -1.39921        0.321
             5          -1.39205        0.321
             6          -1.38710        0.321
             7          -1.38346        0.321
             8          -1.38067        0.321
             9          -1.37847        0.321
            10          -1.37668        0.321
            11          -1.37519        0.321
            12          -1.37394        0.321
            13          -1.37288        0.321
            14          -1.37196        0.321
            15          -1.37115        0.321
            16          -1.37044        0.321
            17          -1.36981        0.321
            18          -1.36925        0.321
            19          -1.36875        0.321
 

In [21]:
#Report the accuracy of every model on the held-out test set, as a percentage
#The model with the highest accuracy will be used
def _accuracy_percent(model):
    """Return the accuracy of `model` on test_set, scaled to 0-100."""
    return nltk.classify.accuracy(model, test_set)*100

print(_accuracy_percent(naiveBayesClassifier))
print(_accuracy_percent(decisionTreeClassifier))
print(_accuracy_percent(maxentClassifier))

27.895344672871623
27.61527658735428


In [16]:
#Importing the tkinter package that is being used to create the GUI
from tkinter import *
from tkinter.font import BOLD

#Here is where the tkinter main window is created
root = Tk()
root.title("Satisfaction Predictor For Product Reviews")
root.geometry("1000x600")
root.resizable(width=FALSE,height=FALSE)
root["bg"]="#202020"


def _make_label(text, font_size):
    """Create a white-on-dark Label in root showing text at the given font size."""
    return Label(root, text=text, width="12", height=5, bd=0, background="#202020", foreground='white', font=("Catamaran", font_size))


#The welcome message is created here using labels
label1 = _make_label("Welcome to Stari", 50)
label1.place(x=175,y=5, height=100, width=650)

label2 = _make_label("A Satisfaction Predictor For Product Reviews!", 16)
label2.place(x=200,y=90, height=50, width=600)

#The instruction label has its own name so its handle is not overwritten by
#the result label below (both were previously assigned to label8).
label3 = _make_label("Just write down a review and stari will predict your satisfaction level", 12)
label3.place(x=200,y=140, height=40, width=600)

#label8 starts empty; printReview() writes the prediction into it
label8 = _make_label("", 26)
label8.place(x=200,y=425, height=40, width=600)

#Here is where the input box is created
#bg/bd are short aliases of background/borderwidth; the call used to pass both
#forms with conflicting values. Only the dark background and zero border are
#kept, which is the combination that leaves the white text readable.
chatWindow1 = Text(root, width="50", height="8", font=("Catamaran", 16), foreground="white", padx=10, pady=10, wrap=WORD, background="#0f0f0f", borderwidth=0)
chatWindow1.place(x=200,y=200, height=200, width=600)

#This method is used to extract the input from the input box and run it through the model
#After the model predicts the satisfaction level of the user, it prints it onto a label
def printReview(userInput):
    """Classify the text of a review and show the satisfaction level on label8.

    The input is normalised the same way as the training tokens (lower-cased,
    stop words removed, surrounding punctuation stripped) before it is turned
    into features; otherwise capitalised words or words followed by
    punctuation would not match anything the classifier was trained on.

    Args:
        userInput: The raw review text typed by the user.
    """
    tokens = []
    for raw_token in userInput.lower().split():
        if raw_token in stop:
            continue
        token = raw_token.strip(string.punctuation)
        # Stripping can leave an empty string or expose a stop word.
        if token and token not in stop:
            tokens.append(token)

    prediction = naiveBayesClassifier.classify(word_features(' '.join(tokens)))

    # Star label produced by the classifier -> wording shown to the user.
    satisfaction_by_label = {
        "1 Stars": "Extremely Unsatisfied",
        "2 Stars": "Unsatisfied",
        "3 Stars": "Neutral",
        "4 Stars": "Satisfied",
        "5 Stars": "Extremely Satisfied",
    }
    # An unrecognised label is displayed unchanged, as it was before.
    label8.config(text = satisfaction_by_label.get(prediction, prediction))
    

#Clicking the button reads the whole input box and hands its text to printReview()
def _on_proceed():
    """Button callback: classify whatever is currently typed in chatWindow1."""
    printReview(chatWindow1.get("1.0", END))

button1 = Button(
    root,
    text="Proceed",
    width="12",
    height=5,
    bd=0,
    bg="#2d2d2d",
    activebackground="white",
    foreground='#ffffff',
    font=("Catamaran", 20),
    command=_on_proceed,
)
button1.place(x=250,y=525, height=50, width=500)

#Hand control to tkinter's event loop
root.mainloop()