In [27]:
# import the necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [28]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
df = pd.read_csv('IMDB_Dataset.csv')

In [30]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [31]:
# We replace all the characters and html tags in the dataset that could cause a parse error, pre-processing the data
df.replace('"', '', inplace=True, regex=True)
#<br /> will be replaced with a space because most if not all of the <br /> come before a sentence and 
# it confuses the machine when is the start of a sentence
df.replace('<br />', ' ', inplace=True, regex=True)
df.replace('<p>', '', inplace=True, regex=True)
df.replace('<i>', '', inplace=True, regex=True)
df.replace('</i>', '', inplace=True, regex=True)
df.replace('<em>', '', inplace=True, regex=True)
df.replace('</em>', '', inplace=True, regex=True)
df.replace("'", "", inplace=True, regex=True)

In [32]:
# Checking how many positive and negative sentiments we have
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [33]:
# grouping the sentiments to visualize which reviews are postive and which are negative
sentiment_groupby_df = df.groupby('sentiment')
sentiment_groupby_df.apply(display)

Unnamed: 0,review,sentiment
3,Basically theres a family where a little boy (...,negative
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
10,Phil the Alien is one of those quirky films wh...,negative
11,I saw this movie when I was about 12 when it c...,negative
...,...,...
49994,This is your typical junk comedy. There are a...,negative
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,Im going to have to disagree with the previous...,negative


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming t...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,Petter Matteis Love in the Time of Money is a ...,positive
5,"Probably my all-time favorite movie, a story o...",positive
...,...,...
49983,"I loved it, having been a fan of the original ...",positive
49985,Imaginary Heroes is clearly the best film of t...,positive
49989,I got this one a few weeks ago and love it! It...,positive
49992,John Garfield plays a Marine who is blinded by...,positive


In [34]:
# Now we divide the data frame into categories based on sentiment positive and negative
positive_df = df[(df['sentiment'] == 'positive')]
positive_reviews_list = positive_df['review'].tolist()

negative_df = df[(df['sentiment'] == 'negative')]
negative_reviews_list = negative_df['review'].tolist()


In [35]:
# We convert the list into one big string with a space between each element
# And we lowercase all letters to ensure tokenization considers all upper and lower case, same words as the same token
positive_string = ' '.join([str(i).lower() for i in positive_reviews_list])
negative_string = ' '.join([str(i).lower() for i in negative_reviews_list])

In [36]:
stop = set(stopwords.words('english') + list(string.punctuation))

In [37]:
# We use Tweet Tokenizer because reviews are "casual" in nature
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(reduce_len=True)

In [38]:
# Tokenize the words in both lists and remove stop words
filtered_positive_list = [word for word in tknzr.tokenize(positive_string) if word not in stop]
filtered_negative_list = [word for word in tknzr.tokenize(negative_string) if word not in stop]

In [39]:
# Remove punctuations now to make sure punctuations don't cause a bias
filtered_positive_list_strip = [word.strip(string.punctuation) for word in filtered_positive_list]
filtered_negative_list_strip = [word.strip(string.punctuation) for word in filtered_negative_list]

In [40]:
# Frequency distribution is to analysis the distribution of each word in both positive and negative lists
freq_dist_positive = nltk.FreqDist(filtered_positive_list)
freq_dist_negative = nltk.FreqDist(filtered_negative_list)

temp_pd = pd.DataFrame(list(freq_dist_positive. items()), columns = ["Word","Frequency"]).sort_values(by="Frequency", ascending=False)
temp_pd.shape

(101527, 2)

In [41]:
# This function returns the features of the words in the string arrays
# .split() to remove the white spaces if any
def word_features(words):
     return dict([(word, True) for word in words.split()])

In [42]:
# Make dictionary to save the features of the review to the corresponding sentiment
pos_features = [(word_features(words), 'positive') for words in filtered_positive_list_strip]
neg_features = [(word_features(words), 'negative') for words in filtered_negative_list_strip]

In [43]:
# This new list of data will be used for training and testing the model
labeled_words = pos_features + neg_features

In [44]:
# shuffling data to eliminate bias
import random
random.shuffle(labeled_words)

In [45]:
# checkpoint is where we split our data into training and testing (90:10)
checkpoint = round(len(labeled_words) * 0.8)

train_set, test_set = labeled_words[:checkpoint], labeled_words[checkpoint:]

In [46]:
# Training our Naive Bayes Classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)


In [47]:
# This is our Naive Bayes Classifier's accuracy
accuracy = nltk.classify.accuracy(classifier, test_set)*100
accuracy

58.89563674044421

In [52]:
from openai import OpenAI
import os

In [53]:
client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

In [51]:
# Importing tkinter to create the GUI
from tkinter import *
from tkinter.font import BOLD

# Create GUI Main Window
root = Tk()
root.title("Movie Reviews Analysis")
root.geometry("1000x600")
root.resizable(width=FALSE,height=FALSE)
root["bg"]="#e74343"

# Welcome header
label1 = Label(root, text="Movie Satisfaction Analysis",  width="12", height=5,bd=0, background="#e74343",foreground='black',font=("Times New Roman", 30))
label1.place(x=155,y=70, height=100, width=700)

label3 = Label(root, text="~~~~ Write down a movie review and get its satisfaction level prediction ~~~~", background="#e74343", width="12", height=5,bd=0,foreground='black',font=("Times New Roman", 14))
label3.place(x=200,y=180, height=40, width=600)

# Where our results will be displayed
label4 = Label(root, text="Output will be displayed here...", background="gray", width="12", height=5,bd=0,foreground='black',font=("Catamaran", 16))
label4.place(x=200,y=405, height=50, width=600)

# Input textbox is created
chatWindow = Text(root, bd=1, bg="white",  width="50", height="8", font=("Times New Roman", 16), foreground="white", padx=10, pady=10,wrap=WORD,background="black",borderwidth=10)
chatWindow.place(x=200,y=250, height=150, width=600)

#This method is used to extract the input from the input box and run it through the model
#After the model predicts the satisfaction level of the user, it prints it onto a label
def display_review(user_review):
    prediction = classifier.classify(word_features(user_review))
    if(prediction == "positive"):
        prediction = "Sentiment: User is happy   Rating: "
    elif(prediction == "negative"):
        prediction = "Sentiment: User is dissatisfied   Rating: "
    
    # Integrate openAI to get the Rating 
    #              ***(UNCOMMENT THE LINES BELOW ONCE YOU INPUT YOUR OWN API key IN .env) ***
    # completion = client.chat.completions.create(
    #     model="gpt-3.5-turbo",
    #     messages=[
    #     {"role": "user", "content": f"You are a data model that is given a review for sentiment analysis. Do analysis on the following text and only reply with a number from 1 to 5, 1 being negative and 5 being positive: {user_review}\nAnswer:"}
    #     ]
    # )

    # urgency = completion.choices[0].message.content
    # prediction += urgency

    label4.config(text = prediction)

# Button is created. Once button is pressed, it will call the display_review() function
review_button = Button(root, text="Review",  width="12", height=5,bd=0, bg="#4285F4", activebackground="white",foreground='white',font=("Times New Roman", 18), command=lambda: display_review(chatWindow.get("1.0", END)))
review_button.place(x=400,y=510, height=50, width=200)

root.mainloop()