In [1]:
# Import required libraries
import pandas as pd

In [2]:
# Load the tab-separated review file into a two-column DataFrame.
# Column names are supplied explicitly, so the first line of the file is read as data.
# NOTE(review): the path is an absolute, machine-specific location; anyone else
# running this notebook has to point it at their own copy of the file.
# NOTE(review): with pandas' default quoting, an unbalanced double quote inside a
# review can merge several lines into one row - confirm the row count against the
# dataset's documentation (passing quoting=csv.QUOTE_NONE disables quote handling).
REVIEWS_PATH = 'C:/Users/vicky/Downloads/imdb_labelled.txt'
REVIEW_COLUMNS = ['comment', 'response']
reviews = pd.read_csv(REVIEWS_PATH, sep='\t', names=REVIEW_COLUMNS)

In [3]:
# Preview the first five rows to confirm the file parsed into the expected
# `comment` (text) and `response` (label) columns.
reviews.head()

Unnamed: 0,comment,response
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
# Summary statistics for the numeric columns. Only `response` is numeric here,
# so its mean is the fraction of rows labelled 1.
reviews.describe()

Unnamed: 0,response
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [5]:
# Inspect dtypes and non-null counts to check for missing values before any
# text processing is applied to `comment`.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comment   748 non-null    object
 1   response  748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [6]:
# Describe the comments separately for each `response` label. Comparing `count`
# with `unique` shows whether a label contains duplicated comments, and `top` /
# `freq` show the most repeated comment for that label.
reviews.groupby('response').describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,Definitely worth checking out.,2


In [7]:
# Add the character count of each review as a new `length` column.
# `.str.len()` is pandas' vectorised string accessor; unlike `.apply(len)` it
# yields NaN for a missing comment instead of raising TypeError.
reviews['length'] = reviews['comment'].str.len()

In [8]:
# Preview the first five rows again to confirm the `length` column was added.
reviews.head()

Unnamed: 0,comment,response,length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108


In [9]:
# Import the bag-of-words vectoriser and create a default instance.
# NOTE(review): `vectorizer` is not referenced by any later cell visible here; the
# vectoriser that is actually fitted further down is `bag_of_words`. Confirm
# nothing else uses it before removing.

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [10]:
# define a function to get rid of stopwords present in the messages

_ENGLISH_STOP_WORDS = None  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Imported here so this cell does not rely on an import made in a later cell.
        from nltk.corpus import stopwords
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def message_text_process(mess, stop_words=None):
    """Tokenise a message: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    mess : str
        Raw text of a single message.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (loaded once and cached).

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original casing.

    Notes
    -----
    Stop-word matching is case-insensitive (each token is lower-cased for the
    lookup only). Punctuation is removed *before* the lookup, so a stop-list
    entry that itself contains punctuation can never match.
    NOTE(review): NLTK's list presumably includes contractions such as "don't",
    which would be affected by this - confirm whether that is intended.
    """
    import string

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Materialise once so each token lookup is a hash probe, not a list scan.
        stop_words = frozenset(stop_words)

    # Delete every ASCII punctuation character in a single pass.
    no_punctuation = mess.translate(str.maketrans('', '', string.punctuation))

    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [11]:
# Learn the vocabulary from every comment. Tokenisation is delegated to
# message_text_process, so each comment is reduced to punctuation-free,
# stop-word-free tokens before they are counted.

import string
from nltk.corpus import stopwords

count_vectorizer = CountVectorizer(analyzer=message_text_process)
# fit() returns the estimator itself, so `bag_of_words` is the fitted vectoriser.
bag_of_words = count_vectorizer.fit(reviews['comment'])

In [12]:
# Encode every comment against the fitted vocabulary, giving one row of token
# counts per comment.
comment_bag_of_words = bag_of_words.transform(reviews['comment'])

In [13]:
# Learn TF-IDF weights from the token-count matrix. The fitted transformer is
# reused later for the full corpus and for individual comments.
from sklearn.feature_extraction.text import TfidfTransformer

idf_weighter = TfidfTransformer()
# fit() returns the estimator itself, so `tfidf_transformer` is the fitted object.
tfidf_transformer = idf_weighter.fit(comment_bag_of_words)

In [14]:
# Re-weight the raw token counts with the fitted TF-IDF transformer; this matrix
# is the feature set the classifier is trained on.
comment_tfidf = tfidf_transformer.transform(comment_bag_of_words)

In [15]:
# Show the TF-IDF matrix dimensions as (number of comments, vocabulary size).

comment_tfidf.shape

(748, 3259)

In [16]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features, with the
# `response` column as the target.
# NOTE(review): the model is fitted on every row, so the spot checks in the
# following cells score comments it has already seen during training.

from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
# fit() returns the estimator itself, so `sentiment_detector` is the trained model.
sentiment_detector = naive_bayes.fit(comment_tfidf, reviews['response'])

In [17]:
# Spot-check the pipeline on the comment at index label 0 (the first row):
# vectorise it, apply the fitted TF-IDF weights, then compare the model's
# prediction with the stored label.

comment = reviews.loc[0, 'comment']
bag_of_words_for_comment = bag_of_words.transform([comment])
tfidf = tfidf_transformer.transform(bag_of_words_for_comment)

predicted_response = sentiment_detector.predict(tfidf)[0]
expected_response = reviews.loc[0, 'response']
print('Predicted sentiment response', predicted_response)
print('expected sentiment response', expected_response)

Predicted sentiment response 0
expected sentiment response 0


In [18]:
# Spot-check the pipeline on the comment at index label 2 (the third row, not
# the second): vectorise it, apply the fitted TF-IDF weights, then compare the
# model's prediction with the stored label.

comment = reviews.loc[2, 'comment']
bag_of_words_for_comment = bag_of_words.transform([comment])
tfidf = tfidf_transformer.transform(bag_of_words_for_comment)

predicted_response = sentiment_detector.predict(tfidf)[0]
expected_response = reviews.loc[2, 'response']
print('Predicted sentiment response', predicted_response)
print('expected sentiment response', expected_response)

Predicted sentiment response 0
expected sentiment response 0
