In [1]:
import pandas as pd

In [2]:
#get movie dataset
imdb_dataset = pd.read_csv("imdb_labelled.txt",sep = '\t', names = ['comment','label'])

In [3]:
# view 10 observations
# 1 indicates +ve, 0 indicates -ve
imdb_dataset.head(10)

Unnamed: 0,comment,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [6]:
#view more info about the data using describe
imdb_dataset.describe()

Unnamed: 0,label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [9]:
#view more info on data
imdb_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  748 non-null    object
 1   label    748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [10]:
#view data using group by and describe
imdb_dataset.groupby('label').describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,10/10,2


In [11]:
#verify length of the messages and add it as a new column(feature)
imdb_dataset['length'] = imdb_dataset['comment'].apply(len)

In [12]:
#view 1st 5 messages
imdb_dataset.head()

Unnamed: 0,comment,label,length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108


In [15]:
#view first comment whose length > 50
imdb_dataset[imdb_dataset['length']>50]['comment'].iloc[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [17]:
# text processing with vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [20]:
import string
from nltk.corpus import stopwords
#function to remove stopwords
def remove_stopwords(msg):
    #check character to see if there are panctuations
    no_punctuation = [char for char in msg if char not in string.punctuation]
    #now form the sentence
    no_punctuation  = ''.join(no_punctuation)
    #now eliminate any stopwords
    return [word for word in no_punctuation.split(' ',1) if word.lower() not in stopwords.words('english')]

In [21]:
#verify if function is working
imdb_dataset['comment'].head(5).apply(remove_stopwords)

0    [very very very slowmoving aimless movie about...
1    [sure who was more lost  the flat characters o...
2    [Attempting, artiness with black  white and cl...
3             [little music or anything to speak of  ]
4    [best scene in the movie was when Gerardo is t...
Name: comment, dtype: object

In [22]:
#bag of words by applying the function and fit the data into it
bag_of_words = CountVectorizer(analyzer = remove_stopwords).fit(imdb_dataset['comment'])

In [39]:
#apply transform method for bag of words
comment_bagofwords = bag_of_words.transform(imdb_dataset['comment'])

In [24]:
#use tfidf to fit the transformed bag of words into it
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(comment_bagofwords)

In [28]:
#print shape
tfidf_comment = tfidf_transformer.transform(comment_bagofwords)
tfidf_comment.shape

(748, 905)

In [29]:
#use naive bayes model to detect sentiment and fit transformer data into it
from sklearn.naive_bayes import MultinomialNB
sentiment_detection_model = MultinomialNB().fit(tfidf_comment,imdb_dataset['label'])

In [34]:
# check model for predicted and expected value for a comment
comment = imdb_dataset['comment'][0]
bag_of_words_for_comment = bag_of_words.transform([comment])
tfidf = tfidf_transformer.transform(bag_of_words_for_comment)

print('predicted sentiment: ',sentiment_detection_model.predict(tfidf)[0])
print('expected sentiment:', imdb_dataset.label[0])

predicted sentiment:  0
expected sentiment: 0
