## Libraries

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

### IMDB Dataset

In [2]:
data = pd.read_csv("dataset/imdb_dataset.csv")

In [3]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
data["sentiment"].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
print("Review :\n"+data['review'][0])
print("Sentiment :\n"+data['sentiment'][0])

Review :
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due t

In [6]:
print("Review :\n"+data['review'][3])
print("Sentiment :\n"+data['sentiment'][3])

Review :
Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them.
Sentiment :
negative


## Data Extraction and Pre-processing

### Removal of HTML Tags and Noise Text

In [7]:
def removeHTMLtags(txt):
    return BeautifulSoup(txt, "html.parser").get_text()

data['review']=data['review'].apply(removeHTMLtags)

In [8]:
# #Text between Square Brackets present??
# for i in range(len(data)):
#     match = re.match('\[[^]]*\]', data["review"][i])
#     if match:
#         print(i,data["review"][i],"\n\n")

In [9]:
def removeSquareBrackets(text):
    return re.sub('\[[^]]*\]', '', text)
data['review']=data['review'].apply(removeSquareBrackets)

In [10]:
def removeSpecialChar(text):
    return re.sub('[^a-zA-z0-9\s]|[_]','',text)

data['review']=data['review'].apply(removeSpecialChar)

In [11]:
print("Updated Review :\n"+data['review'][0])

Updated Review :
One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked They are right as this is exactly what happened with meThe first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the wordIt is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to manyAryans Muslims gangstas Latinos Christians Italians Irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayI would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare Forg

### Text Normalization using Stemming

In [12]:
def stemming(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

data['review']=data['review'].apply(stemming)

### Tokenization and removal of stopwords

In [13]:
stp_list=nltk.corpus.stopwords.words('english')
stp_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
tokenizer=ToktokTokenizer()

def removeStopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    updatedTokens = []
    for token in tokens: 
        if token.lower() not in stp_list:
            updatedTokens.append(token)
    return ' '.join(updatedTokens)   

data['review']=data['review'].apply(removeStopwords)

#### Encoding positive as 1 and negative as 0

In [15]:
le = preprocessing.LabelEncoder()
data['sentiment'] = le.fit_transform(data['sentiment'])
data['sentiment']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int32

## Splitting data into Training and Testing data

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data.review,data.sentiment,train_size = 0.9)

## Feature Extraction using TF-IDF

In [17]:
vect=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

training_data=vect.fit_transform(X_train)
testing_data=vect.transform(X_test)

In [18]:
print(training_data.shape)
print(testing_data.shape)

(45000, 6880122)
(5000, 6880122)


## Training and Building the Model for classification

In [19]:
mnb = MultinomialNB()
mnb.fit(training_data,y_train)

MultinomialNB()

In [20]:
predicted_data = mnb.predict(testing_data)
print(predicted_data)

[0 0 1 ... 1 1 1]


## Testing and Evaluating the model 

In [21]:
mnb.score(testing_data,y_test)

0.7496

In [22]:
accuracy_score(y_test,predicted_data)

0.7496

In [23]:
print(classification_report(y_test,predicted_data,target_names=['Positive','Negative']))

              precision    recall  f1-score   support

    Positive       0.75      0.76      0.75      2517
    Negative       0.75      0.74      0.75      2483

    accuracy                           0.75      5000
   macro avg       0.75      0.75      0.75      5000
weighted avg       0.75      0.75      0.75      5000

