# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Importing the dataset


In [2]:
dataset = pd.read_csv('IMDB Dataset.csv')
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Cleaning the data

In [3]:
nltk.download('stopwords')
corpus = []
for i in range(0, 50000):
  review = re.sub('[^a-zA-Z]', ' ', dataset['review'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
print(dataset['review'][1])

A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terribly well done.


In [5]:
dataset['review'] = corpus

In [6]:
print(dataset['review'][1])

wonder littl product br br film techniqu unassum old time bbc fashion give comfort sometim discomfort sens realism entir piec br br actor extrem well chosen michael sheen not got polari voic pat truli see seamless edit guid refer william diari entri not well worth watch terrificli written perform piec master product one great master comedi life br br realism realli come home littl thing fantasi guard rather use tradit dream techniqu remain solid disappear play knowledg sens particularli scene concern orton halliwel set particularli flat halliwel mural decor everi surfac terribl well done


In [7]:
tfidf = TfidfVectorizer(max_features=40000)

In [8]:
X = dataset["review"]
Y = dataset['sentiment']

X = tfidf.fit_transform(X)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)

# Traning the RF model

In [9]:
classifier = RandomForestClassifier(n_estimators = 250, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

# Confusion matrix

In [10]:
Y_pred = classifier.predict(X_test)

In [11]:
cm = confusion_matrix(Y_test, Y_pred)
print(cm)

accuracy_score(Y_test, Y_pred)

print('--------------------------------------')


print(classification_report(Y_pred,Y_test))

[[4308  727]
 [ 666 4299]]
--------------------------------------
              precision    recall  f1-score   support

    negative       0.86      0.87      0.86      4974
    positive       0.87      0.86      0.86      5026

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [15]:
from sklearn.metrics import precision_score , recall_score, f1_score
print(f1_score(Y_1,Y_2))
print('f1------')
print(recall_score(Y_1,Y_2))
print('RECAL------')
print(precision_score(Y_1,Y_2))
print('PRECISION------')

0.8605745170653589
f1------
0.8658610271903323
RECAL------
0.8553521687226423
PRECISION------


In [13]:
# when review is positive make it =1 , when it is negative make it =0
Y_1 = np.array(list(map(lambda x: 1 if x=="positive" else 0, Y_test)))
Y_2 = np.array(list(map(lambda x: 1 if x=="positive" else 0, Y_pred)))


In [16]:
accuracy_score(Y_test, Y_pred)

0.8607