In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re


In [None]:
data = pd.read_csv('/content/drive/MyDrive/IMDB Project/IMDB Dataset.csv')

In [None]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [None]:
# Checking null values
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Dividing the datasets for analysis and modelling

In [None]:
Y = data['sentiment']

In [None]:
X = data.drop(['sentiment'], axis=1)

In [None]:
X.shape, Y.shape

((50000, 1), (50000,))

In [None]:
Y.value_counts(normalize=True)

negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [None]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# Built directly from `data` (instead of an in-place replace on Y) so the cell is
# idempotent and always yields an integer dtype; it does not rely on pandas
# downcasting an object column inside replace().
# Y.value_counts() shows exactly these two labels; anything else would map to 0.
Y = (data['sentiment'] == 'positive').astype('int64')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
pip install nltk



In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
ps = PorterStemmer()
lt = WordNetLemmatizer()

In [None]:
# Build the cleaned corpus: keep letters only, lowercase, drop English stopwords,
# and lemmatize every remaining token.
# The stopword list is materialised ONCE as a set. Previously
# stopwords.words('english') was re-evaluated for every token and searched
# linearly, which dominated the runtime of this cell.
# NOTE(review): the "<br />" markup in the raw reviews survives as the token "br".
english_stopwords = set(stopwords.words('english'))

corpus = []
for raw_review in X['review']:
  tokens = re.sub("[^a-zA-Z]", " ", raw_review).lower().split()
  tokens = [lt.lemmatize(word) for word in tokens if word not in english_stopwords]
  corpus.append(" ".join(tokens))


In [None]:
X['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [None]:
corpus[1]

'wonderful little production br br filming technique unassuming old time bbc fashion give comforting sometimes discomforting sense realism entire piece br br actor extremely well chosen michael sheen got polari voice pat truly see seamless editing guided reference williams diary entry well worth watching terrificly written performed piece masterful production one great master comedy life br br realism really come home little thing fantasy guard rather use traditional dream technique remains solid disappears play knowledge sens particularly scene concerning orton halliwell set particularly flat halliwell mural decorating every surface terribly well done'

# Vectorizing the words
# TF-IDF over uni-, bi- and tri-grams, limited to 5000 features
Vect = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
# Fit on the cleaned corpus and materialise the result as a dense array
tfidf_matrix = Vect.fit_transform(corpus)
X = tfidf_matrix.toarray()

X.shape

# Splitting the dataset for modelling

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. A fixed seed makes the split (and every
# metric computed from it) reproducible; stratifying on Y keeps the class ratio
# identical in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=17, stratify=Y
)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

# Preview the first 50 learned n-grams.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old method otherwise.
list(Vect.get_feature_names_out() if hasattr(Vect, "get_feature_names_out") else Vect.get_feature_names())[:50]

Vect.get_params()

Checking the training dataset

# Label the training matrix columns with the n-grams they represent.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old method otherwise.
feature_names = (
    Vect.get_feature_names_out()
    if hasattr(Vect, "get_feature_names_out")
    else Vect.get_feature_names()
)
final_df = pd.DataFrame(x_train, columns=feature_names)

final_df.head()

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

classifier_1 = SVC() 
classifier_2 = MultinomialNB()
classifier_3 = LogisticRegression()

classifier_2.fit(x_train, y_train)

x_test.shape

pred2 = classifier_2.predict(x_test)

pred2

pred_prob = classifier_2.predict_proba(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score

print("Accuracy score with Multinomial Naive Bayes : {:.4f}".format(accuracy_score(y_test,pred2)))


print("The classification_report:")
print(classification_report(y_test,pred2))

precision_score(y_test, pred2)

confusion_matrix(y_test, pred2)

classifier_3.fit(x_train,y_train)

pred3 = classifier_3.predict(x_test)

accuracy_score(y_test,pred3)

pred_prob = classifier_3.predict_proba(x_test)

pred_prob

Checking the metrics


# Function to calculate Precision and Recall

def calc_precision_recall(y_true, y_pred):
    """Compute precision and recall for binary labels encoded as 0/1.

    Parameters
    ----------
    y_true : pandas.Series
        Ground-truth labels; its index is used to align the predictions.
    y_pred : array-like
        Predicted labels, one per entry of ``y_true`` and in the same order.

    Returns
    -------
    tuple
        ``(precision, recall)``. Each value falls back to ``1`` when its
        denominator is zero (no predicted positives for precision, no
        true positives or false negatives for recall).
    """
    # Give the predictions the same index as y_true so they compare element-wise
    predicted = pd.Series(y_pred, index=y_true.index)

    predicted_pos = predicted == 1
    predicted_neg = predicted == 0
    mismatch = y_true != predicted

    # Count true positives, false positives and false negatives.
    # False negatives are judged against the y_true argument; the previous
    # version read the notebook-level y_test here, so the function silently
    # depended on a global and broke for any other label series.
    TP = int((predicted_pos & (y_true == 1)).sum())
    FP = int((predicted_pos & mismatch).sum())
    FN = int((predicted_neg & mismatch).sum())

    # Explicit zero-denominator guards instead of a bare except, which would
    # also have hidden unrelated errors.
    precision = TP / (TP + FP) if (TP + FP) else 1
    recall = TP / (TP + FN) if (TP + FN) else 1

    return precision, recall

# Checking the thresholds for better prediction

lr_proba = pred_prob[:,1]

# Defining probability thresholds to use between 0 and 1
#prob_thres = np.linspace(0,1,num=100)

# Turn the class-1 probabilities into hard labels at the 0.50 cut-off.
# A single if/else yields exactly one label per sample. The former pair of
# strict comparisons (> 0.50 / < 0.50) appended nothing for a probability of
# exactly 0.50, which would leave this list shorter than y_test.
# A probability of exactly 0.50 is labelled 0.
x_test_pred = [1 if prob > 0.50 else 0 for prob in lr_proba]

accuracy_score(y_test,x_test_pred)

print(classification_report(y_test,x_test_pred))

confusion_matrix(y_test, x_test_pred, labels=[0,1])

# Precision-recall curve for the logistic regression model.
# plot_precision_recall_curve was removed in scikit-learn 1.2; its replacement is
# PrecisionRecallDisplay.from_estimator. Use the new API when it is available and
# fall back to the old helper on older installations.
from sklearn.metrics import PrecisionRecallDisplay

if hasattr(PrecisionRecallDisplay, "from_estimator"):
    PrecisionRecallDisplay.from_estimator(classifier_3, x_test, y_test, name='Logistic Regression')
else:
    from sklearn.metrics import plot_precision_recall_curve
    plot_precision_recall_curve(classifier_3, x_test, y_test, name='Logistic Regression')

# Hyperparameter Tuning

from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV

# 5-fold stratified cross-validation with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

# Search space for logistic regression.
# The grid is split by penalty because not every solver supports every penalty:
# 'l1' is only accepted by 'liblinear' and 'saga'. The former single dict also
# paired 'l1' with 'newton-cg', 'lbfgs' and 'sag'; those candidates fail to fit
# and waste search iterations.
c_values = np.logspace(-2, 3, 500)
param_grid = [
    {"C": c_values, "penalty": ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter': [200]},
    {"C": c_values, "penalty": ['l1'], 'solver': ['liblinear', 'saga'], 'max_iter': [200]},
]

tuned_logit = RandomizedSearchCV(classifier_3, param_grid, cv=skf, random_state=17)
tuned_logit.fit(x_train, y_train)

tuned_logit.best_params_, tuned_logit.best_score_

#tuned_logit.C_

p = tuned_logit.predict(x_test)

accuracy_score(y_test, p)

q = tuned_logit.predict_proba(x_test)

lr_proba_2 = q[:,1]

# Defining probability thresholds to use between 0 and 1
#prob_thres = np.linspace(0,1,num=100)

# Turn the tuned model's class-1 probabilities into hard labels at the 0.50 cut-off.
# A single if/else yields exactly one label per sample. The former pair of
# strict comparisons (> 0.50 / < 0.50) appended nothing for a probability of
# exactly 0.50, which would leave this list shorter than y_test.
# A probability of exactly 0.50 is labelled 0.
x_test_pred_2 = [1 if prob > 0.50 else 0 for prob in lr_proba_2]

accuracy_score(y_test, x_test_pred_2)

print(classification_report(y_test, x_test_pred_2))

## Using Bidirectional RNN with LSTM using word embeddings

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [None]:
# Converting sentences in corpus into a one_hot feature vector
# Each cleaned review becomes a list of integer word indices (vocabulary size 10000).
# NOTE(review): per the Keras docs, one_hot hashes words with Python's built-in
# hash by default, which is not stable across interpreter runs — confirm. If so,
# indices produced in a new session will not match the ones this model was
# trained on; any change of hash function must be applied to the inference
# encoding as well.
one_hot_repr = [one_hot(sent,10000) for sent in corpus]
# Show the encoded form of the first review
print(one_hot_repr[0]) 

[8695, 6292, 2260, 5915, 4876, 3681, 2723, 3791, 305, 6065, 8107, 8107, 2634, 2499, 7248, 4876, 7274, 6230, 7023, 7326, 2847, 3791, 300, 7536, 9373, 153, 3552, 684, 1638, 153, 9600, 199, 404, 285, 3227, 7326, 4493, 7501, 5546, 300, 8107, 8107, 52, 4876, 2981, 5179, 4183, 539, 7010, 2477, 3575, 3426, 162, 4893, 7421, 2912, 8256, 2603, 117, 5615, 6571, 5783, 6870, 5904, 5607, 9149, 581, 7421, 2871, 1133, 6399, 33, 6441, 3031, 7816, 4009, 3029, 8999, 9338, 1903, 9135, 8999, 3470, 9025, 7915, 5799, 3266, 8107, 8107, 1287, 7386, 9072, 3559, 153, 9232, 8710, 7536, 153, 9235, 5344, 9842, 8218, 8554, 5973, 426, 5344, 1423, 5344, 6896, 4876, 9995, 4378, 2634, 3681, 5584, 3760, 7248, 3872, 2160, 7386, 8716, 2462, 8178, 8081, 4876, 1332, 882, 5607, 360, 4187, 7326, 7326, 5482, 2086, 6873, 292, 4137, 5409, 4829, 7024, 9478, 3266, 5557, 5210, 3741, 7418, 5409, 843, 2603, 1715, 9232, 1447, 1669, 1234, 2603, 9859, 5915, 4876, 6398, 1370, 36, 725, 486, 1966, 9478, 2104, 6009, 3735]


In [None]:
with open('one_hot_transform.pkl', 'wb') as f:
  pickle.dump(one_hot_repr,f)

In [None]:
# Left-pad every encoded review with zeros so all sequences share one length.
# NOTE(review): corpus holds joined strings, so this value is the CHARACTER count
# of the longest review, not its number of tokens; the padded width is therefore
# much larger than the sequences require. Left unchanged because downstream cells
# (model input size, inference padding) depend on this width.
sent_length = max(len(text) for text in corpus)
embedded_repr = pad_sequences(one_hot_repr, maxlen=sent_length, padding='pre')
print(embedded_repr)

[[   0    0    0 ... 2104 6009 3735]
 [   0    0    0 ... 3079 5557 7268]
 [   0    0    0 ... 7536 2824 2287]
 ...
 [   0    0    0 ... 5328 4327 9364]
 [   0    0    0 ... 4591 2257  567]
 [   0    0    0 ... 7959 1073 2122]]


In [None]:
embedded_repr[0]

array([   0,    0,    0, ..., 2104, 6009, 3735], dtype=int32)

In [None]:
# Persist the padded sequences; the context manager closes the file handle,
# which the bare open(...) call left open.
with open('embedded_repr.pkl', 'wb') as f:
  pickle.dump(embedded_repr, f)

## Modelling

In [None]:
from tensorflow.keras.layers import Dense, Flatten, Dropout

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [None]:
# Network: word embedding -> bidirectional LSTM -> single sigmoid output unit
model = Sequential([
    # Word embedding layer: 10000 indices mapped to 10-dimensional vectors
    Embedding(10000, 10, input_length=sent_length),
    # Bidirectional LSTM layer with 100 units per direction
    layers.Bidirectional(layers.LSTM(100)),
    # Output layer
    Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'),
])
# Compiling the model (adding optimizer, loss function, and required metrics)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
X_final = np.array(embedded_repr)
Y_final = np.array(Y)

In [None]:
X_final.shape, Y_final.shape

((50000, 9168), (50000,))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the padded sequences for validation. A fixed seed makes the
# split reproducible; stratifying on the labels keeps the class ratio identical
# in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X_final, Y_final, test_size=0.33, random_state=17, stratify=Y_final
)

In [None]:
# Fitting the model
model_history = model.fit(x_train, y_train, validation_data=(x_test,y_test), batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Saving the model
model.save('nlp1_model.h5')

In [None]:
from tensorflow.keras.models import load_model

model_3 = load_model('nlp1_model.h5')

In [None]:
import numpy as np

In [None]:
# Sentences to classify with the reloaded model
sent = ['The movie is good']

# Encode with the same vocabulary size (10000) used for the training corpus.
# The comprehension variable no longer shadows the list it iterates over.
one_hot_repr_2 = [one_hot(text, 10000) for text in sent]

sent_length_2 = len(max(sent, key=len)) #Character length of the longest input string (not used for padding)
# Pad to the sequence width the loaded model expects, read from the model itself
# instead of repeating the training-time value (9168) as a magic number.
embedded_repr_2 = pad_sequences(one_hot_repr_2, padding='pre', maxlen=model_3.input_shape[1])

sent_final = np.array(embedded_repr_2)

In [None]:
pred = model_3.predict(sent_final)

In [None]:
pred = (pred>0.5)

In [None]:
pred

array([[ True]])