# IMDB Movie Reviews
* Import Libraries

In [52]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split as tt
import warnings
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
warnings.filterwarnings("ignore")

* Reading the dataset 

In [53]:
# Load the review dataset; later cells use its "review" and "sentiment" columns.
dataset_csv = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(dataset_csv)

* Details about the dataset

In [54]:
# Overview of the raw data: dimensions, column summary, descriptive
# statistics and the class balance of the target column.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()
print(df.describe())
print(df["sentiment"].value_counts())

* Label Mapping

In [55]:
# Integer class id -> text label; used later to decode model predictions.
ma = {1:"positive",0:"negative"}
# Encode the text labels as integers. The result is assigned back (no
# inplace=True) and cast explicitly, so the column is int64 without relying on
# pandas silently downcasting the object column after replace().
# The cell stays re-runnable: already-encoded values pass through unchanged.
# A value that is neither label nor integer makes astype() raise here.
df["sentiment"] = df["sentiment"].replace({"positive": 1, "negative": 0}).astype("int64")

* Removing Duplicates 
* Resizing the dataset (reduced to the first 2,500 positive and 2,500 negative reviews to shorten the time needed to train and test the models)

In [56]:
# Remove exact duplicate rows, then separate the two sentiment classes.
df = df.drop_duplicates()
positive = df[df["sentiment"] == 1]
negative = df[df["sentiment"] == 0]

# Sizes after de-duplication: whole frame, positive class, negative class.
for frame in (df, positive, negative):
    print(frame.shape)

# Keep only the first 2500 rows of each class (positives first, then negatives).
balanced_parts = [positive.head(2500), negative.head(2500)]
df = pd.concat(balanced_parts, axis=0)
print(df.shape)
print(df["sentiment"].value_counts())

# Stemming the content using PorterStemmer

In [57]:
# Stemming
words = stopwords.words("english")
# Set copy of the stopword list: the membership test below runs once per token
# of every review, and a set lookup avoids scanning the whole list each time.
stop_words = set(words)

port_stem = PorterStemmer()

def stemming(contents):
    """Normalise one review into a string of stemmed, non-stopword tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the text, split on whitespace, drop English stopwords, apply
    the Porter stemmer to the remaining tokens and join them with single
    spaces.

    Parameters
    ----------
    contents : str
        Raw review text.

    Returns
    -------
    str
        Space-separated stemmed tokens (empty string if nothing remains).
    """
    # Replace anything that is not a letter with ' '
    letters_only = re.sub(r'[^a-zA-Z]', ' ', contents)
    # Lower-case, then split into a list of words
    tokens = letters_only.lower().split()
    # Drop stopwords and stem the remaining words
    stemmed_tokens = [port_stem.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stemmed_tokens)


df["review"] = df["review"].apply(stemming)

* Data Initialization

In [58]:
# Features: the preprocessed review texts as an array.
# Target: the "sentiment" column, kept as a pandas Series.
m, n = df["review"].values, df["sentiment"]
print(m)
print(n)

# Vectorizing the data with TF-IDF


In [59]:
# Turn the preprocessed reviews into a TF-IDF feature matrix.
# The text is read from df["review"] rather than from `m`, so the cell can be
# re-run: previously `m` was overwritten with the matrix and a second run fed
# that matrix back into fit_transform().
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split, so the held-out reviews influence the vocabulary and IDF weights.
# Consider fitting on the training texts only.
vectorizer = TfidfVectorizer()
m = vectorizer.fit_transform(df["review"].values)
print(m)

* Splitting data for training and testing

In [60]:
# Hold out 22% of the samples for testing; stratify on the target so both
# splits keep the same class proportions, with a fixed seed for repeatability.
m_train, m_test, n_train, n_test = tt(
    m,
    n,
    train_size=0.78,
    stratify=n,
    random_state=0,
)

* Different ML Models

In [61]:
# Candidate classifiers, all seeded identically. The order matters: later
# cells address the models and their parameter grids by position.
models = [
    estimator_cls(random_state=0)
    for estimator_cls in (RandomForestClassifier, SVC, LogisticRegression, MLPClassifier)
]

# Model Training and Testing

In [62]:
# Per-model predictions, indexed in the same order as `models`.
# Both lists are filled by model_fitting().
models_train_pred = [[] for _ in models]
models_test_pred = [[] for _ in models]

def model_fitting():
    """Fit every estimator in `models` and report train/test accuracy.

    Each model is trained on (m_train, n_train). Its predictions on the
    training and test features are stored in `models_train_pred[i]` and
    `models_test_pred[i]`, and both accuracy scores are printed.
    """
    for i, clf in enumerate(models):
        clf.fit(m_train, n_train)

        # Results
        clf_train_pred = clf.predict(m_train)
        clf_test_pred = clf.predict(m_test)
        # Bug fix: the training slot used to receive the *test* predictions.
        models_train_pred[i] = clf_train_pred
        models_test_pred[i] = clf_test_pred

        # Accuracy Score
        train_acc = accuracy_score(n_train, clf_train_pred)
        print("Training Accuracy for",clf,":",train_acc)
        test_acc = accuracy_score(n_test, clf_test_pred)
        print("Testing Accuracy for",clf,":",test_acc)
        print(" ")

model_fitting()

# Confusion Matrices with heatmap

In [63]:

def conf_mat():
    """Print test-set metrics and draw a confusion-matrix heatmap per model.

    For every estimator in `models`, the confusion matrix of `n_test` against
    `models_test_pred[i]` is printed together with recall, precision and F1
    derived from its four cells, and then plotted as an annotated heatmap.
    Unpacking into (tn, fp, fn, tp) requires a binary 2x2 matrix.
    """
    # Apply the seaborn theme once, before any figure is created.
    sns.set()

    for i, model in enumerate(models):
        # Compute the matrix once and reuse it for metrics and plotting.
        cm = confusion_matrix(n_test, models_test_pred[i])
        tn, fp, fn, tp = cm.ravel()
        print(cm)

        # Guard the denominators: a model that never predicts (or never sees)
        # the positive class would otherwise yield nan.
        recall = tp/(tp+fn) if (tp+fn) else 0.0
        precision = tp/(tp+fp) if (tp+fp) else 0.0
        if precision + recall:
            f1 = 2*(precision*recall)/(precision+recall)
        else:
            f1 = 0.0
        print("Recall for",model,":",recall)
        print("Precision for",model,":",precision)
        print("F1 Score for",model,":",f1)

        # Heatmap
        plt.figure(figsize=(8, 8))
        # The cells hold integer counts, so annotate them as integers ('d')
        # rather than with two decimals.
        sns.heatmap(cm, cbar=True, square=True, annot=True, fmt='d', annot_kws={'size': 10}, cmap="Blues")
        # confusion_matrix puts true labels on rows, predictions on columns.
        plt.xlabel("Predicted label")
        plt.ylabel("True label")
        plt.title(str(model)+" Heatmap")
        plt.show()

conf_mat()



# Hyperparameter Tuning

In [64]:
# Hyperparameter search spaces, one per estimator, in the same order as the
# `models` list (random forest, SVC, logistic regression, MLP).
param_grid1 = {"n_estimators":[20,40,60,80],"max_depth":list(range(2,10))}
# "degree" only influences the "poly" kernel.
param_grid2 = {"kernel":["linear","rbf","poly","sigmoid"],"degree":[2,3,4],"C":[0.5,1,10]}
param_grid3 = {"C":[0.5,1,10]}
# Bug fix: 'softmax' is not an accepted MLPClassifier activation (the valid
# values are 'identity', 'logistic', 'tanh' and 'relu'), so every candidate
# using it failed to fit. 'logistic' (sigmoid) takes its place.
param_grid4 = {"hidden_layer_sizes": [(10,30,10),(20,)],"alpha": [0.0001, 0.05],'max_iter':[500,1000],
                "learning_rate": ['constant','adaptive'],"activation": ['relu','logistic','tanh']}

# The same grids addressable by model index (dict) and as an ordered list.
p_grid = {0:param_grid1,1:param_grid2,2:param_grid3,3:param_grid4}
gp_grid = [param_grid1,param_grid2,param_grid3,param_grid4]

* RandomizedSearchCV

In [65]:
def h_tuning():
    """Run a randomized hyperparameter search for every model.

    For each estimator in `models`, candidates are sampled from the matching
    entry of `p_grid`, scored with 5-fold cross-validation on the training
    split, and the best parameter set and its score are printed.
    GridSearchCV is the exhaustive alternative over the same grids.
    """
    for k, model in enumerate(models):
        # random_state pins the sampled candidates, so the reported best
        # parameters are the same on every run.
        rscv = RandomizedSearchCV(model,param_distributions=p_grid[k],cv=5,n_jobs=-1,random_state=0)
        rscv.fit(m_train,n_train)
        print(model,"best parameters are:",rscv.best_params_)
        print(model,"best score are:",rscv.best_score_)

h_tuning()

* Prediction using the SVC model (the ML model with the highest cross-validation score in the search above)

In [66]:
# Re-read the raw CSV: `df` was label-encoded, subsampled and stemmed above,
# so the original text and string label are taken from a fresh frame.
df1 = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
# Take an independent copy of one row, so the assignment further down changes
# only this row object and not a slice of df1.
inp = df1.iloc[6666,].copy()

# Final ML Model
# Hyperparameters are hard-coded, presumably from an earlier search run;
# "degree" only applies to the "poly" kernel.
clf = SVC(kernel='rbf',degree=2,C=10,random_state=0)
clf.fit(m_train, n_train)

# Apply the same preprocessing and the already-fitted vectorizer as in training.
inp["review"] = stemming(inp["review"])
ip = vectorizer.transform([inp["review"]])

# Check Prediction
actual_val = inp["sentiment"]
pred = clf.predict(ip)
# Decode the integer prediction to its text label once and reuse it.
predicted_val = ma[pred[0]]
print("Actual Value:", actual_val)
print("Prediction:", predicted_val)
if actual_val == predicted_val:
    print("Accurate Prediction")
    print("Sentiment of the input review is",predicted_val)
else:
    print("Wrong Prediction")
