In [1]:
import pandas as pd
import numpy as np
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('IMDB_dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,I thought this was a wonderful way to spend ti...,positive
1,"Probably my all-time favorite movie, a story o...",positive
2,I sure would like to see a resurrection of a u...,positive
3,"This show was an amazing, fresh & innovative i...",negative
4,Encouraged by the positive comments about this...,negative


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     25000 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [4]:
df.sentiment.value_counts()

negative    12500
positive    12500
Name: sentiment, dtype: int64

# Preprocessing

In [5]:
df.review[0]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [6]:
# lowering the case of the reviews
df.review = df.review.str.lower()

In [7]:
# removing any non-word, non-space characters (punctuations)
df.review = df.review.str.replace("[^\w\s]", "")

In [8]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Meenakshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# tokenization
df['review_tokens'] = df.review.apply(word_tokenize)

In [10]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwds = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Meenakshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# removing stop words
df['review_tokens'] = df['review_tokens'].apply(lambda x: [word for word in x if word not in stopwds])

In [15]:
# lemmatising the tokens
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
df['review_tokens_lemma'] = df['review_tokens'].apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Meenakshi\AppData\Roaming\nltk_data...


In [30]:
df.head()

Unnamed: 0,review,sentiment,review_tokens,review_tokens_lemma
0,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
1,probably my alltime favorite movie a story of ...,positive,"[probably, alltime, favorite, movie, story, se...","[probably, alltime, favorite, movie, story, se..."
2,i sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrection, dated, ..."
3,this show was an amazing fresh innovative ide...,negative,"[show, amazing, fresh, innovative, idea, 70s, ...","[show, amazing, fresh, innovative, idea, 70, f..."
4,encouraged by the positive comments about this...,negative,"[encouraged, positive, comments, film, looking...","[encouraged, positive, comment, film, looking,..."


In [37]:
# Performing tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()

tfidf_vect.fit(df['review_tokens_lemma'].apply(lambda x: ' '.join(x)))
X = tfidf_vect.transform(df['review_tokens_lemma'].apply(lambda x: ' '.join(x)))

In [41]:
# Creating the target variable by mapping to 1 & 0
y = df['sentiment'].map({'positive': 1, 'negative': 0})

In [43]:
# splitting data into train and test (50-50 ratio as mentioned in the instructions)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=17)

In [46]:
# funtion to print the results from a cross validation
def print_cv_results(results):
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print(f'{round(mean, 3)} (+/-{round(std * 2, 3)}) with {params}')
    print(f'\nBEST PARAMS: {results.best_params_}\n')

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

In [54]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
parameters = {
              'n_estimators': [5, 50],
              'max_depth': [2, 4, 8, 16, 32, None]
              }
cv1 = GridSearchCV(rf, parameters, cv=5)
cv1.fit(X_train, y_train.values.ravel())

print_cv_results(cv1)

0.568 (+/-0.035) with {'max_depth': 2, 'n_estimators': 5}
0.692 (+/-0.032) with {'max_depth': 2, 'n_estimators': 50}
0.606 (+/-0.054) with {'max_depth': 4, 'n_estimators': 5}
0.754 (+/-0.017) with {'max_depth': 4, 'n_estimators': 50}
0.63 (+/-0.023) with {'max_depth': 8, 'n_estimators': 5}
0.786 (+/-0.03) with {'max_depth': 8, 'n_estimators': 50}
0.68 (+/-0.023) with {'max_depth': 16, 'n_estimators': 5}
0.808 (+/-0.005) with {'max_depth': 16, 'n_estimators': 50}
0.697 (+/-0.03) with {'max_depth': 32, 'n_estimators': 5}
0.821 (+/-0.019) with {'max_depth': 32, 'n_estimators': 50}
0.702 (+/-0.011) with {'max_depth': None, 'n_estimators': 5}
0.826 (+/-0.01) with {'max_depth': None, 'n_estimators': 50}

BEST PARAMS: {'max_depth': None, 'n_estimators': 50}



In [61]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
parameters = {
#     'n_estimators': [5, 50],
    'max_depth': [3, 5],
    'learning_rate': [1, 10]
}

cv2 = GridSearchCV(gb, parameters, cv=2, verbose=3)
cv2.fit(X_train, y_train.values.ravel())

print_cv_results(cv2)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END ......learning_rate=1, max_depth=3;, score=0.791 total time= 1.9min
[CV 2/2] END ......learning_rate=1, max_depth=3;, score=0.800 total time= 1.9min
[CV 1/2] END ......learning_rate=1, max_depth=5;, score=0.794 total time= 2.4min
[CV 2/2] END ......learning_rate=1, max_depth=5;, score=0.797 total time= 2.4min
[CV 1/2] END .....learning_rate=10, max_depth=3;, score=0.328 total time= 2.0min
[CV 2/2] END .....learning_rate=10, max_depth=3;, score=0.495 total time= 1.9min
[CV 1/2] END .....learning_rate=10, max_depth=5;, score=0.498 total time= 2.6min
[CV 2/2] END .....learning_rate=10, max_depth=5;, score=0.502 total time= 2.6min
0.795 (+/-0.01) with {'learning_rate': 1, 'max_depth': 3}
0.795 (+/-0.004) with {'learning_rate': 1, 'max_depth': 5}
0.411 (+/-0.168) with {'learning_rate': 10, 'max_depth': 3}
0.5 (+/-0.005) with {'learning_rate': 10, 'max_depth': 5}

BEST PARAMS: {'learning_rate': 1, 'max_depth': 3}



In [66]:
from time import time
def evaluate(alg, model, X_data, y_data):
    start_time = time()
    y_pred = model.predict(X_data)
    latency = round((time() - start_time)*1000, 2)
    accuracy = round(accuracy_score(y_data, y_pred), 4)
    precision = round(precision_score(y_data, y_pred, average='macro'), 4)
    recall = round(recall_score(y_data, y_pred, average='macro'), 4)
    print(f"Algorithm: {alg}: Accuracy: {accuracy}; Precision: {precision}; Recall: {recall}; Latency: {latency}ms")
    print(confusion_matrix(y_data, y_pred))
    print(classification_report(y_data, y_pred))

evaluate("    Random Forest", cv1.best_estimator_, X_train, y_train)
evaluate("Gradient Boosting", cv2.best_estimator_, X_train, y_train)

Algorithm:     Random Forest: Accuracy: 1.0; Precision: 1.0; Recall: 1.0; Latency: 530.56ms
[[6310    0]
 [   0 6190]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6310
           1       1.00      1.00      1.00      6190

    accuracy                           1.00     12500
   macro avg       1.00      1.00      1.00     12500
weighted avg       1.00      1.00      1.00     12500

Algorithm: Gradient Boosting: Accuracy: 0.9292; Precision: 0.9292; Recall: 0.9293; Latency: 31.25ms
[[5817  493]
 [ 392 5798]]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      6310
           1       0.92      0.94      0.93      6190

    accuracy                           0.93     12500
   macro avg       0.93      0.93      0.93     12500
weighted avg       0.93      0.93      0.93     12500



Eventhough the random forest model has better metrics, it is highly overfit to the training data. Hence the best model from the above search is gradient boosting with {'learning_rate': 1, 'max_depth': 3} as parameters. It's performance on the test set is as follows.

In [69]:
evaluate("Best model: Gradient Boosting {'learning_rate': 1, 'max_depth': 3} \n ", cv2.best_estimator_, X_test, y_test)

Algorithm: Best model: Gradient Boosting {'learning_rate': 1, 'max_depth': 3} 
 : Accuracy: 0.8188; Precision: 0.8191; Recall: 0.8186; Latency: 22.89ms
[[4966 1224]
 [1041 5269]]
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      6190
           1       0.81      0.84      0.82      6310

    accuracy                           0.82     12500
   macro avg       0.82      0.82      0.82     12500
weighted avg       0.82      0.82      0.82     12500

