In [2]:
import pandas as pd
import numpy as np
import text_normalizer as tn

In [4]:
dataset = pd.read_csv(r'IMDB Dataset.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)

# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

In [6]:
norm_train_reviews[2]

'I think wonderful way spend time hot summer weekend sit air condition theater watch light hearted comedy plot simplistic dialogue witty character likable even well bread suspect serial killer may disappoint realize not match point 2 risk addiction I think proof woody allen still fully control style many grow love I would laugh one woodys comedy year dare I say decade I never impressed scarlet johanson manage tone sexy image jump right average spirited young woman may not crown jewel career witty devil wear prada interesting superman great comedy go see friend'

In [7]:
# transform test reviews into features
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [8]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (35000, 2109532)  Test features shape: (15000, 2109532)
TFIDF model:> Train features shape: (35000, 2109532)  Test features shape: (15000, 2109532)


In [9]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=100, C=1)
svm = SGDClassifier(loss='hinge', max_iter=100)

In [10]:
# Logistic Regression model on BOW features
# Please Note : the module meu is not been provided. 
import model_evaluation_utils as meu
lr_bow_predictions = meu.train_predict_model(classifier=lr, 
                                             train_features=cv_train_features, train_labels=train_sentiments,
                                             test_features=cv_test_features, test_labels=test_sentiments)
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_bow_predictions,classes=['positive', 'negative'])



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Performance metrics:
------------------------------
Accuracy: 0.9061
Precision: 0.9062
Recall: 0.9061
F1 Score: 0.9061

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.90      0.91      0.91      7510
    negative       0.91      0.90      0.91      7490

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6839      671
        negative        737     6753


In [11]:
# Logistic Regression model on TF-IDF features
# Please Note : the module meu is not been provided.
lr_tfidf_predictions = meu.train_predict_model(classifier=lr, 
                                               train_features=tv_train_features, train_labels=train_sentiments,
                                               test_features=tv_test_features, test_labels=test_sentiments)
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=lr_tfidf_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8939
Precision: 0.894
Recall: 0.8939
F1 Score: 0.8939

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.89      0.90      0.89      7510
    negative       0.90      0.89      0.89      7490

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6770      740
        negative        851     6639


In [12]:
# SVM model on BOW features
# Please Note : the module meu is not been provided.
svm_bow_predictions = meu.train_predict_model(classifier=svm, 
                                             train_features=cv_train_features, train_labels=train_sentiments,
                                             test_features=cv_test_features, test_labels=test_sentiments)
meu.display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=svm_bow_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8982
Precision: 0.8982
Recall: 0.8982
F1 Score: 0.8982

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.90      0.89      0.90      7510
    negative       0.90      0.90      0.90      7490

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6721      789
        negative        738     6752


In [13]:
# SVM model on TF-IDF features
# Please Note : the module meu is not been provided.
svm_tfidf_predictions = meu.train_predict_model(classifier=svm, 
                                                train_features=tv_train_features, train_labels=train_sentiments,
                                                test_features=tv_test_features, test_labels=test_sentiments)
meu.display_model_performance_metrics(true_labels=test_sentiments, 
                                      predicted_labels=svm_tfidf_predictions,
                                      classes=['positive', 'negative'])

# THE BELOW O/P SHOULD GIVE YOU A FAIR IDEA ON WHAT :
# methods like 
# train_predict_model() are doing and printing as o/p.
# display_model_performance_metrics() are doing and printing as o/p.

# As an Intern you are not suppose to produce the exact o/p 
# You may only code the minimum required metrics which helps you to 
# compare the different ML models.

Model Performance metrics:
------------------------------
Accuracy: 0.8956
Precision: 0.8961
Recall: 0.8956
F1 Score: 0.8956

Model Classification report:
------------------------------
              precision    recall  f1-score   support

    positive       0.88      0.91      0.90      7510
    negative       0.91      0.88      0.89      7490

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000


Prediction Confusion Matrix:
------------------------------
                 Predicted:         
                   positive negative
Actual: positive       6864      646
        negative        920     6570
