In [1]:
print("OK")

OK


#### Import Necessary Libraries

In [2]:
import os
import pandas as pd
import numpy as np
import nltk
import gensim 
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize


### Import the Data

In [3]:
df = pd.read_csv('data/IMDB Dataset.csv')

### Exploring the Data

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
#checking the shape of the data
df.shape

(50000, 2)

In [6]:
#checking duplicates
df[df.duplicated()]

Unnamed: 0,review,sentiment
3537,Quite what the producers of this appalling ada...,negative
3769,My favourite police series of all time turns t...,positive
4391,"Beautiful film, pure Cassavetes style. Gena Ro...",positive
6352,If you liked the Grinch movie... go watch that...,negative
6479,I want very much to believe that the above quo...,negative
...,...,...
49912,This is an incredible piece of drama and power...,positive
49950,This was a very brief episode that appeared in...,negative
49984,Hello it is I Derrick Cannon and I welcome you...,negative
49986,This movie is a disgrace to the Major League F...,negative


- __Dropping duplicates as these are same it may create bias__

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.shape

(49582, 2)

In [9]:
df[df.duplicated()]

Unnamed: 0,review,sentiment


### create a function to handle preprocessing


In [10]:
#create a function to handle preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Download NLTK data if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize into sentences
    sentences = sent_tokenize(text)
    
    # Initialize stopwords
    stop_words = set(stopwords.words('english'))
    
    # Tokenize each sentence into words, remove stopwords, and join back to sentences
    processed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [word for word in words if word not in stop_words]
        processed_sentences.append(' '.join(words))
    
    # Join the processed sentences back into a single string
    processed_text = ' '.join(processed_sentences)
    
    return processed_text

    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Applied the function on reviews 

In [11]:
df['review']=df['review'].apply(preprocess_text)

## Feature Engineering

In [12]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [13]:
X

Unnamed: 0,review
0,one reviewers mentioned watching oz episode yo...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
49995,thought movie right good job wasnt creative or...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary schools n...
49998,im going disagree previous comment side maltin...


In [14]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [15]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [16]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [17]:
from sklearn.model_selection import  train_test_split
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape, X_test.shape

((34707, 1), (14875, 1))

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Limiting the number of features
cv = CountVectorizer(max_features=10000)

# Transforming the data
X_train_bow = cv.fit_transform(X_train['review'])
X_test_bow = cv.transform(X_test['review'])

# Converting to dense array if necessary
X_train_bow_dense = X_train_bow.toarray()
X_test_bow_dense = X_test_bow.toarray()

# Checking the shapes
print(X_train_bow_dense.shape)
print(X_test_bow_dense.shape)

(34707, 10000)
(14875, 10000)


In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB


In [20]:
def evaluate_clf(true, predicted):
    acc = accuracy_score(true, predicted) # Calculate Accuracy
    f1 = f1_score(true, predicted) # Calculate F1-score
    precision = precision_score(true, predicted) # Calculate Precision
    recall = recall_score(true, predicted)  # Calculate Recall
    roc_auc = roc_auc_score(true, predicted) #Calculate Roc
    return acc, f1 , precision, recall, roc_auc

In [21]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBClassifier": XGBClassifier(), 
    "GaussianNB":GaussianNB(),
    "MultinomialNB":MultinomialNB()

}
models

{'Random Forest': RandomForestClassifier(),
 'Decision Tree': DecisionTreeClassifier(),
 'Logistic Regression': LogisticRegression(max_iter=1000),
 'XGBClassifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
 'GaussianNB': GaussianNB(),
 'MultinomialNB': MultinomialNB()}

In [22]:
import pandas as pd

def evaluate_models(X, y, models):
    '''
    This function takes in X and y and models dictionary as input
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    # Separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    models_list = []
    accuracy_list = []
    auc_list = []
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)  # Train model

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        model_train_accuracy, model_train_f1, model_train_precision, model_train_recall, model_train_rocauc_score = evaluate_clf(y_train, y_train_pred)

        # Test set performance
        model_test_accuracy, model_test_f1, model_test_precision, model_test_recall, model_test_rocauc_score = evaluate_clf(y_test, y_test_pred)

        print(model_name)
        models_list.append(model_name)

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print('- F1 score: {:.4f}'.format(model_train_f1)) 
        print('- Precision: {:.4f}'.format(model_train_precision))
        print('- Recall: {:.4f}'.format(model_train_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

        print('----------------------------------')

        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        accuracy_list.append(model_test_accuracy)
        print('- F1 score: {:.4f}'.format(model_test_f1))
        print('- Precision: {:.4f}'.format(model_test_precision))
        print('- Recall: {:.4f}'.format(model_test_recall))
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        auc_list.append(model_test_rocauc_score)
        print('='*35)
        print('\n')
        
    report = pd.DataFrame({
        'Model Name': models_list,
        'Accuracy': accuracy_list,
        'AUC': auc_list
    }).sort_values(by='Accuracy', ascending=False)
        
    return report


### Model Training


In [23]:
# Evaluate the models
base_model_report = evaluate_models(X=X_train_bow_dense, y=y_train, models=models)

# Display the report
print(base_model_report)


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8459
- F1 score: 0.8427
- Precision: 0.8487
- Recall: 0.8368
- Roc Auc Score: 0.8458


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7178
- F1 score: 0.7150
- Precision: 0.7128
- Recall: 0.7172
- Roc Auc Score: 0.7178


Logistic Regression
Model performance for Training set
- Accuracy: 0.9875
- F1 score: 0.9876
- Precision: 0.9871
- Recall: 0.9880
- Roc Auc Score: 0.9875
----------------------------------
Model performance for Test set
- Accuracy: 0.8666
- F1 score: 0.8650
- Precision: 0.8640
- Recall: 0.8660
- Roc Auc Score: 0.8666


XGBClassifier
Model performance for Training set
