## Import libraries

In [13]:
import pandas as pd
import numpy as np


import os #library in managing directories
import re, string #library in removing special characters

#for text pre-processing
import nltk #natural language tool kit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

from prettytable import PrettyTable

#for model-building
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from sklearn.feature_selection import SelectKBest, chi2

#for feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jacklynjoaquin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacklynjoaquin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load the Dataset

In [3]:
imdb_data = pd.read_csv('/Users/jacklynjoaquin/Documents/IMDB-project/imdb_data_extended.csv')
print(imdb_data.head())
print(imdb_data.tail())

      id                                               text label  rating
0   4715  For a movie that gets no respect there sure ar...   pos       9
1  12390  Bizarre horror movie filled with famous faces ...   pos       8
2   8329  A solid, if unremarkable film. Matthau, as Ein...   pos       7
3   9063  It's a strange feeling to sit alone in a theat...   pos       8
4   3092  You probably all already know this by now, but...   pos      10
          id                                               text label  rating
49995  11513  With actors like Depardieu and Richard it is r...   neg       1
49996   5409  If you like to get a couple of fleeting glimps...   neg       1
49997  11187  When something can be anything you want it to ...   neg       1
49998   9359  I had heard good things about "States of Grace...   neg       3
49999  11556  Well, this movie actually did have one redeemi...   neg       1


In [None]:
print(imdb_data.dtypes)
print("Data shape: ", imdb_data.shape)

id         int64
text      object
label     object
rating     int64
dtype: object
Data shape:  (50000, 4)


In [None]:
imdb_data['text'][4000]



## Text Pre-processing

In [4]:
# text tokenizing
imdb_data['processed'] = imdb_data['text'].apply(lambda x: nltk.word_tokenize(x))
print(imdb_data)

          id                                               text label  rating   
0       4715  For a movie that gets no respect there sure ar...   pos       9  \
1      12390  Bizarre horror movie filled with famous faces ...   pos       8   
2       8329  A solid, if unremarkable film. Matthau, as Ein...   pos       7   
3       9063  It's a strange feeling to sit alone in a theat...   pos       8   
4       3092  You probably all already know this by now, but...   pos      10   
...      ...                                                ...   ...     ...   
49995  11513  With actors like Depardieu and Richard it is r...   neg       1   
49996   5409  If you like to get a couple of fleeting glimps...   neg       1   
49997  11187  When something can be anything you want it to ...   neg       1   
49998   9359  I had heard good things about "States of Grace...   neg       3   
49999  11556  Well, this movie actually did have one redeemi...   neg       1   

                           

In [5]:
# stop word, sepecial character removal, contraction expansion

stop_words = stopwords.words('english')
new_stopwords = ["would","shall","could","might", 'br']
stop_words.extend(new_stopwords)
stop_words.remove("not")
stop_words = set(stop_words)

def remove_special_char(text):
    clean_text = re.sub(r'[^a-zA-Z\s]','', text)
    return clean_text

def remove_stopwords(text):
    clean_data = []
    for i in text.split():
        if i.strip().lower() not in stop_words and i.strip().lower().isalpha():
            clean_data.append(i.strip().lower())
    return " ".join(clean_data)

def expand_contractions(text):
    contractions_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "would've": "would have",
    "should've": "should have",
    "hasn't": "has not",
    "haven't": "have not",
    "wasn't": "was not",
    "weren't": "were not",
    "we're": "we are",
    "didn't": "did not",
    "don't": "do not" }

    # Regular expression pattern to find contractions
    contractions_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')

    def replace(match):
        return contractions_dict[match.group(0)]

    # Use regular expression to find and replace contractions
    expanded_text = contractions_pattern.sub(replace, text)

    return expanded_text


def data_cleaning(text):
    text = expand_contractions(text)
    text = remove_special_char(text)
    text = remove_stopwords(text)
    return text


imdb_data['processed'] = imdb_data['processed'].apply(lambda x: ' '.join(x))
imdb_data['processed'] = imdb_data['processed'].apply(lambda x: data_cleaning(x))
print(imdb_data.head(), imdb_data.tail())


      id                                               text label  rating   
0   4715  For a movie that gets no respect there sure ar...   pos       9  \
1  12390  Bizarre horror movie filled with famous faces ...   pos       8   
2   8329  A solid, if unremarkable film. Matthau, as Ein...   pos       7   
3   9063  It's a strange feeling to sit alone in a theat...   pos       8   
4   3092  You probably all already know this by now, but...   pos      10   

                                           processed  
0  movie gets respect sure lot memorable quotes l...  
1  bizarre horror movie filled famous faces stole...  
2  solid unremarkable film matthau einstein wonde...  
3  strange feeling sit alone theater occupied par...  
4  probably already know additional episodes neve...             id                                               text label  rating   
49995  11513  With actors like Depardieu and Richard it is r...   neg       1  \
49996   5409  If you like to get a couple of 

## Feature Extraction

In [7]:
#train-test set split
X_train, X_test, y_train, y_test = train_test_split(imdb_data['processed'], imdb_data['label'], test_size=0.3, random_state=0, shuffle=True)
print(X_train)

17967    not long jeff jarrett left wwf good spoke nigh...
32391    loved movie since saw theater wil wheaton favo...
9341     compromised fairly charming film liked art dir...
7929     ralph bakshi films appear like twoedged swords...
46544    roger corman non epic sundry bunch characters ...
                               ...                        
21243    another fine effort america underrated filmmak...
45891    word honor erased vocabularies nations aggrava...
42613    found movie complete waste minutes jones weird...
43567    must rate worst films ever seen nt funny wife ...
2732     not film entertaining excellent comedic acting...
Name: processed, Length: 35000, dtype: object


## TF-IDF feature importance with Logistic Regression

In [14]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=5)
X_train_vectors = tfidf_vectorizer.fit_transform(X_train)
X_test_vectors = tfidf_vectorizer.transform(X_test)

feature_names = tfidf_vectorizer.get_feature_names_out()

In [15]:
print(tfidf_vectorizer.get_feature_names_out()[::100])
len(tfidf_vectorizer.get_feature_names_out())

['aa' 'abundant' 'acknowledges' 'addy' 'adventure' 'afterwards' 'airport'
 'allamerican' 'altitude' 'amoral' 'angel' 'antagonism' 'apes' 'aqua'
 'armed' 'ashamed' 'assures' 'attitudes' 'avalanche' 'babette' 'baking'
 'barbeau' 'basking' 'beastmaster' 'behavioral' 'benito' 'beverages'
 'binoculars' 'blalock' 'blondie' 'bobs' 'boo' 'bothersome' 'braindead'
 'breslin' 'brokeback' 'budget' 'buoyed' 'buzzing' 'cam' 'capes' 'carlton'
 'castaways' 'cd' 'chain' 'charitable' 'cheesiness' 'chipettes' 'chung'
 'clarence' 'cliffhanger' 'cluster' 'coins' 'columnist' 'committee'
 'complicates' 'concoct' 'conjunction' 'constitution' 'contrasted' 'cope'
 'cortez' 'courts' 'crater' 'crippled' 'cruelties' 'cures' 'cyphers'
 'danson' 'days' 'decent' 'defies' 'delving' 'depends' 'desiring'
 'develops' 'dictionary' 'dipping' 'discord' 'disobey' 'distract'
 'documented' 'donuts' 'downside' 'dresser' 'ds' 'duplicitous'
 'earthquakes' 'edward' 'electrocutes' 'emanuelle' 'emptying'
 'englishspeaking' 'entices'

32122

In [12]:
# Calculate sparsity
total_elements = X_train_vectors.shape[0] * X_train_vectors.shape[1]
non_zero_elements = X_train_vectors.nnz
sparsity = 1 - (non_zero_elements / total_elements)

print(f"Sparsity of the Bi-gram matrix: {sparsity:.4f}")

Sparsity of the Bi-gram matrix: 0.9971


## SelectKBest for Dimensionality Reduction

In [25]:
feature_selector = SelectKBest(chi2, k=20000)

X_train_vectors_tfidf = feature_selector.fit_transform(X_train_vectors, y_train)
X_test_vectors_tfidf = feature_selector.transform(X_test_vectors)

selected_feature_indices = feature_selector.get_support(indices=True)
selected_features = [feature_names[i] for i in selected_feature_indices]
print(selected_features)
X_train_vectors_tfidf.shape



(35000, 20000)

In [26]:
# Logistic regression on TF-IDF
lgr = LogisticRegression()
lgr.fit(X_train_vectors_tfidf,y_train)

y_pred = lgr.predict(X_test_vectors_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8921333333333333
Classification Report:
               precision    recall  f1-score   support

         neg       0.90      0.88      0.89      7485
         pos       0.88      0.91      0.89      7515

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



## TF-IDF feature importance on Decision Tree model

In [27]:
d_tree = DecisionTreeClassifier(max_depth=10)
d_tree.fit(X_train_vectors_tfidf,y_train)

y_pred = d_tree.predict(X_test_vectors_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.7270666666666666
Classification Report:
               precision    recall  f1-score   support

         neg       0.82      0.58      0.68      7485
         pos       0.68      0.87      0.76      7515

    accuracy                           0.73     15000
   macro avg       0.75      0.73      0.72     15000
weighted avg       0.75      0.73      0.72     15000



## TF-IDF feature importance on Random Forest model

In [28]:
rcf = RandomForestClassifier()
rcf.fit(X_train_vectors_tfidf,y_train)

y_pred = rcf.predict(X_test_vectors_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8544
Classification Report:
               precision    recall  f1-score   support

         neg       0.85      0.86      0.86      7485
         pos       0.86      0.85      0.85      7515

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



## TF-IDF feature importance on Naive Bayes

In [29]:
mnb = MultinomialNB()
mnb.fit(X_train_vectors_tfidf,y_train)

y_pred = mnb.predict(X_test_vectors_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8642666666666666
Classification Report:
               precision    recall  f1-score   support

         neg       0.86      0.87      0.87      7485
         pos       0.87      0.86      0.86      7515

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000

