## Import libraries

In [1]:
import pandas as pd
import numpy as np

import os #library in managing directories
import re, string #library in removing special characters

#for text pre-processing
import nltk #natural language tool kit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

#for model-building
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from sklearn.feature_selection import SelectKBest, chi2

#for feature extraction
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jacklynjoaquin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacklynjoaquin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load the Dataset

In [2]:
imdb_data = pd.read_csv('/Users/jacklynjoaquin/Documents/IMDB-project/imdb_data_extended.csv')
print(imdb_data.head())
print(imdb_data.tail())

      id                                               text label  rating
0   4715  For a movie that gets no respect there sure ar...   pos       9
1  12390  Bizarre horror movie filled with famous faces ...   pos       8
2   8329  A solid, if unremarkable film. Matthau, as Ein...   pos       7
3   9063  It's a strange feeling to sit alone in a theat...   pos       8
4   3092  You probably all already know this by now, but...   pos      10
          id                                               text label  rating
49995  11513  With actors like Depardieu and Richard it is r...   neg       1
49996   5409  If you like to get a couple of fleeting glimps...   neg       1
49997  11187  When something can be anything you want it to ...   neg       1
49998   9359  I had heard good things about "States of Grace...   neg       3
49999  11556  Well, this movie actually did have one redeemi...   neg       1


In [5]:
print(imdb_data.dtypes)
print("Data shape: ", imdb_data.shape)

id         int64
text      object
label     object
rating     int64
dtype: object
Data shape:  (50000, 4)


In [6]:
imdb_data['text'][4000]



## Text Pre-processing

In [3]:
#text tokenizing
imdb_data['processed'] = imdb_data['text'].apply(lambda x: nltk.word_tokenize(x))
print(imdb_data)

          id                                               text label  rating   
0       4715  For a movie that gets no respect there sure ar...   pos       9  \
1      12390  Bizarre horror movie filled with famous faces ...   pos       8   
2       8329  A solid, if unremarkable film. Matthau, as Ein...   pos       7   
3       9063  It's a strange feeling to sit alone in a theat...   pos       8   
4       3092  You probably all already know this by now, but...   pos      10   
...      ...                                                ...   ...     ...   
49995  11513  With actors like Depardieu and Richard it is r...   neg       1   
49996   5409  If you like to get a couple of fleeting glimps...   neg       1   
49997  11187  When something can be anything you want it to ...   neg       1   
49998   9359  I had heard good things about "States of Grace...   neg       3   
49999  11556  Well, this movie actually did have one redeemi...   neg       1   

                           

In [4]:
#stop word, white space, special character removal, contraction expansion, lowercase transformation

stop_words = stopwords.words('english')
new_stopwords = ["would","shall","could","might", 'br']
stop_words.extend(new_stopwords)
stop_words.remove("not")
stop_words = set(stop_words)

def remove_special_char(text):
    clean_text = re.sub(r'[^a-zA-Z\s]','', text)
    return clean_text

def remove_stopwords(text):
    clean_data = []
    for i in text.split():
        if i.strip().lower() not in stop_words and i.strip().lower().isalpha():
            clean_data.append(i.strip().lower())
    return " ".join(clean_data)

def expand_contractions(text):
    contractions_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "would've": "would have",
    "should've": "should have",
    "hasn't": "has not",
    "haven't": "have not",
    "wasn't": "was not",
    "weren't": "were not",
    "we're": "we are",
    "didn't": "did not",
    "don't": "do not",
    "it's": "it is" }

    #regular expression pattern to find contractions
    contractions_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')

    def replace(match):
        return contractions_dict[match.group(0)]

    #Use regular expression to find and replace contractions
    expanded_text = contractions_pattern.sub(replace, text)

    return expanded_text


def data_cleaning(text):
    text = expand_contractions(text)
    text = remove_special_char(text)
    text = remove_stopwords(text)
    return text


imdb_data['processed'] = imdb_data['processed'].apply(lambda x: ' '.join(x))
imdb_data['processed'] = imdb_data['processed'].apply(lambda x: data_cleaning(x))
print(imdb_data.head(), imdb_data.tail())


      id                                               text label  rating   
0   4715  For a movie that gets no respect there sure ar...   pos       9  \
1  12390  Bizarre horror movie filled with famous faces ...   pos       8   
2   8329  A solid, if unremarkable film. Matthau, as Ein...   pos       7   
3   9063  It's a strange feeling to sit alone in a theat...   pos       8   
4   3092  You probably all already know this by now, but...   pos      10   

                                           processed  
0  movie gets respect sure lot memorable quotes l...  
1  bizarre horror movie filled famous faces stole...  
2  solid unremarkable film matthau einstein wonde...  
3  strange feeling sit alone theater occupied par...  
4  probably already know additional episodes neve...             id                                               text label  rating   
49995  11513  With actors like Depardieu and Richard it is r...   neg       1  \
49996   5409  If you like to get a couple of 

## Feature Extraction

In [5]:
#train-test set split
X_train, X_test, y_train, y_test = train_test_split(imdb_data['processed'], imdb_data['label'], test_size=0.3, random_state=0, shuffle=True)
print(X_train)

17967    not long jeff jarrett left wwf good spoke nigh...
32391    loved movie since saw theater wil wheaton favo...
9341     compromised fairly charming film liked art dir...
7929     ralph bakshi films appear like twoedged swords...
46544    roger corman non epic sundry bunch characters ...
                               ...                        
21243    another fine effort america underrated filmmak...
45891    word honor erased vocabularies nations aggrava...
42613    found movie complete waste minutes jones weird...
43567    must rate worst films ever seen nt funny wife ...
2732     not film entertaining excellent comedic acting...
Name: processed, Length: 35000, dtype: object


## Bi-gram

In [6]:
#Count Vectorizer with Bi-gram
Vect = CountVectorizer(min_df= 5, ngram_range = (2,2))
X_train_vector = Vect.fit_transform(X_train)
X_test_vector = Vect.transform(X_test)

feature_names = Vect.get_feature_names_out()

In [7]:
print(Vect.get_feature_names_out()[::2000])
len(Vect.get_feature_names_out())
X_train_vector.shape

['aamir khan' 'alien resurrection' 'anyone remember' 'bad poor'
 'billy halop' 'captain america' 'child prodigy' 'conveyed film'
 'definitely made' 'dubbed sound' 'enough worth' 'exactly wrong'
 'feel worse' 'film within' 'four major' 'getting worse' 'got minutes'
 'hell got' 'imdb says' 'keep show' 'least entertaining' 'like without'
 'love must' 'many american' 'modern hollywood' 'movie tells'
 'needed something' 'not series' 'old tradition' 'park not'
 'plane crashes' 'probably way' 'really explored' 'role see'
 'screaming like' 'sequences involving' 'sing song' 'star although'
 'style filming' 'theater version' 'time rent' 'tv late' 'violence rape'
 'well animated' 'work truly']


(35000, 89528)

## SelectKBest for Dimensionality Reduction

In [8]:
feature_selector = SelectKBest(chi2, k=50000)

X_train_vect = feature_selector.fit_transform(X_train_vector, y_train)
X_test_vect = feature_selector.transform(X_test_vector)

selected_feature_indices = feature_selector.get_support(indices=True)
selected_features = [feature_names[i] for i in selected_feature_indices]
print(selected_features[::2000])
X_train_vect.shape

['aamir khan', 'angles keeping', 'believe better', 'cast great', 'cool got', 'downright awful', 'even true', 'film dvd', 'full bad', 'gratuitous nudity', 'hurts film', 'lack character', 'location work', 'material director', 'movie sounds', 'not intelligent', 'one winner', 'plot points', 'really seeing', 'scifi movie', 'silly nt', 'story new', 'thinking get', 'type horror', 'well chosen']


(35000, 50000)

## Bi-gram feature importance with Logistic Regression

In [9]:
# Logistic regression for Bi-gram
lgr = LogisticRegression()
lgr.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(lgr, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = lgr.predict(X_test_vect)

accuracy = lgr.score(X_test_vect, y_test)
report = classification_report(y_test, y_pred)
print("Classification report:\n", report)
print("Accuracy:", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1: Accuracy = 0.8859
Fold 2: Accuracy = 0.8939
Fold 3: Accuracy = 0.8927
Fold 4: Accuracy = 0.8820
Fold 5: Accuracy = 0.8817
Variance of accuracy scores: 0.0000
Classification report:
               precision    recall  f1-score   support

         neg       0.87      0.85      0.86      7485
         pos       0.86      0.87      0.86      7515

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000

Accuracy: 0.8619333333333333


## Bi-gram feature importance on Naive Bayes

In [10]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vect, y_train)

#cross-validation on training data
scores = cross_val_score(nb_classifier, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = nb_classifier.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.9099
Fold 2: Accuracy = 0.9197
Fold 3: Accuracy = 0.9181
Fold 4: Accuracy = 0.9104
Fold 5: Accuracy = 0.9163
Variance of accuracy scores: 0.0000
Accuracy: 0.8697333333333334
Classification Report:
               precision    recall  f1-score   support

         neg       0.87      0.87      0.87      7485
         pos       0.87      0.87      0.87      7515

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



## Bi-gram feature importance with Random Forest

In [11]:
rfc = RandomForestClassifier()
rfc.fit(X_train_vect, y_train)

#cross-validation on training data
scores = cross_val_score(rfc, X_train_vect, y_train, cv=5)

# Print accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = rfc.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.7887
Fold 2: Accuracy = 0.8054
Fold 3: Accuracy = 0.8053
Fold 4: Accuracy = 0.8066
Fold 5: Accuracy = 0.7956
Variance of accuracy scores: 0.0000
Accuracy: 0.8008666666666666
Classification Report:
               precision    recall  f1-score   support

         neg       0.78      0.83      0.81      7485
         pos       0.82      0.77      0.79      7515

    accuracy                           0.80     15000
   macro avg       0.80      0.80      0.80     15000
weighted avg       0.80      0.80      0.80     15000



## Bi-gram feature importance on Decision Tree

In [12]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_vect, y_train)

#cross-validation on training data
scores = cross_val_score(decision_tree, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = decision_tree.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.7229
Fold 2: Accuracy = 0.7264
Fold 3: Accuracy = 0.7290
Fold 4: Accuracy = 0.7390
Fold 5: Accuracy = 0.7237
Variance of accuracy scores: 0.0000
Accuracy: 0.7410666666666667
Classification Report:
               precision    recall  f1-score   support

         neg       0.73      0.76      0.74      7485
         pos       0.75      0.72      0.74      7515

    accuracy                           0.74     15000
   macro avg       0.74      0.74      0.74     15000
weighted avg       0.74      0.74      0.74     15000



## Uni-gram

In [13]:
Vect = CountVectorizer(min_df= 5, ngram_range = (1,1))
X_train_vect_uni = Vect.fit_transform(X_train)
X_test_vect_uni = Vect.transform(X_test)

feature_names = Vect.get_feature_names_out()

In [14]:
print(Vect.get_feature_names_out()[::500])
len(Vect.get_feature_names_out())

['aa' 'afterwards' 'angel' 'ashamed' 'baking' 'benito' 'bobs' 'brokeback'
 'capes' 'charitable' 'cliffhanger' 'complicates' 'cope' 'cruelties'
 'decent' 'develops' 'distract' 'ds' 'emanuelle' 'ethel' 'fahrenheit'
 'finn' 'founders' 'gavin' 'gotcha' 'haim' 'helicopters' 'horny'
 'impatience' 'insecure' 'jacko' 'kazan' 'large' 'linnea' 'macintosh'
 'materialistic' 'mikes' 'moron' 'nearing' 'nuts' 'ossessione' 'parrish'
 'phases' 'poole' 'proceeds' 'questionable' 'recount' 'repulsive' 'roared'
 'samourai' 'secretly' 'sheriff' 'skinned' 'sommer' 'stakes' 'stubbornly'
 'swedes' 'tepper' 'tonic' 'try' 'unforgettably' 'veered' 'wardens'
 'willowy' 'yugoslavia']


32123

## SelectKBest for Dimensionality Reduction

In [15]:
feature_selector = SelectKBest(chi2, k=5000)

X_train_vect = feature_selector.fit_transform(X_train_vect_uni, y_train)
X_test_vect = feature_selector.transform(X_test_vect_uni)

selected_feature_indices = feature_selector.get_support(indices=True)
selected_features = [feature_names[i] for i in selected_feature_indices]
print(selected_features[::300])
X_train_vect.shape

['aag', 'baffling', 'camera', 'considering', 'different', 'excuses', 'genre', 'hope', 'kelly', 'manny', 'needless', 'pinjar', 'remake', 'setup', 'sterling', 'tossed', 'wang']


(35000, 5000)

## Uni-gram feature importance on Logistic Regression

In [16]:
# Logistic regression for uni-gram
lgr = LogisticRegression()
lgr.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(lgr, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = lgr.predict(X_test_vect)

accuracy = lgr.score(X_test_vect, y_test)
report = classification_report(y_test, y_pred)
print("Classification report:\n", report)
print("Accuracy:", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 1: Accuracy = 0.8830
Fold 2: Accuracy = 0.8869
Fold 3: Accuracy = 0.8907
Fold 4: Accuracy = 0.8857
Fold 5: Accuracy = 0.8854
Variance of accuracy scores: 0.0000
Classification report:
               precision    recall  f1-score   support

         neg       0.89      0.88      0.88      7485
         pos       0.88      0.89      0.88      7515

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000

Accuracy: 0.884


## Uni-gram feature importance on Naive Bayes

In [17]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(nb_classifier, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = nb_classifier.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.8673
Fold 2: Accuracy = 0.8667
Fold 3: Accuracy = 0.8701
Fold 4: Accuracy = 0.8630
Fold 5: Accuracy = 0.8633
Variance of accuracy scores: 0.0000
Accuracy: 0.8504666666666667
Classification Report:
               precision    recall  f1-score   support

         neg       0.84      0.86      0.85      7485
         pos       0.86      0.84      0.85      7515

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



## Uni-gram feature importance with Random Forest

In [18]:
rfc = RandomForestClassifier()
rfc.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(rfc, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = rfc.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.8537
Fold 2: Accuracy = 0.8476
Fold 3: Accuracy = 0.8499
Fold 4: Accuracy = 0.8441
Fold 5: Accuracy = 0.8543
Variance of accuracy scores: 0.0000
Accuracy: 0.8478
Classification Report:
               precision    recall  f1-score   support

         neg       0.84      0.85      0.85      7485
         pos       0.85      0.84      0.85      7515

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



## Uni-gram feature importance with Decision Tree

In [19]:
d_tree = DecisionTreeClassifier(max_depth=10)
d_tree.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(d_tree, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = d_tree.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.7300
Fold 2: Accuracy = 0.7276
Fold 3: Accuracy = 0.7414
Fold 4: Accuracy = 0.7260
Fold 5: Accuracy = 0.7269
Variance of accuracy scores: 0.0000
Accuracy: 0.7287333333333333
Classification Report:
               precision    recall  f1-score   support

         neg       0.81      0.59      0.69      7485
         pos       0.68      0.86      0.76      7515

    accuracy                           0.73     15000
   macro avg       0.75      0.73      0.72     15000
weighted avg       0.75      0.73      0.72     15000



## Uni and Bi-gram

In [20]:
#Count Vectorizer with Uni and Bi-gram
Vect = CountVectorizer(min_df= 5, ngram_range = (1,2))
X_train_vect_mixed = Vect.fit_transform(X_train)
X_test_vect_mixed= Vect.transform(X_test)

feature_names = Vect.get_feature_names_out()

In [21]:
print(Vect.get_feature_names_out()[::3000])
len(Vect.get_feature_names_out())

['aa' 'alone woods' 'around someone' 'become instant' 'borne' 'cemented'
 'coldest' 'credited director' 'dialogue bit' 'early one' 'etc always'
 'fake looking' 'film rented' 'four kids' 'give enough' 'grotesque'
 'hollywood actor' 'instantly forgettable' 'killed man' 'licks'
 'looking around' 'man real' 'mirth' 'movies let' 'night hunter' 'objects'
 'oversee' 'pioneers' 'probes' 'really man' 'role comes' 'screen yet'
 'shah rukh' 'solar system' 'still us' 'synthesizer' 'thought watch'
 'truly magnificent' 'vanish' 'weak even' 'wormhole']


121651

# SelectKBest for Dimensionality Reduction

In [22]:
feature_selector = SelectKBest(chi2, k=70000)

X_train_vect = feature_selector.fit_transform(X_train_vect_mixed, y_train)
X_test_vect = feature_selector.transform(X_test_vect_mixed)

selected_feature_indices = feature_selector.get_support(indices=True)
selected_features = [feature_names[i] for i in selected_feature_indices]
print(selected_features[::6000])
X_train_vect.shape

['aaa', 'better role', 'cover girl', 'ever since', 'gimmickry', 'james mcavoy', 'making character', 'not hollywood', 'predictable movie', 'see money', 'supposed budget', 'violent storm']


(35000, 70000)

## Uni and Bi-gram feature importance with Logistic Regression

In [23]:
lgr = LogisticRegression()
lgr.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(lgr, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = lgr.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 1: Accuracy = 0.9033
Fold 2: Accuracy = 0.9033
Fold 3: Accuracy = 0.9031
Fold 4: Accuracy = 0.8994
Fold 5: Accuracy = 0.8980
Variance of accuracy scores: 0.0000
Accuracy: 0.8980666666666667
Classification Report:
               precision    recall  f1-score   support

         neg       0.90      0.89      0.90      7485
         pos       0.89      0.90      0.90      7515

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



## Uni and Bi-gram feature importance with Naive Bayes

In [24]:
M_NB = MultinomialNB()
M_NB.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(M_NB, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = M_NB.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.9004
Fold 2: Accuracy = 0.9047
Fold 3: Accuracy = 0.9067
Fold 4: Accuracy = 0.8981
Fold 5: Accuracy = 0.9009
Variance of accuracy scores: 0.0000
Accuracy: 0.8748666666666667
Classification Report:
               precision    recall  f1-score   support

         neg       0.87      0.88      0.87      7485
         pos       0.88      0.87      0.87      7515

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



## Uni and Bi-gram feature importance with Random Forest

In [25]:
R_F = RandomForestClassifier()
R_F.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(R_F, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = R_F.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.8641
Fold 2: Accuracy = 0.8574
Fold 3: Accuracy = 0.8653
Fold 4: Accuracy = 0.8604
Fold 5: Accuracy = 0.8590
Variance of accuracy scores: 0.0000
Accuracy: 0.8618
Classification Report:
               precision    recall  f1-score   support

         neg       0.86      0.86      0.86      7485
         pos       0.86      0.86      0.86      7515

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



## Uni and Bi-gram feature importance with Decision Tree

In [26]:
d_tree = DecisionTreeClassifier(max_depth=10)
d_tree.fit(X_train_vect,y_train)

# cross-validation on training data
scores = cross_val_score(d_tree, X_train_vect, y_train, cv=5)

# Print the accuracy for each fold
for i, score in enumerate(scores):
    print(f"Fold {i+1}: Accuracy = {score:.4f}")

# Print the variance of the scores
variance = scores.var()
print(f"Variance of accuracy scores: {variance:.4f}")

y_pred = d_tree.predict(X_test_vect)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Fold 1: Accuracy = 0.7314
Fold 2: Accuracy = 0.7263
Fold 3: Accuracy = 0.7443
Fold 4: Accuracy = 0.7254
Fold 5: Accuracy = 0.7304
Variance of accuracy scores: 0.0000
Accuracy: 0.7322
Classification Report:
               precision    recall  f1-score   support

         neg       0.82      0.60      0.69      7485
         pos       0.68      0.87      0.76      7515

    accuracy                           0.73     15000
   macro avg       0.75      0.73      0.73     15000
weighted avg       0.75      0.73      0.73     15000

