In [1]:
import os
import tarfile
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Define the path to the tar.gz file
file_path = '/content/aclImdb_v1.tar.gz'
extract_path = '/content/aclImdb'

# Check if the file exists and then extract it
if os.path.exists(file_path):
    with tarfile.open(file_path, 'r:gz') as tar:
        tar.extractall(path=extract_path)
    print(f"Dataset extracted to {extract_path}")
else:
    print("File not found.")

Dataset extracted to /content/aclImdb


In [27]:
def load_reviews_from_dir(directory, sentiment):
    reviews = []
    for filepath in glob.glob(os.path.join(directory, '*.txt')):
        with open(filepath, 'r', encoding='utf-8') as f:
            reviews.append((f.read(), sentiment))
    return reviews

train_pos = load_reviews_from_dir('/content/aclImdb/aclImdb/train/pos', 1)
train_neg = load_reviews_from_dir('/content/aclImdb/aclImdb/train/neg', 0)
train_data = train_pos + train_neg
train_dataset = pd.DataFrame(train_data, columns=['review', 'sentiment'])

test_pos = load_reviews_from_dir('/content/aclImdb/aclImdb/test/pos', 1)
test_neg = load_reviews_from_dir('/content/aclImdb/aclImdb/test/neg', 0)
test_data = test_pos + test_neg
test_dataset = pd.DataFrame(test_data, columns=['review', 'sentiment'])

In [4]:
train_dataset.to_csv('train_data.csv', index=False)
test_dataset.to_csv('test_data.csv', index=False)

In [5]:
train_dataset.head()

Unnamed: 0,review,sentiment
0,It was originally meant to be a film that Gene...,1
1,I happen to run into this movie one night so I...,1
2,One of the best true-crime movies ever made an...,1
3,I am shocked. Shocked and dismayed that the 42...,1
4,This is an excellent film about the characters...,1


In [6]:
test_dataset.head()

Unnamed: 0,review,sentiment
0,It was originally meant to be a film that Gene...,1
1,I happen to run into this movie one night so I...,1
2,One of the best true-crime movies ever made an...,1
3,I am shocked. Shocked and dismayed that the 42...,1
4,This is an excellent film about the characters...,1


In [7]:
X = train_dataset['review']
y = train_dataset['sentiment']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=True)

vectorizer = CountVectorizer(ngram_range=(1, 3))
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)

In [8]:
# Model 1 - Naive Bayes
classifier_NB = MultinomialNB()
classifier_NB.fit(X_train, y_train)
y_pred = classifier_NB.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1202
           1       0.90      0.87      0.89      1298

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500

[[1082  120]
 [ 170 1128]]


In [9]:
# Model 2 - Linear SVM
classifier_SVM = LinearSVC()
classifier_SVM.fit(X_train, y_train)
y_pred = classifier_SVM.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      1202
           1       0.91      0.90      0.90      1298

    accuracy                           0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500

[[1083  119]
 [ 131 1167]]




In [10]:
# Model 3 - Decision Tree
classifier_DT = DecisionTreeClassifier(criterion = 'gini', random_state = 42,max_depth=5)
classifier_DT.fit(X_train, y_train)

y_pred = classifier_DT.predict(X_val)
# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Accuracy: 0.69
              precision    recall  f1-score   support

           0       0.83      0.46      0.59      1202
           1       0.64      0.91      0.76      1298

    accuracy                           0.69      2500
   macro avg       0.74      0.68      0.67      2500
weighted avg       0.73      0.69      0.68      2500

[[ 551  651]
 [ 116 1182]]


In [11]:
# Model 4 - Logistic Regression
classifier_LR = LogisticRegression(random_state=42).fit(X_train,y_train)
y_pred = classifier_LR.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1202
           1       0.91      0.90      0.91      1298

    accuracy                           0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500

[[1083  119]
 [ 124 1174]]


In [12]:
# Model 5 - KNN
classifier_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

y_pred = classifier_knn.predict(X_val)
# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Accuracy: 0.58
              precision    recall  f1-score   support

           0       0.57      0.57      0.57      1202
           1       0.60      0.59      0.60      1298

    accuracy                           0.58      2500
   macro avg       0.58      0.58      0.58      2500
weighted avg       0.59      0.58      0.58      2500

[[691 511]
 [527 771]]


In [34]:
X = test_dataset['review']
y = test_dataset['sentiment']

X_test = vectorizer.transform(X)
y_test_pred = classifier_SVM.predict(X_test)
accuracy = accuracy_score(y, y_test_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y, y_test_pred))
print(confusion_matrix(y, y_test_pred))

Accuracy: 0.89
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     12500
           1       0.89      0.90      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000

[[11149  1351]
 [ 1293 11207]]


In [35]:
X = test_dataset['review']
y = test_dataset['sentiment']

X_test = vectorizer.transform(X)
y_test_pred = classifier_LR.predict(X_test)
accuracy = accuracy_score(y, y_test_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y, y_test_pred))
print(confusion_matrix(y, y_test_pred))

Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.90      0.89      0.90     12500
           1       0.89      0.90      0.90     12500

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000

[[11167  1333]
 [ 1269 11231]]
