Importing Libraries

In [14]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [15]:
# Define paths
positive = 'train/pos'
negative = 'train/neg'


def _read_reviews(directory):
    """Return the UTF-8 text of every regular file in `directory`.

    Files are read in sorted filename order. os.listdir() returns entries in
    arbitrary order, so without sorting the row order of the corpus (and
    therefore any seeded split of it) could differ between machines or runs.

    Parameters
    ----------
    directory : str
        Folder whose files each hold one review.

    Returns
    -------
    list of str
        One string per file, in sorted filename order.
    """
    texts = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # Skip sub-directories (e.g. checkpoint folders); open() would fail on them.
            continue
        with open(filepath, encoding='utf-8') as file:
            texts.append(file.read())
    return texts


X_text = []
y = []

# Positive reviews are labelled 1 and loaded first, negative reviews are labelled 0.
for directory, label in ((positive, 1), (negative, 0)):
    reviews = _read_reviews(directory)
    X_text.extend(reviews)
    y.extend([label] * len(reviews))

print(f"Loaded {len(X_text)} reviews.")


Loaded 25000 reviews.


Vectorize the text into TF-IDF features

In [16]:
# Vectorize raw text using TF-IDF.
# The vocabulary and IDF weights are learned from the training rows only, so
# no statistics from the validation reviews leak into the features.
# NOTE: the split arguments below must stay identical to the train_test_split
# call in the following cell (test_size=0.2, random_state=42, stratify=y);
# with the same arguments and the same number of rows, the indices selected
# here are the rows that end up in X_train.
tfidf_fit_idx = train_test_split(
    np.arange(len(X_text)), test_size=0.2, random_state=42, stratify=y
)[0]

vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit([X_text[i] for i in tfidf_fit_idx])

# Transform every review (train and validation) with the train-fitted vectorizer
# so X_vec keeps one row per entry of X_text, in the same order.
X_vec = vectorizer.transform(X_text)

print(f"TF-IDF matrix shape: {X_vec.shape}")


TF-IDF matrix shape: (25000, 5000)


In [17]:
# Hold out 20% of the rows as a validation set (80/20 split), stratified on y
# and seeded so the partition is repeatable.
X_train, X_val, y_train_split, y_val_split = train_test_split(
    X_vec,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")


Training set shape: (20000, 5000)
Validation set shape: (5000, 5000)


In [18]:
# Feature transformation: PCA
# Project the TF-IDF features onto 100 principal components. The projection is
# fitted on the training rows and then reused, unchanged, for the validation rows.
# Both matrices are converted with .toarray() before being handed to PCA.
pca = PCA(
    random_state=42,
    n_components=100,
)
X_train_pca = pca.fit_transform(X_train.toarray())
X_val_pca = pca.transform(X_val.toarray())

print(f"PCA-reduced train shape: {X_train_pca.shape}")
print(f"PCA-reduced val shape: {X_val_pca.shape}")

PCA-reduced train shape: (20000, 100)
PCA-reduced val shape: (5000, 100)


In [19]:
# Feature transformation: polynomial expansion of the PCA components.
# degree=2 with interaction_only=True adds pairwise products of distinct
# components (no squared terms). The expansion is fitted on the training
# matrix and applied as-is to the validation matrix.
poly = PolynomialFeatures(
    interaction_only=True,
    degree=2,
)
X_train_poly = poly.fit_transform(X_train_pca)
X_val_poly = poly.transform(X_val_pca)

print(f"Polynomial-expanded train shape: {X_train_poly.shape}")
print(f"Polynomial-expanded val shape: {X_val_poly.shape}")


Polynomial-expanded train shape: (20000, 5051)
Polynomial-expanded val shape: (5000, 5051)


In [20]:
# Shared plotting helper, used to display a graph for each model later on.
def plot_performance(df_results, param_name, model_name, transformation_name):
    """Plot validation accuracy against a hyper-parameter on a log-scaled x axis.

    Parameters
    ----------
    df_results : indexable by column name
        Must provide a `param_name` column and a 'Validation Accuracy' column.
    param_name : str
        Column holding the hyper-parameter values; also used as the x label.
    model_name : str
        Model name shown in the title.
    transformation_name : str
        Feature-transformation name shown in parentheses in the title.

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    # Draw on the current axes, the same target the pyplot shortcuts use.
    ax = plt.gca()

    x_values = df_results[param_name]
    accuracies = df_results['Validation Accuracy']
    ax.plot(x_values, accuracies, marker='o')

    ax.set_xscale('log')
    ax.set_xlabel(param_name)
    ax.set_ylabel('Validation Accuracy')
    ax.set_title(f'{model_name} ({transformation_name})')
    ax.grid(True)

    plt.show()


LOGISTIC REGRESSION WITH 3 TRANSFORMATIONS