Modeling 
This notebook contains our machine learning algorithms: Naive Bayes (partially implemented) and multinomial logistic regression (a first version is implemented below); further models may be added later. It builds on the preprocessing steps (bag-of-words and TF-IDF).

In [None]:
# Standard libraries
import os
import re
import warnings

# Data manipulation
import pandas as pd
import numpy as np

# Text processing
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Machine learning
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import hstack


Naive Bayes algorithm

In [None]:
# --- File Paths Setup ---
# Project base directory: the parent of the current working directory
# (exactly one level up from wherever the kernel was started).
# NOTE(review): confirm the notebook is always launched from a folder that
# sits directly below the project root, otherwise the data paths break.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Intermediate artefacts are read from and written to the same folder,
# <base_dir>/data/intermediate.
input_dir = output_dir = os.path.join(base_dir, "data", "intermediate")

In [None]:
# Naive Bayes
#  Suppress warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

# Initialize Lemmatizer
# A single module-level WordNetLemmatizer instance: prepare_text() below calls
# lemmatizer.lemmatize() on every token, so it is created once here rather
# than once per review.
# NOTE(review): presumably requires the NLTK WordNet data to be available
# locally — confirm against the project's setup instructions.
lemmatizer = WordNetLemmatizer()

# Function for Text Preprocessing (Lemmatization & Cleaning)


def prepare_text(text):
    """Normalise one review: lowercase, drop non-letters, tokenize, lemmatize.

    Parameters
    ----------
    text : str or None
        Raw review text. Anything that is not a string (None, NaN, numbers)
        is treated as an empty review.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; "" for missing input.
    """
    # A missing review would otherwise raise AttributeError on .lower() and
    # abort the whole Series.apply() over the text column.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Convert to lowercase
    # Characters outside a-z/whitespace are deleted (not replaced by a space),
    # so digits, punctuation and accented letters vanish: "don't" -> "dont".
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = word_tokenize(text)  # Tokenization
    # Lemmatize token by token and join back into one string
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)


# Load dataset
data_file = os.path.join(base_dir, "data", "raw", "reviews_2010.json")
df = pd.read_json(data_file, lines=True)

# Count exclamation marks on the RAW text. prepare_text() deletes every
# character outside [a-z\s], so counting "!" after cleaning always gives 0.
# Missing reviews count as 0 exclamation marks.
df["num_exclamation"] = df["text"].str.count("!").fillna(0).astype(int)

# Apply preprocessing to text column (df["text"] holds the cleaned text from here on)
df["text"] = df["text"].apply(prepare_text)

# Define Multi-Class Sentiment Labels
# stars <= 2 -> 0 (Negative), 3 -> 1 (Neutral), 4 -> 2 (Positive), anything else -> 3 (Very Positive)
df["sentiment"] = df["stars"].apply(
    lambda x: 0 if x <= 2 else (1 if x == 3 else (2 if x == 4 else 3))
)

# Feature Engineering - hand-crafted features computed on the cleaned text
df["review_length"] = df["text"].apply(lambda x: len(x.split()))
df["avg_word_length"] = df["text"].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if x.split() else 0
)

# Convert additional features to a numpy array (column order matters downstream)
X_additional = df[["review_length", "num_exclamation", "avg_word_length"]].to_numpy()
y = df["sentiment"].values

# Split Data
# The split is decided on row positions BEFORE any feature extractor is
# fitted, so vocabulary, IDF weights and chi2 scores never see the hold-out rows.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, stratify=y, random_state=42
)

# Enhanced TF-IDF representation with expanded stopwords
custom_stopwords = list(
    set(ENGLISH_STOP_WORDS).union(
        {"great", "good", "bad", "nice", "product", "service"}
    )
)
vectorizer = TfidfVectorizer(
    max_features=150000,
    sublinear_tf=True,
    max_df=0.7,
    min_df=3,
    ngram_range=(1, 2),
    stop_words=custom_stopwords,
    norm="l2",
)
# Learn vocabulary/IDF from the training reviews only, then encode every row
vectorizer.fit(df["text"].iloc[train_idx])
X_tfidf = vectorizer.transform(df["text"])

# Feature Selection (Optional - Can Be Disabled)
USE_CHI2_SELECTION = False  # Set to True if you want to enable feature selection

if USE_CHI2_SELECTION:
    chi2_selector = SelectKBest(chi2, k=min(25000, X_tfidf.shape[1]))
    # chi2 uses the labels, so it is scored on the training rows only
    chi2_selector.fit(X_tfidf[train_idx], y[train_idx])
    X_tfidf = chi2_selector.transform(X_tfidf)

# Combine TF-IDF with additional features (CSR so rows can be indexed directly)
# NOTE(review): review_length is a raw count while the TF-IDF columns are
# L2-normalised; presumably it carries far more weight in ComplementNB than
# any single term — consider scaling the extra features.
X_final = hstack((X_tfidf, X_additional), format="csr")

X_train, X_test = X_final[train_idx], X_final[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Optimize Naive Bayes Model with Expanded Hyperparameter Tuning
param_grid = {"alpha": np.linspace(0.001, 2.0, 20)}  # Expanded range for better tuning
grid_search = GridSearchCV(
    ComplementNB(), param_grid, cv=10, scoring="accuracy", n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_["alpha"]

# Optimized Naive Bayes Model
# GridSearchCV (refit left at its default) has already retrained the best
# configuration on the full training split; reuse it instead of fitting again.
nb_classifier = grid_search.best_estimator_

# Make Predictions
y_pred = nb_classifier.predict(X_test)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
print(f"✅  Accuracy: {accuracy:.4f}")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=["Negative", "Neutral", "Positive", "Very Positive"],
    )
)

# Perform Stratified Cross-Validation
# NOTE(review): this runs over ALL rows (hold-out included) with alpha tuned
# on the training split and features fitted outside the folds — read it as a
# stability check rather than an unbiased performance estimate.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(nb_classifier, X_final, y, cv=skf, n_jobs=-1)
print(f" Stratified Cross-validation accuracy: {cv_scores.mean():.4f}")

Multinomial Logistic Regression algorithm

In [None]:
# Read the bag-of-words matrix from the intermediate data folder.
# Note: this rebinds df / X_train / X_test / y_train / y_test / y_pred that
# the Naive Bayes cell above also defines.
file_path = os.path.join(input_dir, "bow_vector.csv")
df = pd.read_csv(file_path)

# Column layout assumed by the slicing below:
#   column 0  -> review text (not fed to the model)
#   column 1  -> sentiment label (target)
#   column 2+ -> numerical bag-of-words features
y = df.iloc[:, 1]
X = df.iloc[:, 2:]

# Stratified 80/20 hold-out split with a fixed seed
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Multinomial logistic regression, L-BFGS solver, at most 500 iterations
# NOTE(review): newer scikit-learn releases reportedly deprecate the
# `multi_class` argument — verify against the version pinned for this project.
model = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=500)
model.fit(X_train, y_train)

# Predict the held-out labels and print the per-class metrics
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)