In [6]:
import numpy as np
import pandas as pd
import mlflow

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# MLflow destination for this notebook: the tracking server that receives
# the runs, and the experiment they are grouped under.
TRACKING_URI = "http://13.60.79.0:5000"
EXPERIMENT_NAME = "Exp 5 - ML Algos with HP Tuning"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://capstone-yt-mlflow-bucket/542391321365143307', creation_time=1735485482149, experiment_id='542391321365143307', last_update_time=1735485482149, lifecycle_stage='active', name='Exp 5 - ML Algos with HP Tuning', tags={}>

In [8]:
# Read the preprocessed comments and discard rows whose cleaned text is missing.
df = (
    pd.read_csv('reddit_preprocessing.csv')
    .dropna(subset=['clean_comment'])
)
df.shape

(36662, 2)

In [11]:

# TF-IDF settings: unigrams + bigrams, vocabulary capped at 1000 terms
ngram_range = (1, 2)
max_features = 1000

# Split first so that neither the vectorizer nor the resampler ever sees
# the held-out rows; stratify to keep class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

# Learn the TF-IDF vocabulary on the training text only, then reuse it
# unchanged for the test text.
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Oversample the training set with ADASYN; the test set is left untouched.
# Note: this rebinds X_train_vec / y_train to the resampled versions.
adasyn = ADASYN(random_state=42)
X_train_vec, y_train = adasyn.fit_resample(X_train_vec, y_train)

def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, evaluate it, and record everything in a new MLflow run.

    Inside a single ``mlflow.start_run()`` context this:
      * tags the run with its name (``<model_name>_ADASYN_TFIDF_Bigrams``)
        and an ``experiment_type`` of ``algorithm_comparison``;
      * logs ``model_name`` as the ``algo_name`` parameter;
      * fits ``model`` on ``X_train`` / ``y_train`` and predicts ``X_test``;
      * logs overall accuracy plus every per-label entry of the
        classification report as ``<label>_<metric>``;
      * logs the fitted model under the artifact path ``<model_name>_model``.

    Args:
        model_name: Short algorithm name used in the run name, the
            ``algo_name`` parameter and the model artifact path.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted
            in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
    """
    with mlflow.start_run():
        # Run identification
        mlflow.set_tag("mlflow.runName", f"{model_name}_ADASYN_TFIDF_Bigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")
        mlflow.log_param("algo_name", model_name)

        # Fit on the training data, predict the evaluation data
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Headline metric
        mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))

        # Per-label / per-average metrics from the classification report
        report = classification_report(y_test, predictions, output_dict=True)
        for label, scores in report.items():
            if not isinstance(scores, dict):
                # Non-dict entries carry no per-metric breakdown; skip them.
                continue
            for score_name, score in scores.items():
                mlflow.log_metric(f"{label}_{score_name}", score)

        # Persist the fitted estimator with the run
        mlflow.sklearn.log_model(model, f"{model_name}_model")

def objective_logistic_regression(trial):
    """Optuna objective: accuracy of a LogisticRegression for one trial.

    Samples ``penalty``, ``C`` and (for elastic-net only) ``l1_ratio`` from
    ``trial``, fits the model on the notebook-level ``X_train_vec`` /
    ``y_train`` and scores it on ``X_test_vec`` / ``y_test``.

    Args:
        trial: The ``optuna.trial.Trial`` supplying hyperparameter values.

    Returns:
        Accuracy of the fitted model on the test split (to be maximized).
    """
    # NOTE(review): the score below is computed on the same test split that
    # log_mlflow later reports on, so the accuracy logged for the selected
    # model is optimistically biased. Consider tuning on a validation split
    # carved out of the training data instead.
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet", None])

    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform; same name, range and distribution.
    # C is sampled for every penalty so that best_params always contains it.
    C = trial.suggest_float("C", 1e-4, 10.0, log=True)

    # l1_ratio is only sampled (and only passed as non-None) for elastic-net.
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0) if penalty == "elasticnet" else None

    # L1-containing penalties are paired with saga; everything else with lbfgs.
    solver = "saga" if penalty in ["l1", "elasticnet"] else "lbfgs"

    model = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        l1_ratio=l1_ratio,
        random_state=42,
        max_iter=1000
    )

    # Train and evaluate the model
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

def run_optuna_experiment(n_trials=30):
    """Tune LogisticRegression with Optuna, then log the best model to MLflow.

    Creates an in-memory study that maximizes the value returned by
    ``objective_logistic_regression``, rebuilds a fresh LogisticRegression
    from the best hyperparameters, and hands it to ``log_mlflow`` (which
    fits, evaluates and logs it).

    Args:
        n_trials: Number of Optuna trials to run. Defaults to 30, the value
            previously hard-coded here.
    """
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_logistic_regression, n_trials=n_trials)

    # Rebuild the winning configuration with the same penalty -> solver
    # pairing the objective uses.
    best_params = study.best_params
    best_penalty = best_params['penalty']
    best_model = LogisticRegression(
        penalty=best_penalty,
        C=best_params['C'],
        solver="saga" if best_penalty in ["l1", "elasticnet"] else "lbfgs",
        # 'l1_ratio' is only present when the best penalty was elastic-net;
        # .get() yields None otherwise.
        l1_ratio=best_params.get('l1_ratio'),
        random_state=42,
        max_iter=1000
    )

    # Fit, evaluate and log the best model with MLflow
    log_mlflow("LogisticRegression", best_model, X_train_vec, X_test_vec, y_train, y_test)

# Run the Optuna experiment for Logistic Regression: tune the
# hyperparameters, rebuild the best configuration, and log it to MLflow.
run_optuna_experiment()

[I 2024-12-29 18:36:02,960] A new study created in memory with name: no-name-5c4ecf0d-9d16-4ac2-b8d0-c9981756e0e3
  C = trial.suggest_loguniform("C", 1e-4, 10.0)
[I 2024-12-29 18:36:03,869] Trial 0 finished with value: 0.7758079912723306 and parameters: {'penalty': None, 'C': 0.005480838304888443}. Best is trial 0 with value: 0.7758079912723306.
  C = trial.suggest_loguniform("C", 1e-4, 10.0)
[I 2024-12-29 18:36:04,277] Trial 1 finished with value: 0.7288967680349107 and parameters: {'penalty': 'l1', 'C': 0.05665568045749267}. Best is trial 0 with value: 0.7758079912723306.
  C = trial.suggest_loguniform("C", 1e-4, 10.0)
[W 2024-12-29 18:36:30,869] Trial 2 failed with parameters: {'penalty': 'elasticnet', 'C': 3.7265206692884294, 'l1_ratio': 0.7358566814935423} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  

KeyboardInterrupt: 