In [1]:
import numpy as np
import pandas as pd
import mlflow

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
mlflow.set_tracking_uri("http://13.60.79.0:5000")
mlflow.set_experiment("Exp 5 - ML Algos with HP Tuning")

<Experiment: artifact_location='s3://capstone-yt-mlflow-bucket/542391321365143307', creation_time=1735485482149, experiment_id='542391321365143307', last_update_time=1735485482149, lifecycle_stage='active', name='Exp 5 - ML Algos with HP Tuning', tags={}>

In [3]:
df = pd.read_csv('reddit_preprocessing.csv').dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [7]:

# Define TF-IDF vectorization parameters
ngram_range = (1, 2)  # Bigram setting
max_features = 1000  # Set max_features to 1000 for TF-IDF

# Train-test split before vectorization and resampling
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)  # Fit on training data
X_test_vec = vectorizer.transform(X_test)  # Transform test data

# Handle imbalance using ADASYN
adasyn = ADASYN(random_state=42)
X_train_vec, y_train = adasyn.fit_resample(X_train_vec, y_train)

# Function to log model and metrics in MLFlow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model name and experiment type
        mlflow.set_tag("mlflow.runName", f"{model_name}_ADASYN_TFIDF_Bigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log model algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model and evaluate
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log detailed classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model itself
        mlflow.sklearn.log_model(model, f"{model_name}_model")


# Optuna objective function for Support Vector Machine (SVM)
def objective_svm(trial):
    # Suggest hyperparameters
    C = trial.suggest_loguniform("C", 1e-4, 1e2)
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    
    params = {
        'C': C,
        'kernel': kernel,
        'gamma': gamma,
    }
    
    # Only add degree parameter if kernel is polynomial
    if kernel == "poly":
        params['degree'] = trial.suggest_int("degree", 1, 5)

    # Create the model with suggested parameters
    model = SVC(**params)

    # Fit the model
    model.fit(X_train_vec, y_train)

    # Calculate accuracy on the test set
    accuracy = model.score(X_test_vec, y_test)
    return accuracy

# Function to log model and metrics in MLFlow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model name and experiment type
        mlflow.set_tag("mlflow.runName", f"{model_name}_ADASYN_TFIDF_Bigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log model algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model and evaluate
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log detailed classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model itself
        mlflow.sklearn.log_model(model, f"{model_name}_model")

# Run Optuna to optimize SVM hyperparameters
def run_optuna_experiment():
    # Create an Optuna study
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_svm, n_trials=30)

    # Get the best hyperparameters and retrain the best model
    best_params = study.best_params
    
    # Create model parameters dictionary
    model_params = {
        'C': best_params['C'],
        'kernel': best_params['kernel'],
        'gamma': best_params['gamma'],
        'random_state': 42
    }
    
    # Only add degree parameter if kernel is polynomial
    if best_params['kernel'] == 'poly' and 'degree' in best_params:
        model_params['degree'] = best_params['degree']

    # Create the best model with the optimal parameters
    best_model = SVC(**model_params)

    # Log the best model with MLFlow
    log_mlflow("SVM", best_model, X_train_vec, X_test_vec, y_train, y_test)

# Run the Optuna experiment for SVM
run_optuna_experiment()

[I 2024-12-30 00:45:54,641] A new study created in memory with name: no-name-f64cde12-f86c-4295-aca5-aa2528dabe87
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2024-12-30 00:48:55,614] Trial 0 finished with value: 0.4378835401609164 and parameters: {'C': 0.010811416956509813, 'kernel': 'poly', 'gamma': 'scale', 'degree': 4}. Best is trial 0 with value: 0.4378835401609164.
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2024-12-30 00:50:36,796] Trial 1 finished with value: 0.7804445656620755 and parameters: {'C': 1.0264700365675228, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 1 with value: 0.7804445656620755.
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2024-12-30 01:11:31,640] Trial 2 finished with value: 0.5902086458475385 and parameters: {'C': 67.65233377818457, 'kernel': 'poly', 'gamma': 'scale', 'degree': 3}. Best is trial 1 with value: 0.7804445656620755.
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2024-12-30 01:14:19,291] Trial 3 finished with value: 0.43

🏃 View run SVM_ADASYN_TFIDF_Bigrams at: http://13.60.79.0:5000/#/experiments/542391321365143307/runs/e3fff0cc140b4e6683764af46ade372c
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/542391321365143307
