In [None]:
import numpy as np
import pandas as pd
import mlflow

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora used by the stopword list and the WordNet lemmatizer imported above.
for corpus_id in ('stopwords', 'wordnet'):
    nltk.download(corpus_id)

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Point MLflow at the tracking server and select the experiment that
# subsequent runs are recorded under.
TRACKING_URI = "http://13.60.79.0:5000"
EXPERIMENT_NAME = "Exp 5 - ML Algos with HP Tuning"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Load the preprocessed comments, then drop rows whose cleaned text is missing.
df = pd.read_csv('reddit_preprocessing.csv')
df = df.dropna(subset=['clean_comment'])
df.shape

In [None]:

# TF-IDF settings: unigrams and bigrams, vocabulary capped at 1000 terms
ngram_range = (1, 2)
max_features = 1000

# Hold out 20% of the rows for testing, stratified on the label. The split
# happens before vectorization and resampling, so both are fitted on the
# training rows only.
comments = df['clean_comment']
labels = df['category']
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Fit the TF-IDF vectorizer on the training split, then apply it to the test split
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Rebalance the training split with ADASYN. X_train_vec and y_train are
# rebound to the resampled data; the test split is left as is.
adasyn = ADASYN(random_state=42)
X_train_vec, y_train = adasyn.fit_resample(X_train_vec, y_train)

# Function to log model and metrics in MLFlow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, evaluate it on the test data and record the result as one MLflow run.

    The run is named ``"<model_name>_ADASYN_TFIDF_Bigrams"`` and stores the
    estimator's hyperparameters, the test accuracy, every per-label entry of
    the classification report, and the fitted model itself.

    NOTE: ``log_mlflow`` is defined more than once in this cell; the last
    definition is the one in effect when the function is called.

    Args:
        model_name: Short algorithm name; used in the run name, the
            ``algo_name`` parameter and the model artifact path.
        model: Unfitted scikit-learn compatible estimator (must provide
            ``get_params``, ``fit`` and ``predict``).
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels.
        y_test: Test labels used for scoring.
    """
    with mlflow.start_run(run_name=f"{model_name}_ADASYN_TFIDF_Bigrams"):
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Record which algorithm this is and the exact hyperparameters used,
        # so the run can be reproduced from the tracking server alone.
        mlflow.log_param("algo_name", model_name)
        mlflow.log_params(model.get_params())

        # Train model and evaluate
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        run_metrics = {"accuracy": accuracy_score(y_test, y_pred)}

        # Flatten the classification report into "<label>_<metric>" entries.
        # Non-dict entries (the scalar "accuracy" item) are skipped.
        report = classification_report(y_test, y_pred, output_dict=True)
        for label, scores in report.items():
            if not isinstance(scores, dict):
                continue
            for metric, value in scores.items():
                run_metrics[f"{label}_{metric}"] = value

        # Send all metrics in a single call instead of one request per metric.
        mlflow.log_metrics(run_metrics)

        # Log the model itself
        mlflow.sklearn.log_model(model, f"{model_name}_model")

# Optuna objective function for KNN
def objective_knn(trial):
    """Optuna objective: accuracy of one sampled KNN configuration.

    Samples ``n_neighbors``, ``weights``, ``algorithm`` and ``leaf_size`` from
    ``trial``, fits a ``KNeighborsClassifier`` on the module-level
    ``X_train_vec`` / ``y_train`` and returns its accuracy on
    ``X_test_vec`` / ``y_test``.

    NOTE(review): the value being maximised is measured on the test split, so
    the selected hyperparameters are tuned against the same data later used
    for reporting; reported test metrics are likely optimistic. Consider
    scoring on a separate validation split instead.
    NOTE(review): X_train_vec looks like a sparse TF-IDF matrix; scikit-learn
    is documented to fall back to brute-force search for sparse input, so
    ``algorithm`` and ``leaf_size`` probably have no effect here - confirm.
    """
    # Keep the suggest_* calls in this order so the sampling sequence is unchanged.
    knn_kwargs = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 20),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "algorithm": trial.suggest_categorical(
            "algorithm", ["auto", "ball_tree", "kd_tree", "brute"]
        ),
        "leaf_size": trial.suggest_int("leaf_size", 10, 50),
    }

    candidate = KNeighborsClassifier(**knn_kwargs)
    candidate.fit(X_train_vec, y_train)

    return accuracy_score(y_test, candidate.predict(X_test_vec))

# Function to log model and metrics in MLFlow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, evaluate it on the test data and record the result as one MLflow run.

    The run is named ``"<model_name>_ADASYN_TFIDF_Bigrams"`` and stores the
    estimator's hyperparameters, the test accuracy, every per-label entry of
    the classification report, and the fitted model itself.

    NOTE: ``log_mlflow`` is defined more than once in this cell; the last
    definition is the one in effect when the function is called.

    Args:
        model_name: Short algorithm name; used in the run name, the
            ``algo_name`` parameter and the model artifact path.
        model: Unfitted scikit-learn compatible estimator (must provide
            ``get_params``, ``fit`` and ``predict``).
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels.
        y_test: Test labels used for scoring.
    """
    with mlflow.start_run(run_name=f"{model_name}_ADASYN_TFIDF_Bigrams"):
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Record which algorithm this is and the exact hyperparameters used,
        # so the run can be reproduced from the tracking server alone.
        mlflow.log_param("algo_name", model_name)
        mlflow.log_params(model.get_params())

        # Train model and evaluate
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        run_metrics = {"accuracy": accuracy_score(y_test, y_pred)}

        # Flatten the classification report into "<label>_<metric>" entries.
        # Non-dict entries (the scalar "accuracy" item) are skipped.
        report = classification_report(y_test, y_pred, output_dict=True)
        for label, scores in report.items():
            if not isinstance(scores, dict):
                continue
            for metric, value in scores.items():
                run_metrics[f"{label}_{metric}"] = value

        # Send all metrics in a single call instead of one request per metric.
        mlflow.log_metrics(run_metrics)

        # Log the model itself
        mlflow.sklearn.log_model(model, f"{model_name}_model")

# Run Optuna to optimize KNN hyperparameters
def run_optuna_experiment(n_trials: int = 30) -> None:
    """Tune KNN with Optuna, then fit and log the best configuration to MLflow.

    Creates a study that maximises the value returned by ``objective_knn``,
    runs ``n_trials`` trials, builds a fresh ``KNeighborsClassifier`` from the
    best parameters and hands it to ``log_mlflow`` together with the
    module-level ``X_train_vec``, ``X_test_vec``, ``y_train`` and ``y_test``.

    Args:
        n_trials: Number of Optuna trials to run. Defaults to 30.
    """
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_knn, n_trials=n_trials)

    # best_params holds exactly the names suggested in objective_knn
    # (n_neighbors, weights, algorithm, leaf_size), all of which are
    # KNeighborsClassifier keyword arguments.
    best_model = KNeighborsClassifier(**study.best_params)

    # log_mlflow fits the model itself, so an unfitted estimator is passed.
    log_mlflow("KNN", best_model, X_train_vec, X_test_vec, y_train, y_test)

# Run the Optuna experiment for KNN: tune the hyperparameters, then log the
# best model to MLflow
run_optuna_experiment()