In [36]:
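# One-time environment setup (uncomment to run). `%pip` installs into the
# environment of the running kernel, which a bare `!pip` does not guarantee.
# %pip install mlflow boto3 awscli
# %pip install imbalanced-learn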

In [37]:
import os

import mlflow

# Step 1: Point MLflow at the tracking server.
# The server address is environment-specific (an EC2 public DNS name), so allow
# it to be overridden through the MLFLOW_TRACKING_URI environment variable and
# fall back to the original address when the variable is not set.
DEFAULT_TRACKING_URI = "http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/"
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", DEFAULT_TRACKING_URI))

In [38]:
# Select the experiment that every run in this notebook is recorded under
# (MLflow creates it on first use). The call is left as the last expression
# so the resulting Experiment object is displayed as the cell output.
EXPERIMENT_NAME = "Exp 4 - Handling Imbalanced Data"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://reddit-reccomender-bucket/10', creation_time=1763367088685, experiment_id='10', last_update_time=1763367088685, lifecycle_stage='active', name='Exp 4 - Handling Imbalanced Data', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [39]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import RUSBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os


In [40]:
# Read the preprocessed comments, then discard rows whose cleaned text is missing.
df = pd.read_csv('reddit_preprocessing.csv')
df = df.dropna(subset=['clean_comment'])

# Show (rows, columns) as the cell output.
df.shape

(36662, 2)

In [41]:
# Step 0: Hold out 20% of the rows for testing, stratified on `category` so
# both splits keep the same class proportions. The seed is fixed for repeatability.
comments, labels = df['clean_comment'], df['category']
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

#step 1: Function to run the experiment
def run_imbalanced_experiment(imbalance_method, X_train_raw, X_test_raw, y_train, y_test):
    """Train a Random Forest on bag-of-words features with one imbalance-handling
    strategy and record the run in MLflow.

    The text is vectorized with a ``CountVectorizer`` fitted on the training
    split only. The chosen strategy is applied to the training data only, the
    model is evaluated on the untouched test split, and parameters, metrics,
    a confusion-matrix image and the fitted model are logged to a new MLflow run.

    Parameters
    ----------
    imbalance_method : str
        One of ``"class_weights"``, ``"oversampling"`` (SMOTE), ``"adasyn"``,
        ``"undersampling"`` (RandomUnderSampler), ``"smote_enn"`` or
        ``"smote_tomek"``.
    X_train_raw, X_test_raw
        Raw training / test documents passed to the vectorizer.
    y_train, y_test
        Labels for the training / test documents.

    Raises
    ------
    ValueError
        If ``imbalance_method`` is not one of the supported names. Without this
        check an unknown name would train with no balancing at all while still
        being logged under that name.
    """
    supported_methods = (
        "class_weights",
        "oversampling",
        "adasyn",
        "undersampling",
        "smote_enn",
        "smote_tomek",
    )
    # Validate before doing any work so a typo cannot produce a mislabeled run.
    if imbalance_method not in supported_methods:
        raise ValueError(
            f"Unknown imbalance_method {imbalance_method!r}; "
            f"expected one of {supported_methods}"
        )

    ngram_range = (1, 3)
    max_features = 1000

    # Step 2: Vectorization using BOW, fitted on the training data only
    vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train_raw)
    X_test_vec = vectorizer.transform(X_test_raw)

    # Step 3: Handle class imbalance (only ever applied to the training set)
    if imbalance_method == "class_weights":
        # Let the Random Forest reweight classes itself; no resampling.
        class_weight = 'balanced'
    else:
        # Resampling and class weights are not combined.
        class_weight = None
        resampler_classes = {
            'oversampling': SMOTE,
            'adasyn': ADASYN,
            'undersampling': RandomUnderSampler,
            'smote_enn': SMOTEENN,
            'smote_tomek': SMOTETomek,
        }
        resampler = resampler_classes[imbalance_method](random_state=42)
        X_train_vec, y_train = resampler.fit_resample(X_train_vec, y_train)

    # Step 4: Define and train a Random Forest model inside a fresh MLflow run
    with mlflow.start_run():
        # Set tags for the experiment and run
        mlflow.set_tag("mlflow.runName", f"Imbalance_{imbalance_method}_RandomForest_BOW_Trigram")
        mlflow.set_tag("experiment_type", "imbalance_handling")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        # Add a description
        mlflow.set_tag("description", f"RandomForest with BOW Trigrams, imbalance handling method={imbalance_method}")

        # Log vectorizer parameters
        mlflow.log_param("vectorizer_type", "BOW")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", max_features)

        # Log Random Forest parameters
        n_estimators = 200
        max_depth = 15

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("imbalance_method", imbalance_method)

        # Initialize and train the model
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            class_weight=class_weight,
        )
        model.fit(X_train_vec, y_train)

        # Step 5: Make predictions and log metrics
        y_pred = model.predict(X_test_vec)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log every per-label / averaged entry of the classification report.
        # Scalar entries of the report (not dicts) are skipped.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        filename = f"confusion_matrix/conf_matrix_{imbalance_method}.png"
        # savefig does not create missing directories; make sure the output
        # folder exists so the function works from a clean working directory.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues", ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: BOW Trigrams, Imbalance={imbalance_method}")
            fig.savefig(filename)
            mlflow.log_artifact(filename)
        finally:
            # Always release the figure, even if saving or uploading fails.
            plt.close(fig)

        # Log the model
        mlflow.sklearn.log_model(model, f"random_forest_model_BOW_trigrams_{imbalance_method}")

# Step 6: Run one experiment per imbalance-handling method, all on the same split
imbalance_methods = [
    'class_weights',
    'oversampling',
    'adasyn',
    'undersampling',
    'smote_enn',
    'smote_tomek',
]

for method in imbalance_methods:
    run_imbalanced_experiment(
        imbalance_method=method,
        X_train_raw=X_train,
        X_test_raw=X_test,
        y_train=y_train,
        y_test=y_test,
    )






üèÉ View run Imbalance_class_weights_RandomForest_BOW_Trigram at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10/runs/aa13fad6bbb44c759edd2a21af6dc620
üß™ View experiment at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10




üèÉ View run Imbalance_oversampling_RandomForest_BOW_Trigram at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10/runs/67cdc1fbfd52453a82ba5a31048af97d
üß™ View experiment at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10




üèÉ View run Imbalance_adasyn_RandomForest_BOW_Trigram at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10/runs/8e6a835addd246b3a4b684795b587a9e
üß™ View experiment at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10




üèÉ View run Imbalance_undersampling_RandomForest_BOW_Trigram at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10/runs/3c9a865bbe064b648495d7589b18b7e3
üß™ View experiment at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10




üèÉ View run Imbalance_smote_enn_RandomForest_BOW_Trigram at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10/runs/fbe11e8844d54dac942fd0dc1f11517d
üß™ View experiment at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10




üèÉ View run Imbalance_smote_tomek_RandomForest_BOW_Trigram at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10/runs/a4ebe373688446aebac9807718046def
üß™ View experiment at: http://ec2-51-21-223-34.eu-north-1.compute.amazonaws.com:5000/#/experiments/10
