In [5]:
import numpy as np
import pandas as pd
import mlflow

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora used for stop-word lists and WordNet lemmatization.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Tracking server and experiment that the runs in this notebook are logged to.
TRACKING_URI = "http://13.60.79.0:5000"
EXPERIMENT_NAME = "Exp 2 - BoW vs TfIdf"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell displays the selected Experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://capstone-yt-mlflow-bucket/117101972101598833', creation_time=1735469235239, experiment_id='117101972101598833', last_update_time=1735469235239, lifecycle_stage='active', name='Exp 2 - BoW vs TfIdf', tags={}>

In [7]:
# Load the preprocessed comments and drop rows whose cleaned text is missing.
df = (
    pd.read_csv('reddit_preprocessing.csv')
    .dropna(subset=['clean_comment'])
)
df.shape

(36662, 2)

In [8]:
def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name,
                   n_estimators=200, max_depth=15, data=None):
    """Train a Random Forest on vectorized comments and log the run to MLflow.

    The comments are split 80/20 (stratified on the label, seed 42), converted
    to features by a vectorizer that is fitted on the training split only, and
    used to fit a ``RandomForestClassifier``. Tags, parameters, accuracy, the
    flattened classification report, a confusion-matrix figure and the fitted
    classifier are logged as a single MLflow run.

    Parameters
    ----------
    vectorizer_type : str
        ``"BoW"`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer`` (the value is logged as-is, so pass a meaningful
        name such as ``"TF-IDF"``).
    ngram_range : tuple
        Forwarded to the vectorizer's ``ngram_range`` argument.
    vectorizer_max_features : int
        Forwarded to the vectorizer's ``max_features`` argument.
    vectorizer_name : str
        Human-readable vectorizer label used in the run name, description,
        plot title and model artifact path.
    n_estimators : int, default 200
        Number of trees in the forest.
    max_depth : int, default 15
        Maximum depth of each tree.
    data : optional
        Object indexable by ``'clean_comment'`` (text) and ``'category'``
        (label). When omitted, the notebook-level ``df`` is used, which keeps
        existing calls working unchanged.

    Returns
    -------
    None
    """
    if data is None:
        # Backward-compatible default: fall back to the frame loaded earlier
        # in the notebook.
        data = df

    labels = data['category']
    texts_train, texts_test, y_train, y_test = train_test_split(
        data['clean_comment'],
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    # Vectorization
    if vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

    # Fit on the training split only so no test-set vocabulary leaks in.
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    with mlflow.start_run():
        # Tags, including the run name and a free-text description, sent in
        # one request instead of one request per tag.
        mlflow.set_tags({
            "mlflow.runName": f"{vectorizer_name}_{ngram_range}_RandomForest",
            "experiment_type": "feature_engineering",
            "model_type": "RandomForestClassifier",
            "description": f"RandomForest with {vectorizer_name}, ngram_range={ngram_range}, max_features={vectorizer_max_features}",
        })

        # Vectorizer and Random Forest parameters, also batched.
        mlflow.log_params({
            "vectorizer_type": vectorizer_type,
            "ngram_range": ngram_range,
            "vectorizer_max_features": vectorizer_max_features,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
        })

        # Initialize and train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Accuracy plus every per-label / averaged entry of the classification
        # report, flattened to "<label>_<metric>" and logged in one batch.
        metrics = {"accuracy": float(accuracy_score(y_test, y_pred))}
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, label_scores in classification_rep.items():
            # The report's top-level "accuracy" entry is a bare float, not a
            # dict; it is skipped here because accuracy is logged above.
            if isinstance(label_scores, dict):
                for metric_name, value in label_scores.items():
                    metrics[f"{label}_{metric_name}"] = float(value)
        mlflow.log_metrics(metrics)

        # Confusion matrix, with rows/columns pinned to the classifier's class
        # order so the tick labels show the real category values instead of
        # positional indices.
        class_labels = model.classes_
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt="d",
                cmap="Blues",
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: {vectorizer_name}, {ngram_range}")
            # Logs the figure straight to the run; nothing is left behind in
            # the working directory.
            mlflow.log_figure(fig, "confusion_matrix.png")
        finally:
            # Always release the figure, even if plotting or logging fails.
            plt.close(fig)

        # Log the model
        # NOTE(review): only the classifier is logged; the fitted vectorizer is
        # not, so the logged model alone cannot score raw text.
        mlflow.sklearn.log_model(model, f"random_forest_model_{vectorizer_name}_{ngram_range}")

# Sweep both vectorizers over growing n-gram windows:
# (1, 1) = unigrams, (1, 2) = uni+bigrams, (1, 3) = uni+bi+trigrams.
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = 5000  # vocabulary cap shared by every run

for ngram_range in ngram_ranges:
    # For each window, log a BoW run first and then the matching TF-IDF run.
    for vectorizer_type in ("BoW", "TF-IDF"):
        run_experiment(vectorizer_type, ngram_range, max_features, vectorizer_name=vectorizer_type)




🏃 View run BoW_(1, 1)_RandomForest at: http://13.60.79.0:5000/#/experiments/117101972101598833/runs/0f43ae5c130144d3a3df185b2c2bc036
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/117101972101598833




🏃 View run TF-IDF_(1, 1)_RandomForest at: http://13.60.79.0:5000/#/experiments/117101972101598833/runs/b5d51509951d42b2bed14cc12004ee23
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/117101972101598833




🏃 View run BoW_(1, 2)_RandomForest at: http://13.60.79.0:5000/#/experiments/117101972101598833/runs/ab3534ac00bf49ee955f1e9c2ec700e3
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/117101972101598833




🏃 View run TF-IDF_(1, 2)_RandomForest at: http://13.60.79.0:5000/#/experiments/117101972101598833/runs/ccfff6f7a2904854ba39bc8292bcf498
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/117101972101598833




🏃 View run BoW_(1, 3)_RandomForest at: http://13.60.79.0:5000/#/experiments/117101972101598833/runs/34293e2ed39e49e4bcfb9ab23e4b938d
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/117101972101598833




🏃 View run TF-IDF_(1, 3)_RandomForest at: http://13.60.79.0:5000/#/experiments/117101972101598833/runs/69825ab05ed149609be52d8b02dc763a
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/117101972101598833
