In [1]:
import numpy as np
import pandas as pd
import mlflow

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sudhirjoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Point the MLflow client at the tracking server and select the experiment
# that every run started later in this notebook is recorded under.
MLFLOW_TRACKING_URI = "http://13.60.79.0:5000"
EXPERIMENT_NAME = "Exp 3 - TfIdf Bigram max_features"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2024/12/29 12:08:23 INFO mlflow.tracking.fluent: Experiment with name 'Exp 3 - TfIdf Bigram max_features' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://capstone-yt-mlflow-bucket/584579609332876679', creation_time=1735470503910, experiment_id='584579609332876679', last_update_time=1735470503910, lifecycle_stage='active', name='Exp 3 - TfIdf Bigram max_features', tags={}>

In [8]:
# Read the preprocessed comments and discard rows whose cleaned text is missing.
df = (
    pd.read_csv('reddit_preprocessing.csv')
    .dropna(subset=['clean_comment'])
)
df.shape

(36662, 2)

In [10]:
def run_experiment_tfidf_max_features(max_features):
    """Train a Random Forest on TF-IDF (1, 2)-gram features and log it to MLflow.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    makes a stratified 80/20 train/test split, fits a ``TfidfVectorizer`` on the
    training text only, trains a ``RandomForestClassifier`` and records the
    parameters, accuracy, per-class classification-report metrics, a confusion
    matrix image and the fitted model in a new MLflow run.

    Parameters
    ----------
    max_features
        Forwarded to ``TfidfVectorizer(max_features=...)``; also embedded in the
        run name, description, plot title and the logged model's artifact path.

    Returns
    -------
    None
        All results are recorded as side effects in the active MLflow
        experiment.
    """
    ngram_range = (1, 2)
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_comment'],
        df['category'],
        test_size=0.2,
        random_state=42,
        stratify=df['category'],
    )

    # Vectorization using TF-IDF with varying max_features.
    # The vocabulary is fitted on the training split only, then applied to test.
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Define and train a Random Forest model
    with mlflow.start_run():
        # Set tags for the experiment and run
        mlflow.set_tag("mlflow.runName", f"TFIDF_Bigrams_max_features_{max_features}")
        mlflow.set_tag("experiment_type", "feature_engineering")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        # Add a description
        mlflow.set_tag("description", f"RandomForest with TF-IDF Bigrams, max_features={max_features}")

        # Log vectorizer parameters
        mlflow.log_param("vectorizer_type", "TF-IDF")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", max_features)

        # Log Random Forest parameters
        n_estimators = 200
        max_depth = 15

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        # Initialize and train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions and log metrics
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report: only the nested per-label / average
        # entries are dicts; the scalar "accuracy" entry is skipped here
        # because it is already logged above.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix.
        # The sorted union of true and predicted labels is the same label set
        # confusion_matrix uses by default; computing it explicitly lets the
        # heatmap ticks show the real class labels instead of 0..n-1 positions.
        class_labels = np.union1d(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

        # Large enough for the annotations and the long title to be readable.
        fig, ax = plt.subplots(figsize=(6, 5))
        try:
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt="d",
                cmap="Blues",
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: TF-IDF Bigrams, max_features={max_features}")
            fig.tight_layout()
            # Upload the figure directly under the same artifact name, without
            # leaving a stray PNG in the notebook's working directory.
            mlflow.log_figure(fig, "confusion_matrix.png")
        finally:
            # Always release the figure, even if plotting or logging fails.
            plt.close(fig)

        # Log the model
        mlflow.sklearn.log_model(model, f"random_forest_model_tfidf_bigrams_{max_features}")

# Sweep the TF-IDF vocabulary cap from 1000 to 10000 in steps of 1000,
# producing one MLflow run per setting.
max_features_values = list(range(1000, 10001, 1000))

for max_features in max_features_values:
    run_experiment_tfidf_max_features(max_features)



🏃 View run TFIDF_Bigrams_max_features_1000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/8c42e71eaeeb41809fe0b2d65b651170
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_2000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/d993209c60ce476985075aa8f610b664
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_3000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/2200ecc6810c42ce8fb3e00890c0d334
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_4000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/7a93f722f3674591a35ae9674d782800
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_5000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/d03438296c224d889306e60b1a6b9c0a
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_6000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/23cf5bc29648448c999897ed222a90de
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_7000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/1183a733ab0d4821b550eab077b5efbc
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_8000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/bce5c37fcac8473193e3a41e78d69fad
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_9000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/20941453e4ff45109872f7f17b4ec51e
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679




🏃 View run TFIDF_Bigrams_max_features_10000 at: http://13.60.79.0:5000/#/experiments/584579609332876679/runs/77d78d3b89204eb1a9584dd6d6ec210b
🧪 View experiment at: http://13.60.79.0:5000/#/experiments/584579609332876679
