In [1]:
import os

import mlflow
import mlflow.sklearn

In [2]:
# Tracking server location. The MLFLOW_TRACKING_URI environment variable takes
# precedence so the notebook is not tied to a single host; the previous
# hard-coded address is kept as the fallback when the variable is unset or empty.
MLFLOW_TRACKING_URI = (
    os.environ.get("MLFLOW_TRACKING_URI")
    or "http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/"
)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [3]:
# Select the experiment under which the runs started later in this notebook are recorded.
mlflow.set_experiment("BoW v/s TFIdf")

<Experiment: artifact_location='s3://my-mlflow-bucket-123/734492199182626374', creation_time=1755371081657, experiment_id='734492199182626374', last_update_time=1755371081657, lifecycle_stage='active', name='BoW v/s TFIdf', tags={}>

In [4]:
import pandas as pd

In [5]:
# Load the dataset used by every experiment below; run_experiment reads its
# 'text' and 'label' columns.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index — consider index_col=0 after confirming nothing relies on it.
df = pd.read_csv("dataset.csv")

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(39574, 5)

In [7]:
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,text,label,count,num_stop,chars
0,grew b watching loving thunderbird mate school...,0,151,64,874
1,put movie dvd player sat coke chip expectation...,0,324,166,1791
2,people not know particular time past like feel...,0,184,87,983
3,even though great interest biblical movie bore...,0,69,36,351
4,im die hard dad army fan nothing ever change g...,1,173,71,953


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name,
                   data=None, n_estimators=200, max_depth=15):
    """Train a random forest on vectorized text and record the run in MLflow.

    The text is split 80/20 (fixed seed), vectorized, and used to fit a
    ``RandomForestClassifier``. Tags, parameters, accuracy, the flattened
    classification report, a confusion-matrix image and the fitted classifier
    are logged to a new MLflow run.

    Parameters
    ----------
    vectorizer_type : str
        ``"BoW"`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.
    ngram_range : tuple
        Passed to the vectorizer as ``ngram_range``.
    vectorizer_max_features : int
        Passed to the vectorizer as ``max_features``.
    vectorizer_name : str
        Label used in the run name, description tag and model artifact path.
    data : DataFrame-like, optional
        Object providing ``'text'`` and ``'label'`` columns. Defaults to the
        notebook-level ``df``.
    n_estimators : int, optional
        Number of trees in the forest (default 200).
    max_depth : int, optional
        Maximum tree depth (default 15).

    Returns
    -------
    None
        Results are recorded in MLflow. The confusion-matrix image is also
        written to ``confusion_matrix_experiment.png`` in the working directory
        and overwritten on every call.
    """
    if data is None:
        # Keep the original behaviour of reading the notebook-level frame.
        data = df

    if vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

    text_train, text_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training split only, then reuse it for the test split.
    x_train = vectorizer.fit_transform(text_train)
    x_test = vectorizer.transform(text_test)

    with mlflow.start_run(run_name=f"{vectorizer_name}_{ngram_range}_RF"):
        mlflow.set_tags({
            "experiment_type": "feature_engineering",
            "model_type": "RFClassifier",
            "description": f"RF with {vectorizer_name}, ngram_range={ngram_range}, max_features={vectorizer_max_features}",
        })

        mlflow.log_params({
            "vectorizer_type": vectorizer_type,
            "ngram_range": ngram_range,
            "vectorizer_max_features": vectorizer_max_features,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
        })

        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)

        # Collect every metric first so they reach the tracking server in one call.
        metrics_to_log = {"accuracy": accuracy_score(y_test, y_pred)}
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, label_metrics in classification_rep.items():
            # Only per-label / averaged entries are dicts; other entries are skipped.
            if isinstance(label_metrics, dict):
                for metric_name, value in label_metrics.items():
                    metrics_to_log[f"{label}_{metric_name}"] = value
        mlflow.log_metrics(metrics_to_log)

        conf_matrix = confusion_matrix(y_test, y_pred)
        # Draw on an explicit figure so stale pyplot state cannot leak into the
        # image, and always release the figure even if saving or logging fails.
        fig, ax = plt.subplots()
        try:
            # fmt="d" prints raw integer counts instead of the default
            # two-significant-digit format.
            sns.heatmap(conf_matrix, annot=True, fmt="d", ax=ax)
            ax.set(
                xlabel="Predicted label",
                ylabel="True label",
                title=f"Confusion matrix: {vectorizer_name}, ngram_range={ngram_range}",
            )
            fig.savefig("confusion_matrix_experiment.png")
            mlflow.log_artifact("confusion_matrix_experiment.png")
        finally:
            plt.close(fig)

        # Only the classifier is logged here; the fitted vectorizer is not part
        # of the logged model.
        mlflow.sklearn.log_model(model, f"rf_{vectorizer_name}_{ngram_range}")

# Sweep configuration: n-gram windows (1, 1), (1, 2), (1, 3) with a shared
# vocabulary cap.
ngram_ranges = [(1, upper) for upper in (1, 2, 3)]
max_features = 5000

for ngram_range in ngram_ranges:
    # For every n-gram window, run bag-of-words first and TF-IDF second.
    for vectorizer_kind in ("BoW", "TF-IDF"):
        run_experiment(
            vectorizer_kind,
            ngram_range=ngram_range,
            vectorizer_max_features=max_features,
            vectorizer_name=vectorizer_kind,
        )



🏃 View run BoW_(1, 1)_RF at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374/runs/6aa44ce9ea5b481abf473f1fc48284d0
🧪 View experiment at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374




🏃 View run TF-IDF_(1, 1)_RF at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374/runs/5e73914d4806434796246bc0f766a56b
🧪 View experiment at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374




🏃 View run BoW_(1, 2)_RF at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374/runs/b473170181134aa5bf91de75b3d6a2b0
🧪 View experiment at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374




🏃 View run TF-IDF_(1, 2)_RF at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374/runs/6d32dcf94c5c4f7c9889e9c4eca2b0aa
🧪 View experiment at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374




🏃 View run BoW_(1, 3)_RF at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374/runs/c4a7917c9c394ffd818eefc533c35d26
🧪 View experiment at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374




🏃 View run TF-IDF_(1, 3)_RF at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374/runs/d2a5dbe19e3341a58ecabcc7dbb85c05
🧪 View experiment at: http://ec2-13-61-146-35.eu-north-1.compute.amazonaws.com:5000/#/experiments/734492199182626374
