In [1]:
import os
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix


In [2]:
# Read the Titanic CSV, keep the label column plus the three feature columns
# used for modelling, and drop every row whose Age is missing.
data = (
    pd.read_csv("/Users/maxkucher/preprocessing/mlops/mlflow/titanic (1).csv")[
        ["Survived", "Pclass", "Age", "Fare"]
    ]
    .dropna(subset=["Age"])
)

In [3]:
# Display the filtered frame (rows with a missing Age have already been dropped).
data

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.2500
1,1,1,38.0,71.2833
2,1,3,26.0,7.9250
3,1,1,35.0,53.1000
4,0,3,35.0,8.0500
...,...,...,...,...
885,0,3,39.0,29.1250
886,0,2,27.0,13.0000
887,1,1,19.0,30.0000
889,1,1,26.0,30.0000


In [None]:
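# Start the MLflow tracking server from a terminal before running the cells
# below (they connect to it on port 5000):
# mlflow server \
#   --backend-store-uri "/Users/maxkucher/Desktop/server/data_local" \
#   --default-artifact-root "/Users/maxkucher/Desktop/server/artefacts" \
#   --port 5000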

In [4]:
# Override the USER environment variable for this kernel process.
# NOTE(review): presumably so MLflow attributes the runs to this name — confirm.
os.environ.update(USER="Maks Kucher")

In [5]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every Restart & Run All, so metrics logged by different
# runs are computed on the same test rows and stay comparable.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
# Send all tracking calls to the MLflow server listening on localhost:5000.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Select the experiment that subsequent runs are logged under; being the last
# expression of the cell, the returned Experiment object is displayed.
mlflow.set_experiment(experiment_name="Random_Forest_test")

2025/08/26 20:36:58 INFO mlflow.tracking.fluent: Experiment with name 'Random_Forest_test' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/maxkucher/Desktop/server/artefacts/309967802625230156', creation_time=1756229818127, experiment_id='309967802625230156', last_update_time=1756229818127, lifecycle_stage='active', name='Random_Forest_test', tags={}>

In [7]:
import mlflow.sklearn

def experiment(run_name, max_depth, min_samples_split):
    """Train a RandomForestClassifier and record the run in MLflow.

    The model is fitted on the module-level ``train`` frame and evaluated on
    the module-level ``test`` frame, using ``Survived`` as the label and all
    remaining columns as features. The hyperparameters, the accuracy, the four
    confusion-matrix counts and the fitted model are logged to a new MLflow run.

    Args:
        run_name: Name given to the MLflow run.
        max_depth: ``max_depth`` passed to ``RandomForestClassifier``.
        min_samples_split: ``min_samples_split`` passed to
            ``RandomForestClassifier``.
    """
    with mlflow.start_run(run_name=run_name):

        mlflow.log_param("model_name", "RandomForestClassifier")
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("min_samples_split", min_samples_split)

        model = RandomForestClassifier(max_depth=max_depth, min_samples_split=min_samples_split)

        x_train = train.drop(["Survived"], axis="columns")
        y_train = train["Survived"]
        model.fit(x_train, y_train)

        x_test = test.drop(["Survived"], axis="columns")
        y_test = test["Survived"]

        preds = model.predict(x_test)

        acc = accuracy_score(y_test, preds)

        # scikit-learn puts true labels on rows and predicted labels on
        # columns, so with labels [0, 1] the matrix is [[tn, fp], [fn, tp]].
        # Passing labels explicitly keeps it 2x2 even when one class is
        # missing from y_test or preds, so the unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_test, preds, labels=[0, 1]).ravel()

        mlflow.log_metric("accuracy", acc)

        mlflow.log_metric("tp", tp)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fn", fn)

        mlflow.sklearn.log_model(model, artifact_path="model")


In [8]:
# Hyperparameter values are paired positionally: the k-th max_depth is used
# together with the k-th min_samples_split (four runs, not a full grid).
max_depths = [10, 12, 14, 16]
min_samples_splits = [10, 15, 20, 25]

param_pairs = zip(max_depths, min_samples_splits)
for run_number, (max_depth, min_samples_split) in enumerate(param_pairs, start=1):
    # Runs are named run_1 .. run_4.
    experiment(f"run_{run_number}", max_depth, min_samples_split)

