In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import itertools
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Set up MLflow tracking
mlflow.set_tracking_uri("file:./mlruns")

# MLflow Experiment erstellen
experiment_name = "income_prediction"
experiment = mlflow.set_experiment(experiment_name)

# Experiment-Details ausgeben
print(f"Experiment Name: {experiment_name}")
print(f"Experiment ID: {experiment.experiment_id}")

2025/02/27 19:09:43 INFO mlflow.tracking.fluent: Experiment with name 'income_prediction' does not exist. Creating a new experiment.


Experiment Name: income_prediction
Experiment ID: 132791688657161062


In [3]:
data = pd.read_csv('../data/processed/processed_data.csv')
X = data.drop(columns=['income >50K'])
y = data['income >50K']

In [4]:
# Numerische und kategoriale Spalten identifizieren
num_features = ['age', 'educational-num', 'hours-per-week']
cat_features = ['workclass', 'marital-status', 'occupation', 'relationship']

# Transformationen für numerische Spalten (Skalierung)
num_transformer = StandardScaler()

# Transformationen für kategoriale Spalten (One-Hot-Encoding)
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessing-Pipeline
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [5]:
# Hyperparameter-Sets definieren
param_grid = {
    "n_estimators": [100, 200],  # Anzahl Bäume
    "max_depth": [5, 10],        # Tiefe des Baums
    "min_samples_split": [2, 5], # Min. Samples für Split
    "min_samples_leaf": [1, 3]   # Min. Samples pro Blatt
}

# Alle Kombinationen aus Parametern generieren
param_combinations = list(itertools.product(*param_grid.values()))

In [6]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models with different hyperparameters
for params in param_combinations:
    # Dictionary mit aktuellen Parametern erstellen
    current_params = dict(zip(param_grid.keys(), params))

    with mlflow.start_run():
        print(f"Training mit Parametern: {current_params}")

        # Random Forest mit aktuellen Parametern
        rf_model = RandomForestClassifier(**current_params, random_state=42)

        # Pipeline mit Modell erstellen
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', rf_model)
        ])

        # Modell trainieren
        pipeline.fit(X_train, y_train)

        # Vorhersagen & Accuracy berechnen
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Alle Parameter loggen
        mlflow.log_params(current_params)

        # Metriken loggen
        mlflow.log_metric("accuracy", accuracy)

        # Modell speichern
        mlflow.sklearn.log_model(pipeline, "random_forest_pipeline")

# Get the best model based on accuracy
metric_name = "accuracy"  # Corrected the typo from "accuray" to "accuracy"

# Get the experiment
experiment = mlflow.get_experiment_by_name(experiment_name)

# Get all runs and find the best one
df = mlflow.search_runs(experiment_ids=[experiment.experiment_id], order_by=[f"metrics.{metric_name} DESC"])

Training mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 3}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}




Training mit Parametern: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}




In [None]:
# Check if we have any runs
if len(df) > 0:
    best_run_id = df.iloc[0]["run_id"]
    best_accuracy = df.iloc[0][f"metrics.{metric_name}"]
    
    print(f"Best Run ID: {best_run_id}")
    print(f"Best Accuracy: {best_accuracy:.4f}")
    
    # Load the best model using the run ID
    model_uri = f"runs:/{best_run_id}/random_forest_pipeline"
    best_model = mlflow.pyfunc.load_model(model_uri)
    
    # Example of serving the model with new data
    print("\nModel ready to serve predictions")
    
    # Optional: You can test the model on some example data
    sample_data = X_test.iloc[:5]  # Take a few samples from test set
    predictions = best_model.predict(sample_data)
    print(f"Sample predictions: {predictions}")
    
    # You could also save the best model to a specific location
    mlflow.sklearn.save_model(
        sk_model=mlflow.sklearn.load_model(model_uri),
        path="best_model"
    )
    print("Best model saved to 'best_model' directory")
else:
    print("No runs found in the experiment.")

# To serve the model with MLflow:
# Run the following command in your terminal:
# mlflow models serve -m runs:/{best_model}/random_forest_pipeline -p 5000
# mlflow models serve -m model:/Income_Predictor/random_forest_pipeline -p 5000

Best Run ID: d1f056d8ff734fee97242f77df1f7c1f
Best Accuracy: 0.8363

Model ready to serve predictions
Sample predictions: [0 0 0 0 0]


MlflowException: Path 'best_model' already exists and is not empty

In [None]:
# Send request to Mlflow Server using dataframe

import requests
import json

url = 'http://localhost:5000/invocations'
headers = {"Content-Type": "application/json"}

payload = json.dumps({"dataframe_split": data.to_dict(orient="split")})

response = requests.post(url, data=payload, headers=headers)
print(response.status_code, response.text)

200 {"predictions": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1

In [66]:
# Define using pydantic class
from pydantic import BaseModel, Field

class Person(BaseModel):
    age: int
    workclass: str
    educational_num: int = Field(alias="educational-num")
    marital_status: str = Field(alias="marital-status")
    occupation: str
    relationship: str
    hours_per_week: int = Field(alias="hours-per-week")
    is_Male: int  # Maintaining exact case from DataFrame
    income_over_50K: int = Field(alias="income >50K")
    is_White: int  # Maintaining exact case from DataFrame
    from_USA: int  # Maintaining exact case from DataFrame
    gained_capital: int = Field(alias="gained-capital")
    
    class Config:
        populate_by_name = True

# Create an instance using keyword arguments
John = Person(
    age=25,
    workclass="Private",
    educational_num=7,
    marital_status="Never-married",
    occupation="Simple Services",
    relationship="Child",
    hours_per_week=40,
    is_Male=1,
    income_over_50K=0,
    is_White=0,
    from_USA=1,
    gained_capital=0
)

In [63]:
John.model_dump()

{'age': 25,
 'workclass': 'Private',
 'educational_num': 7,
 'marital_status': 'Never-married',
 'occupation': 'Simple Services',
 'relationship': 'Child',
 'hours_per_week': 40,
 'is_male': 1,
 'income_over_50K': 0,
 'is_white': 0,
 'from_usa': 1,
 'gained_capital': 0}

In [None]:
# Call api by using pydantic class
import requests
import json

# Manually create the "split" format dictionary
model_dict = John.model_dump(by_alias=True)
split_format = {
    "columns": list(model_dict.keys()),
    "data": [list(model_dict.values())],
    "index": [0]
}

url = 'http://localhost:5000/invocations'
headers = {"Content-Type": "application/json"}

# Use the manually created split format dictionary as the payload
payload = json.dumps({"dataframe_split": split_format})

response = requests.post(url, data=payload, headers=headers)
print(response.status_code, response.text)

200 {"predictions": [0]}
