In [None]:
import pandas as pd

# Quick look at the raw data: load the CSV and print its first rows.
RAW_CSV_PATH = "../data/raw_data/creditcard.csv"

df = pd.read_csv(RAW_CSV_PATH)
print(df.head(5))  # 5 is the DataFrame.head default, spelled out for clarity

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import mlflow
import mlflow.sklearn


# MLflow setup
#
# Two tracking-server targets have been used with this notebook:
#   * local tracking server:   http://localhost:5000
#   * Kubernetes deployment:   http://host.docker.internal:5000  (active)
# To switch to the local server, point MLFLOW_TRACKING_URI at the first URL.
#
# Start the tracking server so that it is reachable from Docker:
#   mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 0.0.0.0 --port 5000
#
# Build and run the model image on the local host:
#   docker build -t fraud-model .   # NOTE(review): build context assumed to be the current directory
#   docker run fraud-model
MLFLOW_TRACKING_URI = "http://host.docker.internal:5000"
MLFLOW_EXPERIMENT_NAME = "CreditCard_Fraud_RF"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Train a Random Forest on the credit-card data, evaluate it on a held-out
# split, and record parameters, metric and model in a single MLflow run.
with mlflow.start_run():
    # 1. Load data: "Class" is the label column, every other column is a feature.
    df = pd.read_csv("../data/raw_data/creditcard.csv")  # relative path (for Docker)
    X = df.drop("Class", axis=1)
    y = df["Class"]

    # 2. Train/test split, stratified on the label so both parts keep the
    #    same class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )

    # 3. Random Forest model.
    #    The hyperparameters live in one dict that is used both to build the
    #    model and to log to MLflow, so the logged values cannot drift from
    #    the ones actually used.
    rf_params = {
        "n_estimators": 100,
        "class_weight": "balanced",
        "random_state": 42,
    }
    rf = RandomForestClassifier(**rf_params)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)

    # 4. Evaluation.
    #    Column 1 of predict_proba is used as the positive-class score
    #    (assumes the labels are 0/1 with 1 = fraud -- TODO confirm).
    auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
    print("AUC RF:", auc)
    # Threshold-based view of the same model, using the hard predictions.
    print(confusion_matrix(y_test, y_pred_rf))
    print(classification_report(y_test, y_pred_rf))

    # 5. MLflow logging (random_state is logged too, for reproducibility).
    mlflow.log_params(rf_params)
    mlflow.log_metric("roc_auc", auc)

    # Log model
    mlflow.sklearn.log_model(rf, "model")