In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn

# Read the credit-card dataset from disk.
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard.csv"
df = pd.read_csv(data_path)

# Separate the 'Class' target from the remaining feature columns.
y = df['Class']
X = df.drop('Class', axis=1)

# Stratified hold-out split: 20% of the rows become the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Pipeline: SMOTE oversampling followed by a random forest ---
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(n_jobs=-1, random_state=42)),
])

# --- Hyperparameter grid (the 'rf__' prefix targets the 'rf' pipeline step) ---
param_grid = dict(
    rf__n_estimators=[100, 200],
    rf__max_depth=[None, 10, 20],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
    rf__max_features=['sqrt', 'log2'],
)

# Everything below is tracked inside a single MLflow run.
with mlflow.start_run(run_name="RandomForest_SMOTE_GridSearch"):

    # --- Grid search with shuffled, stratified 5-fold CV, scored on F1 ---
    cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='f1',
        cv=cv_folds,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train_val, y_train_val)

    # --- Best parameters and their mean cross-validated score ---
    best_params, best_score = grid_search.best_params_, grid_search.best_score_

    print("\n=== Bedste parametre fundet af GridSearchCV ===")
    print(best_params)
    print(f"Bedste F1-score på valideringsfolds: {best_score:.4f}")

    # Record the winning hyperparameters and the CV score on the run.
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_f1_score", best_score)

    # --- Final evaluation on the held-out test set ---
    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(X_test)

    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)

    print("\n=== Endelig evaluering på test-sæt ===")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred))
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print("Classification report:")
    print(classification_report(y_test, y_test_pred))

    # Record the test-set metrics, one MLflow metric per score.
    for metric_name, metric_value in (
        ("test_precision", precision),
        ("test_recall", recall),
        ("test_f1_score", f1),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Store the refitted best pipeline as a model artifact of this run.
    mlflow.sklearn.log_model(best_model, "random_forest_model")

    # To register the model in the model registry (requires an MLflow server with the registry enabled):
    # mlflow.register_model("runs:/{}/random_forest_model".format(mlflow.active_run().info.run_id), "RandomForestCreditModel")

**registrer model**

In [10]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from tqdm.notebook import tqdm

# Load the credit-card dataset from a local CSV file.
data_path = r"C:\Job_og_eksamensbevis\Github\projekter\RF_project\data\creditcard.csv"
df = pd.read_csv(data_path)

# 'Class' is the target column; every other column is used as a feature.
X = df.drop('Class', axis=1)
y = df['Class']

# Stratified train/test split: 20% of the rows are held out for the final test.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# --- Pipeline: SMOTE + Random Forest ---
# SMOTE is a step of the imblearn pipeline, so during cross-validation the
# oversampling is fitted on each training fold only.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# --- Hyperparameter grid ---
# Only n_estimators is searched here; the remaining dimensions are disabled.
param_grid = {
    'rf__n_estimators': [100, 200],
    #'rf__max_depth': [None, 20],
    #'rf__min_samples_split': [2, 5],
    #'rf__min_samples_leaf': [1, 2],
    #'rf__max_features': ['sqrt', 'log2']
}

# Start an MLflow run so parameters, metrics and artifacts are tracked together.
with mlflow.start_run(run_name="RandomForest_GridSearch"):
    # Log the searched grid (stringified) for reference.
    mlflow.log_param("param_grid", param_grid)

    # --- Grid search with stratified 5-fold cross-validation, scored on F1 ---
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        scoring='f1',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_val, y_train_val)

    # Log the best hyperparameters.
    best_params = grid_search.best_params_
    mlflow.log_params(best_params)

    # Log the best mean score over the validation folds.
    best_score = grid_search.best_score_
    mlflow.log_metric("best_cv_f1_score", best_score)

    print("\n=== Bedste parametre fundet af GridSearchCV ===")
    print(best_params)
    print(f"Bedste F1-score på valideringsfolds: {best_score:.4f}")

    # --- Final evaluation on the held-out test set ---
    y_test_pred = grid_search.best_estimator_.predict(X_test)

    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    # Computed once so the printed matrix and the logged artifact are identical.
    test_confusion_matrix = confusion_matrix(y_test, y_test_pred)

    # Log the test metrics.
    mlflow.log_metric("test_precision", precision)
    mlflow.log_metric("test_recall", recall)
    mlflow.log_metric("test_f1_score", f1)

    print("\n=== Endelig evaluering på test-sæt ===")
    print("Confusion Matrix:")
    print(test_confusion_matrix)
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print("Classification report:")
    print(classification_report(y_test, y_test_pred))

    # Log the refitted best pipeline as a model artifact of this run.
    mlflow.sklearn.log_model(grid_search.best_estimator_, "random_forest_model")

    # Store the confusion matrix as a text artifact. The previous code logged
    # and deleted `cm_path`, which was never created (and `os` was never
    # imported), so the cell failed with NameError after training finished.
    # log_text writes the artifact directly: no temp file, no cleanup.
    mlflow.log_text(str(test_confusion_matrix), "confusion_matrix.txt")

    # To add the model to the model registry (requires an MLflow server with a registry):
    # model_uri = f"runs:/{mlflow.active_run().info.run_id}/random_forest_model"
    # mlflow.register_model(model_uri, "RandomForestCreditCardModel")


Fitting 5 folds for each of 2 candidates, totalling 10 fits


KeyboardInterrupt: 

In [8]:
# End the currently active MLflow run, if there is one. Presumably a manual
# cleanup for a run left open by an aborted cell — TODO confirm it is still needed.
mlflow.end_run()

In [None]:
# Name under which the model is registered (add "challenger" or "champion"
# as an alias on the registered version afterwards).
# The name must be a string literal: the bare expression RF-SMOTE is parsed as
# a subtraction of two names and raises NameError.
model_name = "RF-SMOTE"

# Run id of the MLflow run whose logged model should be registered. Stripped
# because a pasted id often carries surrounding whitespace, which would
# produce a URI that does not resolve.
run_id = input("Indtast run-id: ").strip()
if not run_id:
    raise ValueError("Run-id må ikke være tomt.")

# "random_forest_model" is the artifact path used by the training cells.
model_uri = f"runs:/{run_id}/random_forest_model"
result = mlflow.register_model(
    model_uri, model_name
)