In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.preprocessing import FunctionTransformer
from dotenv import load_dotenv
import os
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv() lookups in this notebook can see them.
load_dotenv()

# Raw training data. The location can be overridden with the FRAUD_TRAIN_CSV
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset, the previously hard-coded path is used unchanged.
df_raw = pd.read_csv(
    os.getenv("FRAUD_TRAIN_CSV", "/Users/martinper/Downloads/fraud_train.csv")
)

In [None]:
# --- MLflow configuration -----------------------------------------------------
# All settings are read from the environment (populated by load_dotenv()).
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
EXPERIMENT_NAME = os.getenv("MLFLOW_EXPERIMENT_NAME")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
AWS_ARTIFACT_PATH = os.getenv("AWS_ARTIFACT_PATH")

# Fail early with a readable message: without this check a missing variable
# surfaces as a TypeError on the string concatenation below (str + None) or as
# an experiment lookup/creation with a None name.
_required_env = {
    "MLFLOW_EXPERIMENT_NAME": EXPERIMENT_NAME,
    "AWS_BUCKET_NAME": AWS_BUCKET_NAME,
    "AWS_ARTIFACT_PATH": AWS_ARTIFACT_PATH,
}
_missing_env = [name for name, value in _required_env.items() if value is None]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# S3 URI under which a newly created experiment stores its artifacts.
AWS_FULL_ARTIFACT_PATH = "s3://" + AWS_BUCKET_NAME + "/" + AWS_ARTIFACT_PATH

# Reuse the experiment if it already exists, otherwise create it.
# exp_id holds the experiment's ID in both cases.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

if experiment is None:
    # set artifact location explicitly - couldn't get it to work otherwise
    exp_id = mlflow.create_experiment(
        name=EXPERIMENT_NAME,
        artifact_location=AWS_FULL_ARTIFACT_PATH
    )
    print(f"New experiment: {EXPERIMENT_NAME}")
    print(f"   ➜ ID: {exp_id}")
    print(f"   ➜ Artifact location: {AWS_FULL_ARTIFACT_PATH}")
else:
    # An existing experiment keeps the artifact location it was created with,
    # so report that one rather than AWS_FULL_ARTIFACT_PATH.
    exp_id = experiment.experiment_id
    print(f"Existing experiment: {EXPERIMENT_NAME}")
    print(f"   ➜ ID: {exp_id}")
    print(f"   ➜ Artifact location: {experiment.artifact_location}")

Existing experiment: fraud_detection_exp
   ➜ ID: 9
   ➜ Artifact location: s3://jedhaparis/fraud_detection_artifacts/


In [None]:
# Work on the first 10,000 rows of the raw data only.
df = df_raw.head(n=10000)

# Label vector first, then the feature frame (everything except the label).
y = df['is_fraud']
X = df.drop('is_fraud', axis=1)

def feature_engineering(df):
    """Turn raw transaction rows into the feature frame fed to the encoder/model.

    Steps:
      * drop identifier / free-text columns that are not used as features;
      * parse ``trans_date_trans_time`` and ``dob`` as datetimes (unparseable
        values become NaT);
      * derive ``age`` (whole days between the two dates, floor-divided by 365)
        and ``hour``, ``day``, ``month``, ``weekday`` of the transaction time;
      * drop the two source date columns.

    The input frame is not modified; a new DataFrame is returned. The derived
    columns are appended after the remaining original columns, in the order
    ``age, hour, day, month, weekday``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``trans_date_trans_time`` and ``dob``. The columns listed
        in ``unused_cols`` below are removed when present.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``trans_date_trans_time`` or ``dob`` is missing.
    """
    unused_cols = [
        'Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num',
        'job', 'city', 'merchant',  # todo try target encode
    ]
    # errors='ignore': these columns are discarded anyway, so their absence
    # (e.g. data that does not carry the CSV index artifact 'Unnamed: 0')
    # must not make the transformer fail.
    features = df.drop(columns=unused_cols, errors='ignore')

    trans_time = pd.to_datetime(features['trans_date_trans_time'], errors='coerce')
    birth_date = pd.to_datetime(features['dob'], errors='coerce')

    # Remove the raw date columns, then append the derived features.
    features = features.drop(columns=['dob', 'trans_date_trans_time'])

    # Approximate age in years: 365-day years, leap days are not accounted for.
    features['age'] = (trans_time - birth_date).dt.days // 365

    features['hour'] = trans_time.dt.hour
    features['day'] = trans_time.dt.day
    features['month'] = trans_time.dt.month
    features['weekday'] = trans_time.dt.weekday  # Monday=0 ... Sunday=6
    return features

# Wrap the feature-engineering function as a transformer so it runs as the
# first pipeline step and is part of the logged model.
feature_transformer = FunctionTransformer(feature_engineering)

# Split / model settings, named once so the values given to scikit-learn and
# the values logged to MLflow cannot drift apart.
TEST_SIZE = 0.2
RANDOM_STATE = 42
N_ESTIMATORS = 100


# separate train/test (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Apply on X_train to identify the final columns (for one-hot encode)
X_train_transformed = feature_transformer.fit_transform(X_train)
cat_cols = X_train_transformed.select_dtypes(include=['object']).columns

# One-hot encode the object-typed columns; every other column is passed
# through unchanged. Categories unseen during fit are ignored at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ],
    remainder='passthrough'
)


# Complete pipeline
pipeline = Pipeline(steps=[
    ('feature_engineering', feature_transformer),
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE, n_jobs=-1
    ))
])


# Train and log through MLflow
# experiment_id is passed explicitly so the run lands in the experiment
# resolved earlier (exp_id) instead of depending on ambient MLflow settings.
with mlflow.start_run(experiment_id=exp_id):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # zero_division=0: if a metric is undefined (e.g. no fraud predicted at
    # all) report 0 without raising an undefined-metric warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    mlflow.log_param("model", "RandomForest")
    mlflow.log_param("n_estimators", N_ESTIMATORS)
    mlflow.log_param("random_state", RANDOM_STATE)
    mlflow.log_param("test_size", TEST_SIZE)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # Reuse y_pred for the signature instead of predicting on X_test again.
    mlflow.sklearn.log_model(
        pipeline,
        "fraud_pipeline",
        signature=infer_signature(X_test, y_pred),
    )

    print(f"✅ Run logged — acc={acc:.4f}, prec={prec:.4f}, recall={rec:.4f}, f1={f1:.4f}")



✅ Run logged — acc=0.9985, prec=1.0000, recall=0.4000, f1=0.5714
🏃 View run trusting-stoat-196 at: http://127.0.0.1:5000/#/experiments/9/runs/4db5f532e7004d36a5409005bcc6ece5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/9
