In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount recursively and echo the full path of every
# file found, so the available datasets show up in the cell output.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fraud-detection/sample_submission.csv
/kaggle/input/fraud-detection/test_identity.csv
/kaggle/input/fraud-detection/train_identity.csv
/kaggle/input/fraud-detection/test_transaction.csv
/kaggle/input/fraud-detection/train_transaction.csv


In [2]:
import warnings
# Install a process-wide "ignore" filter for RuntimeWarning (and its subclasses);
# other warning categories are not touched by this call.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
# Load the two training tables from the Kaggle input directory.
# They are joined on 'TransactionID' in the next cell.
Identity = pd.read_csv('/kaggle/input/fraud-detection/train_identity.csv')
Transaction = pd.read_csv('/kaggle/input/fraud-detection/train_transaction.csv')

In [4]:
# Left-join the identity table onto the transactions by TransactionID: every
# transaction row is kept, and identity columns are NaN where no match exists.
df = pd.merge(Transaction, Identity, how='left', on='TransactionID')

In [6]:
from sklearn.model_selection import GroupShuffleSplit

# Separate the target from the features of the merged frame.
y = df['isFraud']
X = df.drop(columns=['isFraud'])

# Group key = card1 + "_" + addr1 (both stringified), so rows sharing the same
# pair always land on the same side of the split.
group_keys = X['card1'].astype(str) + "_" + X['addr1'].astype(str)

# A single seeded shuffle that holds out 20% of the groups for testing.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split = gss.split(X, y, groups=group_keys)
train_idx, test_idx = next(split)

# Positional indices -> train / test frames and targets.
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [7]:
from sklearn.model_selection import train_test_split

# NOTE(review): this cell rebinds X, y and the train/test frames using the
# Transaction table only, replacing the group-based split of the merged frame
# made in the previous cell -- confirm which of the two splits is intended.
y = Transaction['isFraud']
X = Transaction.drop(columns=['isFraud'])

# Seeded random 80/20 row split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# MLflow

In [8]:
!pip install mlflow dagshub
import mlflow
import dagshub

dagshub.init(repo_owner='Givi-Modebadze', repo_name='Fraud_Detection_ML', mlflow=True)

experiment = "Fraud Detection Random Forest"
mlflow.set_experiment(experiment)

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=38449c74-1fa5-46ab-960b-d1379ad22095&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=4bb9204f96935e0e7355abe2558d9a12c1f495b914d803f173b18c5a6e532623




Output()

<Experiment: artifact_location='mlflow-artifacts:/6bfbe8f324c84d13b79feee3f26f25ba', creation_time=1745677502700, experiment_id='1', last_update_time=1745677502700, lifecycle_stage='active', name='Fraud Detection Random Forest', tags={}>

# Cleaning

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataCleaner(BaseEstimator, TransformerMixin):
    """Drop sparse columns and fill remaining gaps with each column's mode.

    ``fit`` learns which columns to remove (those whose fraction of missing
    values is at least ``drop_threshold``, plus the ``TransactionID``
    identifier when the frame has one) and, for every remaining column, its
    most frequent value. ``transform`` applies both steps to a frame that has
    the same columns as the one seen in ``fit``.

    Parameters
    ----------
    drop_threshold : float
        A column is dropped when its NaN fraction is >= this value.

    Attributes
    ----------
    drop_cols : list
        Columns removed by ``transform``. Only exists after ``fit``.
    cols_to_drop_ : list
        Copy of ``drop_cols`` (empty until ``fit`` is called).
    fill_values : dict
        Maps each kept column to the value used to fill its NaNs; NaN when
        ``mode()`` returned nothing for that column.
    """

    def __init__(self, drop_threshold: float):
        self.drop_threshold = drop_threshold
        self.cols_to_drop_ = []
        self.fill_values = {}

    def fit(self, X, y = None):
        """Learn the columns to drop and the per-column fill values from ``X``.

        All learned state is rebuilt from scratch on every call, so fitting
        the same instance again never leaves entries from an earlier fit.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        DataCleaner
            ``self``.
        """
        drop_cols = [col for col in X.columns if X[col].isna().mean() >= self.drop_threshold]
        # The identifier carries no signal; drop it too, but only when it is
        # actually present and not already selected by the NaN rule.
        if 'TransactionID' in X.columns and 'TransactionID' not in drop_cols:
            drop_cols.append('TransactionID')

        X_filtered = X.drop(columns=drop_cols)

        # Build into a fresh dict so a refit cannot keep fill values for
        # columns that are now dropped (transform would raise KeyError on them).
        fill_values = {}
        for col in X_filtered.columns:
            mode = X_filtered[col].mode()
            fill_values[col] = mode.iloc[0] if not mode.empty else np.nan

        self.drop_cols = drop_cols
        self.cols_to_drop_ = list(drop_cols)
        self.fill_values = fill_values
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns seen during ``fit``.

        Returns
        -------
        pandas.DataFrame
            ``X`` without the dropped columns and with NaNs replaced by the
            learned fill values.

        Raises
        ------
        KeyError
            If a column learned during ``fit`` is missing from ``X``.
        AttributeError
            If called before ``fit``.
        """
        X = X.copy()
        X = X.drop(columns=self.drop_cols)
        for col, value in self.fill_values.items():
            X[col] = X[col].fillna(value)
        return X

In [22]:
# Fit the cleaner on the training split only, then clean that same split.
data_cleaner = DataCleaner(drop_threshold=0.9)

X_cleaned = data_cleaner.fit_transform(X_train, y_train)

# Log the fitted cleaner as a model artifact and register it, together with
# the parameters that describe this cleaning run.
with mlflow.start_run(run_name="Random_Forest_Clean") as run:
    mlflow.sklearn.log_model(
        sk_model=data_cleaner,
        artifact_path="data_cleaner_model",
        registered_model_name="DataCleaner",
    )
    # Read the threshold back from the fitted object instead of repeating the
    # literal, so the logged value always matches what was actually used.
    mlflow.log_param("drop_threshold", data_cleaner.drop_threshold)
    mlflow.log_param("n_columns_after_cleaning", X_cleaned.shape[1])

Registered model 'DataCleaner' already exists. Creating a new version of this model...
2025/04/26 20:34:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DataCleaner, version 7
Created version '7' of model 'DataCleaner'.


🏃 View run Random_Forest_Clean at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1/runs/28a221931335475fa0fd0dd7bfec4ddd
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1


# Feature Engineering

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Integer-encode every object/category column with its own LabelEncoder.

    ``fit`` trains one ``LabelEncoder`` per object/category column of the
    training frame. ``transform`` replaces those columns with the integer
    codes; values that were not present during ``fit`` are encoded as ``-1``
    instead of raising, so held-out data with new categories can still be
    transformed. All other columns pass through unchanged.

    Attributes
    ----------
    encoders : dict
        Maps column name to its fitted ``LabelEncoder`` (empty until ``fit``).
    """

    def __init__(self):
        self.encoders = {}

    def fit(self, X, y = None):
        """Fit one encoder per object/category column of ``X``.

        The encoder mapping is rebuilt on every call, so a refit never keeps
        encoders for columns that are no longer categorical or present.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        FeatureEngineer
            ``self``.
        """
        encoders = {}
        for col in X.select_dtypes(include=['object', 'category']).columns:
            le = LabelEncoder()
            le.fit(X[col])
            encoders[col] = le
        self.encoders = encoders
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by integer codes.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns that were encoded during ``fit``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; each encoded column holds int64 codes, with ``-1``
            marking values the encoder did not see during ``fit``.

        Raises
        ------
        KeyError
            If a column encoded during ``fit`` is missing from ``X``.
        """
        X = X.copy()
        for col, le in self.encoders.items():
            values = X[col]
            # LabelEncoder.transform raises ValueError on unseen labels, so
            # only the known values are passed to it; the rest keep -1.
            known = values.isin(le.classes_).to_numpy()
            codes = np.full(len(values), -1, dtype=np.int64)
            if known.any():
                codes[known] = le.transform(values[known])
            X[col] = codes
        return X

In [24]:
from sklearn.pipeline import Pipeline

# Single-step pipeline wrapping the label-encoding transformer.
feature_engineer = Pipeline(steps=[('feature_engineering', FeatureEngineer())])

# Fit on the cleaned training frame and encode it in one pass.
X_engineered = feature_engineer.fit_transform(X_cleaned, y_train)

# Column counts describing the input of the feature-engineering step.
categorical_columns = X_cleaned.select_dtypes(include=['object', 'category']).columns
numeric_columns = X_cleaned.select_dtypes(include=['number']).columns
n_label_encoded_cols = len(categorical_columns)
n_numeric_cols = len(numeric_columns)
total_cols = X_cleaned.shape[1]

with mlflow.start_run(run_name="Random_Forest_Feature_Engineering") as run:
    mlflow.sklearn.log_model(
        sk_model=feature_engineer,
        artifact_path="feature_engineer_model",
        registered_model_name="FeatureEngineer",
    )

    mlflow.log_param("n_label_encoded_columns", n_label_encoded_cols)
    # NOTE(review): despite the "_scaled" suffix, no scaler is applied in this
    # cell; the value is simply the number of numeric columns in X_cleaned.
    mlflow.log_param("n_numeric_columns_scaled", n_numeric_cols)
    mlflow.log_param("total_columns", total_cols)

Successfully registered model 'FeatureEngineer'.
2025/04/26 20:35:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeatureEngineer, version 1
Created version '1' of model 'FeatureEngineer'.


🏃 View run Random_Forest_Feature_Engineering at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1/runs/94632f3a004f4abf8d3e261f5d291937
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1


# Feature Selection

# Training

In [12]:
# Display the (rows, columns) shape of the training features.
X_train.shape

(472432, 393)

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

import mlflow.sklearn

# Retrieve the preprocessing steps that earlier runs logged to MLflow.
data_cleaner = mlflow.sklearn.load_model(
    'runs:/28a221931335475fa0fd0dd7bfec4ddd/data_cleaner_model'
)
feature_engineer = mlflow.sklearn.load_model(
    'runs:/94632f3a004f4abf8d3e261f5d291937/feature_engineer_model'
)

# Cleaning -> label encoding -> random forest, chained as one estimator.
pipeline = Pipeline(steps=[
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    (
        'random_forest',
        RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_leaf=5,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

# Pipeline.fit re-fits every step, including the loaded preprocessors.
pipeline.fit(X_train, y_train)

# Keep only the second predict_proba column (index 1) as the score.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [27]:
from sklearn.metrics import roc_auc_score

# Read the hyperparameters back from the forest inside the fitted pipeline
# rather than retyping them here: the pipeline is built in a different cell,
# so hard-coded copies could silently disagree with the model being logged.
rf_hyperparams = pipeline.named_steps['random_forest'].get_params()
logged_param_names = ("n_estimators", "max_depth", "min_samples_leaf", "n_jobs", "random_state")

with mlflow.start_run(run_name="Random_Forest_Model") as run:
    # Store the whole fitted pipeline and register it as a new model version.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="random_forest_pipeline_model",
        registered_model_name="RandomForestPipelineModel"
    )
    mlflow.log_params({name: rf_hyperparams[name] for name in logged_param_names})

    # ROC AUC computed from the scores produced by the training cell.
    roc_auc_train = roc_auc_score(y_train, y_prob_train)
    roc_auc_test = roc_auc_score(y_test, y_prob_test)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Successfully registered model 'RandomForestPipelineModel'.
2025/04/26 20:46:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestPipelineModel, version 1
Created version '1' of model 'RandomForestPipelineModel'.


ROC AUC Score Train: 0.8696900749965096
ROC AUC Score Test: 0.8711273991605457
🏃 View run Random_Forest_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1/runs/f8737abb1735417584a608eeb01d5a3e
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1


# Training 2

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import mlflow.sklearn

# Single source of truth for this run's hyperparameters: the same dict builds
# the classifier and is logged to MLflow, so the two cannot drift apart.
rf_params = {
    "n_estimators": 1000,
    "max_depth": 15,
    "min_samples_leaf": 8,
    "n_jobs": -1,
    "random_state": 42,
}

# Retrieve the preprocessing steps that earlier runs logged to MLflow.
data_cleaner = mlflow.sklearn.load_model('runs:/28a221931335475fa0fd0dd7bfec4ddd/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/94632f3a004f4abf8d3e261f5d291937/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('random_forest', RandomForestClassifier(**rf_params))
])

# Pipeline.fit re-fits every step, including the loaded preprocessors.
pipeline.fit(X_train, y_train)
# Keep only the second predict_proba column (index 1) as the score.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

with mlflow.start_run(run_name="Random_Forest_Model") as run:
    # Store the whole fitted pipeline and register it as a new model version.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="random_forest_pipeline_model",
        registered_model_name="RandomForestPipelineModel"
    )
    mlflow.log_params(rf_params)

    roc_auc_train = roc_auc_score(y_train, y_prob_train)
    roc_auc_test = roc_auc_score(y_test, y_prob_test)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'RandomForestPipelineModel' already exists. Creating a new version of this model...
2025/04/26 21:04:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestPipelineModel, version 2
Created version '2' of model 'RandomForestPipelineModel'.


ROC AUC Score Train: 0.8696900749965096
ROC AUC Score Test: 0.8711273991605457
🏃 View run Random_Forest_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1/runs/705bb424091145c7b9598ab013bd6957
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1


# Training 3

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import mlflow.sklearn

# Single source of truth for this run's hyperparameters: the same dict builds
# the classifier and is logged to MLflow, so the two cannot drift apart.
rf_params = {
    "n_estimators": 1200,
    "max_depth": 20,
    "min_samples_leaf": 5,
    "n_jobs": -1,
    "random_state": 42,
}

# Retrieve the preprocessing steps that earlier runs logged to MLflow.
data_cleaner = mlflow.sklearn.load_model('runs:/28a221931335475fa0fd0dd7bfec4ddd/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/94632f3a004f4abf8d3e261f5d291937/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('random_forest', RandomForestClassifier(**rf_params))
])

# Pipeline.fit re-fits every step, including the loaded preprocessors.
pipeline.fit(X_train, y_train)
# Keep only the second predict_proba column (index 1) as the score.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

with mlflow.start_run(run_name="Random_Forest_Model") as run:
    # Store the whole fitted pipeline and register it as a new model version.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="random_forest_pipeline_model",
        registered_model_name="RandomForestPipelineModel"
    )
    mlflow.log_params(rf_params)

    # The scores are logged here and printed by the following cell.
    roc_auc_train = roc_auc_score(y_train, y_prob_train)
    roc_auc_test = roc_auc_score(y_test, y_prob_test)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'RandomForestPipelineModel' already exists. Creating a new version of this model...
2025/04/26 21:31:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestPipelineModel, version 3
Created version '3' of model 'RandomForestPipelineModel'.


ROC AUC Score Train: 0.8696900749965096
ROC AUC Score Test: 0.8711273991605457
🏃 View run Random_Forest_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1/runs/bdcbff1ce0ac4a4f925f754a11900fba
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1


In [31]:
# Show the ROC AUC values currently bound to roc_auc_train / roc_auc_test
# (these names are assigned by the training cell that ran most recently).
print("ROC AUC Score Train:", roc_auc_train)
print("ROC AUC Score Test:", roc_auc_test)

ROC AUC Score Train: 0.9345901687643265
ROC AUC Score Test: 0.9113152675726358


In [33]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import mlflow.sklearn

# Single source of truth for this run's hyperparameters: the same dict builds
# the classifier and is logged to MLflow, so the two cannot drift apart.
rf_params = {
    "n_estimators": 1200,
    "max_depth": 20,
    "min_samples_leaf": 8,
    "n_jobs": -1,
    "random_state": 42,
}

# Retrieve the preprocessing steps that earlier runs logged to MLflow.
data_cleaner = mlflow.sklearn.load_model('runs:/28a221931335475fa0fd0dd7bfec4ddd/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/94632f3a004f4abf8d3e261f5d291937/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('random_forest', RandomForestClassifier(**rf_params))
])

# Pipeline.fit re-fits every step, including the loaded preprocessors.
pipeline.fit(X_train, y_train)
# Keep only the second predict_proba column (index 1) as the score.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

with mlflow.start_run(run_name="Random_Forest_Model") as run:
    # Store the whole fitted pipeline and register it as a new model version.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="random_forest_pipeline_model",
        registered_model_name="RandomForestPipelineModel"
    )
    mlflow.log_params(rf_params)

    roc_auc_train = roc_auc_score(y_train, y_prob_train)
    roc_auc_test = roc_auc_score(y_test, y_prob_test)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'RandomForestPipelineModel' already exists. Creating a new version of this model...
2025/04/26 22:05:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestPipelineModel, version 4
Created version '4' of model 'RandomForestPipelineModel'.


ROC AUC Score Train: 0.9315969503255975
ROC AUC Score Test: 0.9100093784605482
🏃 View run Random_Forest_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1/runs/d8e99cdde3d946dd8da5c978a9b5ee4a
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1


In [34]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import mlflow.sklearn

# Single source of truth for this run's hyperparameters: the same dict builds
# the classifier and is logged to MLflow, so the two cannot drift apart.
rf_params = {
    "n_estimators": 1200,
    "max_depth": 16,
    "min_samples_leaf": 6,
    "n_jobs": -1,
    "random_state": 42,
}

# Retrieve the preprocessing steps that earlier runs logged to MLflow.
data_cleaner = mlflow.sklearn.load_model('runs:/28a221931335475fa0fd0dd7bfec4ddd/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/94632f3a004f4abf8d3e261f5d291937/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('random_forest', RandomForestClassifier(**rf_params))
])

# Pipeline.fit re-fits every step, including the loaded preprocessors.
pipeline.fit(X_train, y_train)
# Keep only the second predict_proba column (index 1) as the score.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

with mlflow.start_run(run_name="Random_Forest_Model") as run:
    # Store the whole fitted pipeline and register it as a new model version.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="random_forest_pipeline_model",
        registered_model_name="RandomForestPipelineModel"
    )
    mlflow.log_params(rf_params)

    roc_auc_train = roc_auc_score(y_train, y_prob_train)
    roc_auc_test = roc_auc_score(y_test, y_prob_test)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'RandomForestPipelineModel' already exists. Creating a new version of this model...
2025/04/26 22:31:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestPipelineModel, version 5
Created version '5' of model 'RandomForestPipelineModel'.


ROC AUC Score Train: 0.9097375212209287
ROC AUC Score Test: 0.8968485339140668
🏃 View run Random_Forest_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1/runs/adf09cf2f3a34a5780215542cbc9aec0
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/1
