In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sample_submission.csv
/kaggle/input/test_identity.csv
/kaggle/input/train_identity.csv
/kaggle/input/test_transaction.csv
/kaggle/input/train_transaction.csv


In [4]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [5]:
# Read the identity table first, then the transaction table.
Identity, Transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/train_identity.csv',
        '/kaggle/input/train_transaction.csv',
    )
)

In [17]:
# (rows, columns) of the identity table, then of the transaction table.
tuple(frame.shape for frame in (Identity, Transaction))

((144233, 41), (590540, 394))

In [6]:
# Left join: keep every transaction and attach identity columns where a
# matching TransactionID exists.
df = pd.merge(Transaction, Identity, how='left', on='TransactionID')

In [18]:
from sklearn.model_selection import GroupShuffleSplit

# Separate the target column from the feature columns.
y = df['isFraud']
X = df.drop(columns=['isFraud'])

# One group label per (card1, addr1) pair, so that all rows sharing a pair end
# up on the same side of the split.
group_labels = X['card1'].astype(str) + "_" + X['addr1'].astype(str)

# A single 80/20 group-aware shuffle split with a fixed seed.
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
split = gss.split(X, y, groups=group_labels)
train_idx, test_idx = next(split)

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# MLflow

In [19]:
!pip install mlflow dagshub
import mlflow
import dagshub

dagshub.init(repo_owner='Givi-Modebadze', repo_name='Fraud_Detection_ML', mlflow=True)

experiment = "Fraud Detection Decision Tree"
mlflow.set_experiment(experiment)



<Experiment: artifact_location='mlflow-artifacts:/3930a3399b6a43669089262e5d0f0bc3', creation_time=1745754651360, experiment_id='2', last_update_time=1745754651360, lifecycle_stage='active', name='Fraud Detection Decision Tree', tags={}>

# Cleaning

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataCleaner(BaseEstimator, TransformerMixin):
    """Drop sparse columns and fill remaining gaps with each column's mode.

    Parameters
    ----------
    drop_threshold : float
        A column is dropped when its fraction of missing values is greater
        than or equal to this value.

    Attributes
    ----------
    drop_cols : list
        Columns removed by ``transform``. Set by ``fit``; contains the sparse
        columns plus ``'TransactionID'`` when that column is present.
    fill_values : dict
        Maps every kept column to the value used to fill its missing entries:
        the first mode of the column, or ``np.nan`` when the column has no
        mode. Rebuilt from scratch on every ``fit``.
    """

    def __init__(self, drop_threshold: float):
        self.drop_threshold = drop_threshold
        # Kept so that code reading these attributes on an unfitted instance
        # keeps working; nothing in this class reads ``cols_to_drop_``.
        self.cols_to_drop_ = []
        self.fill_values = {}

    def fit(self, X, y=None):
        """Learn which columns to drop and the fill value of every kept column.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        missing_share = X.isna().mean()
        drop_cols = [col for col in X.columns if missing_share[col] >= self.drop_threshold]
        # The identifier column is always removed, but only list it when it
        # exists and is not already listed; otherwise ``drop`` would raise.
        if 'TransactionID' in X.columns and 'TransactionID' not in drop_cols:
            drop_cols.append('TransactionID')
        self.drop_cols = drop_cols

        X_filtered = X.drop(columns=self.drop_cols)

        # Build a fresh mapping on every fit. Updating the previous dict in
        # place would keep entries for columns that are dropped this time, and
        # ``transform`` would then fail looking those columns up.
        fill_values = {}
        for col in X_filtered.columns:
            mode = X_filtered[col].mode()
            fill_values[col] = mode.iloc[0] if not mode.empty else np.nan
        self.fill_values = fill_values
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns seen during ``fit``.

        Returns
        -------
        pandas.DataFrame
            ``X`` without ``drop_cols`` and with missing values replaced by the
            learned fill values.
        """
        X = X.copy()
        X = X.drop(columns=self.drop_cols)
        for col, value in self.fill_values.items():
            X[col] = X[col].fillna(value)
        return X

In [21]:
# Single definition of the sparsity cut-off; the value logged below is read
# back from the fitted cleaner so the two can never disagree.
DROP_THRESHOLD = 0.8

data_cleaner = DataCleaner(drop_threshold=DROP_THRESHOLD)

# Learn the columns to drop and the fill values on the training split only.
data_cleaner.fit(X_train, y_train)

X_cleaned = data_cleaner.transform(X_train)

with mlflow.start_run(run_name="Decision_Tree_Clean") as run:
    # Store the fitted cleaner as a run artifact and register it as a new
    # version of the "DataCleaner" model.
    mlflow.sklearn.log_model(
        sk_model=data_cleaner,
        artifact_path="data_cleaner_model",
        registered_model_name="DataCleaner",
    )
    mlflow.log_param("drop_threshold", data_cleaner.drop_threshold)
    mlflow.log_param("n_columns_after_cleaning", X_cleaned.shape[1])

Registered model 'DataCleaner' already exists. Creating a new version of this model...
2025/04/27 11:57:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DataCleaner, version 9
Created version '9' of model 'DataCleaner'.


🏃 View run Decision_Tree_Clean at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/47787478acf44753bd12d69d9179d573
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Feature Engineering

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Ordinal-encode the ``object`` / ``category`` columns of a DataFrame.

    Attributes
    ----------
    cat_cols : pandas.Index
        Columns selected for encoding during ``fit``.
    encoder : OrdinalEncoder or None
        Encoder fitted on ``cat_cols``. Categories not seen during ``fit`` are
        encoded as ``-1``. ``None`` before ``fit`` and when the fitted frame
        had no categorical columns.
    """

    def __init__(self):
        self.encoder = None

    def fit(self, X, y=None):
        """Fit the ordinal encoder on the categorical columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        cat_cols = X.select_dtypes(include=['object', 'category']).columns
        self.cat_cols = cat_cols

        if len(cat_cols) == 0:
            # Nothing to encode: skip the encoder entirely instead of relying
            # on how OrdinalEncoder and pandas treat a zero-column selection.
            self.encoder = None
            return self

        self.encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.encoder.fit(X[cat_cols])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the categorical columns replaced by codes.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns recorded in ``cat_cols``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the input frame is not modified.
        """
        X = X.copy()
        if self.encoder is not None:
            X[self.cat_cols] = self.encoder.transform(X[self.cat_cols])
        return X

In [29]:
from sklearn.pipeline import Pipeline

feature_engineer = Pipeline([
    ('feature_engineering', FeatureEngineer())
])

# Fit on the cleaned training frame, then encode that same frame.
feature_engineer.fit(X_cleaned, y_train)

X_engineered = feature_engineer.transform(X_cleaned)

# Column counts are taken from the frame as it was before encoding.
n_label_encoded_cols = len(X_cleaned.select_dtypes(include=['object', 'category']).columns)
n_numeric_cols = len(X_cleaned.select_dtypes(include=['number']).columns)
total_cols = X_cleaned.shape[1]

with mlflow.start_run(run_name="Decision_Tree_Feature_Engineering") as run:
    # Store the fitted pipeline as a run artifact and register it as a new
    # version of the "FeatureEngineer" model.
    mlflow.sklearn.log_model(
        sk_model=feature_engineer,
        artifact_path="feature_engineer_model",
        registered_model_name="FeatureEngineer",
    )

    mlflow.log_param("n_label_encoded_columns", n_label_encoded_cols)
    # Logged as a plain count: this pipeline has a single encoding step and
    # never scales numeric columns, so the former "..._scaled" key was wrong.
    mlflow.log_param("n_numeric_columns", n_numeric_cols)
    mlflow.log_param("total_columns", total_cols)

Registered model 'FeatureEngineer' already exists. Creating a new version of this model...
2025/04/27 12:10:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeatureEngineer, version 4
Created version '4' of model 'FeatureEngineer'.


🏃 View run Decision_Tree_Feature_Engineering at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/ae91db9f5c8845f484b7eb28061e6f8c
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Feature Selection

# Training

In [24]:
# Sanity check: (rows, columns) of the training split before any cleaning.
X_train.shape

(463215, 433)

In [30]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
tree_params = {
    "max_depth": 10,
    "min_samples_leaf": 5,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "criterion": "gini",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Successfully registered model 'DecisionTreePipelineModel'.
2025/04/27 12:11:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 1
Created version '1' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8184948865789854
ROC AUC Score Test: 0.7912664038766588
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/5b23b84e3e5243c8a83bc78e06621915
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Training 2

In [31]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart. max_features is deliberately
# absent here, so the tree uses its default for that setting.
tree_params = {
    "max_depth": 15,
    "min_samples_leaf": 8,
    "min_samples_split": 10,
    "criterion": "gini",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:15:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 2
Created version '2' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8690824305914284
ROC AUC Score Test: 0.7985271400484512
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/eae9d675a0b4471e9f3581a16974094b
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Training 3

In [32]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
tree_params = {
    "max_depth": 20,
    "min_samples_leaf": 5,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "criterion": "gini",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:21:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 3
Created version '3' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.9058265213933638
ROC AUC Score Test: 0.782733968754605
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/9227e8bfd11d4011b518477796f4f8c7
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Training 4

In [33]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
tree_params = {
    "max_depth": 15,
    "min_samples_leaf": 5,
    "min_samples_split": 8,
    "max_features": "sqrt",
    "criterion": "gini",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:24:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 4
Created version '4' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8692713293288453
ROC AUC Score Test: 0.7898875434629647
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/a36cf1a58b9f41138d185e7780d8650b
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Training 5

In [34]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart. criterion is not set here, so
# the tree uses its default for that setting.
tree_params = {
    "max_depth": 15,
    "min_samples_leaf": 8,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:26:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 5
Created version '5' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8704950980485336
ROC AUC Score Test: 0.813433829748168
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/58b61ea90571492f94d5d3e421011bb0
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Training 6

In [35]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
tree_params = {
    "max_depth": 20,
    "min_samples_leaf": 8,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:30:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 6
Created version '6' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.9061623788083695
ROC AUC Score Test: 0.7700469016716962
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/2986d039c6a948379f1e695e194384b1
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Training 7

In [36]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
tree_params = {
    "max_depth": 12,
    "min_samples_leaf": 8,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:32:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 7
Created version '7' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8345420541445938
ROC AUC Score Test: 0.8058704399498752
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/895906ab6f4145d383cf6153841d3b66
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


# Training 8

In [37]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
tree_params = {
    "max_depth": 16,
    "min_samples_leaf": 8,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:36:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 8
Created version '8' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8670786398407254
ROC AUC Score Test: 0.7955477165421392
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/6b5487ff42704ef8b5c2875b51ec6468
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


In [38]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
# NOTE(review): these are the same values as the "Training 5" trial, so this
# cell repeats that run and registers another model version for it.
tree_params = {
    "max_depth": 15,
    "min_samples_leaf": 8,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:40:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 9
Created version '9' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8704950980485336
ROC AUC Score Test: 0.813433829748168
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/a1706d823eb14a8a81d155325bf15cb4
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


In [39]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
tree_params = {
    "max_depth": 15,
    "min_samples_leaf": 10,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:43:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 10
Created version '10' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8719521610989219
ROC AUC Score Test: 0.808011658208018
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/b6b1b6d468814164bc304a3cf8ac1f18
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2


In [40]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import mlflow.sklearn

# Hyper-parameters of this trial, declared once so the estimator and the
# values logged to MLflow cannot drift apart.
# NOTE(review): these are the same values as the immediately preceding cell,
# so this cell repeats that run and registers another model version for it.
tree_params = {
    "max_depth": 15,
    "min_samples_leaf": 10,
    "min_samples_split": 10,
    "max_features": "sqrt",
    "random_state": 42,
}

# Reload the preprocessing steps logged by the earlier runs; Pipeline.fit
# below refits them on X_train together with the tree.
data_cleaner = mlflow.sklearn.load_model('runs:/47787478acf44753bd12d69d9179d573/data_cleaner_model')
feature_engineer = mlflow.sklearn.load_model('runs:/ae91db9f5c8845f484b7eb28061e6f8c/feature_engineer_model')

pipeline = Pipeline([
    ('cleaner', data_cleaner),
    ('feature_engineering', feature_engineer),
    ('decision_tree', DecisionTreeClassifier(**tree_params)),
])

pipeline.fit(X_train, y_train)

# Probability of the positive (fraud) class for each row.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
y_prob_test = pipeline.predict_proba(X_test)[:, 1]

# Score before opening the run so that a scoring failure cannot leave behind a
# registered model version without metrics.
roc_auc_train = roc_auc_score(y_train, y_prob_train)
roc_auc_test = roc_auc_score(y_test, y_prob_test)

with mlflow.start_run(run_name="Decision_Tree_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="decision_tree_pipeline_model",
        registered_model_name="DecisionTreePipelineModel"
    )
    mlflow.log_params(tree_params)

    mlflow.log_metric("roc_auc_train", roc_auc_train)
    mlflow.log_metric("roc_auc_test", roc_auc_test)

    print("ROC AUC Score Train:", roc_auc_train)
    print("ROC AUC Score Test:", roc_auc_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Registered model 'DecisionTreePipelineModel' already exists. Creating a new version of this model...
2025/04/27 12:45:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreePipelineModel, version 11
Created version '11' of model 'DecisionTreePipelineModel'.


ROC AUC Score Train: 0.8719521610989219
ROC AUC Score Test: 0.808011658208018
🏃 View run Decision_Tree_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2/runs/0dd63c20a1a9466da68284e95f052bfd
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/2
