In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fraud-detection/sample_submission.csv
/kaggle/input/fraud-detection/test_identity.csv
/kaggle/input/fraud-detection/train_identity.csv
/kaggle/input/fraud-detection/test_transaction.csv
/kaggle/input/fraud-detection/train_transaction.csv


In [5]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [6]:
# Load the two training tables from the competition input directory.
# Both frames contain a 'TransactionID' column (see the column listings below).
Identity = pd.read_csv('/kaggle/input/fraud-detection/train_identity.csv')
Transaction = pd.read_csv('/kaggle/input/fraud-detection/train_transaction.csv')

In [17]:
# Lift pandas' cap on the number of displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

# Print every column name of the transaction table as a plain Python list.
print(Transaction.columns.tolist())

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V

In [8]:
# Display the column labels of the identity table.
Identity.columns

Index(['TransactionID', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06',
       'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14',
       'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22',
       'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object')

In [9]:
# Left join on the shared key: every transaction is kept, and identity columns
# are NaN for transactions that have no identity record.
df = Transaction.merge(Identity, on='TransactionID', how='left')

In [18]:
# Sanity check: the left merge should keep the row count of Transaction and only add columns.
df.shape, Transaction.shape

((590540, 434), (590540, 394))

In [22]:
from sklearn.model_selection import GroupShuffleSplit

# Separate the target from the predictors.
# NOTE(review): the split is built from `Transaction`, not from the merged `df`,
# so the identity columns are not part of X — confirm this is intended.
y = Transaction['isFraud']
X = Transaction.drop(columns=['isFraud'])

# Rows sharing the same card1/addr1 pair form one group, so that the grouped
# splitter keeps each pair entirely on one side of the split.
groups = X['card1'].astype(str) + "_" + X['addr1'].astype(str)

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# MLflow

In [26]:
!pip install mlflow dagshub
import mlflow
import dagshub

dagshub.init(repo_owner='Givi-Modebadze', repo_name='Fraud_Detection_ML', mlflow=True)

experiment = "Fraud Detection Logistic Regression"
mlflow.set_experiment(experiment)



<Experiment: artifact_location='mlflow-artifacts:/5b7e912fa380435c841b92ebb611fa9f', creation_time=1745661718738, experiment_id='0', last_update_time=1745661718738, lifecycle_stage='active', name='Fraud Detection Logistic Regression', tags={}>

# Cleaning

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataCleaner(BaseEstimator, TransformerMixin):
    """Drop columns with too many missing values and impute the rest with their mode.

    Parameters
    ----------
    drop_threshold : float
        A column is dropped when its fraction of missing values is greater
        than or equal to this value.

    Attributes (populated by ``fit``)
    ---------------------------------
    drop_cols : list
        Names of the columns removed by ``transform``.
    cols_to_drop_ : list
        Copy of ``drop_cols`` kept under the name declared in ``__init__``.
    fill_values : dict
        Maps each retained column to the value used to fill its missing
        entries: the first mode, or NaN when the column has no mode.
    """

    def __init__(self, drop_threshold: float):
        self.drop_threshold = drop_threshold
        self.cols_to_drop_ = []
        self.fill_values = {}

    def fit(self, X, y=None):
        """Learn which columns to drop and the fill value of every other column.

        All learned state is rebuilt from scratch, so fitting the same instance
        again on a frame with different columns leaves no stale entries behind.
        """
        # Fraction of missing values per column, computed in one vectorised pass.
        missing_ratio = X.isna().mean()
        self.drop_cols = missing_ratio[missing_ratio >= self.drop_threshold].index.tolist()
        self.cols_to_drop_ = list(self.drop_cols)

        X_filtered = X.drop(columns=self.drop_cols)

        fill_values = {}
        for col in X_filtered.columns:
            mode = X_filtered[col].mode()
            # mode() is empty for an all-NaN column; fall back to NaN (nothing to impute with).
            fill_values[col] = mode[0] if not mode.empty else np.nan
        self.fill_values = fill_values
        return self

    def transform(self, X):
        """Return a new frame without the dropped columns and with missing values filled.

        The input frame is not modified.
        """
        # drop() already returns a new frame, so no extra copy of the input is needed.
        X = X.drop(columns=self.drop_cols)
        for col, value in self.fill_values.items():
            X[col] = X[col].fillna(value)
        return X

In [31]:
# Fit the cleaner on the training split only, then apply it to that split.
data_cleaner = DataCleaner(drop_threshold=0.9)

data_cleaner.fit(X_train, y_train)

X_cleaned = data_cleaner.transform(X_train)

# Persist the fitted cleaner and record its configuration in a dedicated run.
with mlflow.start_run(run_name="Logistic_Regression_Clean") as run:
    mlflow.sklearn.log_model(
        sk_model=data_cleaner,
        artifact_path="data_cleaner_model",
        registered_model_name="DataCleaner",
    )
    # Log the threshold actually held by the estimator instead of repeating the
    # literal, so the logged value cannot drift from the constructor argument.
    mlflow.log_param("drop_threshold", data_cleaner.drop_threshold)
    mlflow.log_param("n_columns_after_cleaning", X_cleaned.shape[1])

Registered model 'DataCleaner' already exists. Creating a new version of this model...
2025/04/26 17:47:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DataCleaner, version 5
Created version '5' of model 'DataCleaner'.


🏃 View run Logistic_Regression_Clean at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0/runs/7b104951fb504efca542f6b36578a3ef
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0


# Feature Engineering

In [33]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Integer-encode every object/category column with a per-column LabelEncoder.

    Attributes (populated by ``fit``)
    ---------------------------------
    encoders : dict
        Maps each encoded column name to its fitted ``LabelEncoder``.

    Notes
    -----
    Categories that were not present during ``fit`` are encoded as
    ``UNKNOWN_CODE`` at transform time instead of raising ``ValueError``.
    Categories seen during ``fit`` keep the codes ``LabelEncoder`` assigns.
    """

    # Code given at transform time to categories that were not seen during fit.
    UNKNOWN_CODE = -1

    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one LabelEncoder per object/category column of ``X``.

        The encoder mapping is rebuilt on every call, so refitting on a frame
        with different columns does not keep encoders for columns that are gone.
        """
        encoders = {}
        for col in X.select_dtypes(include=['object', 'category']).columns:
            le = LabelEncoder()
            le.fit(X[col])
            encoders[col] = le
        self.encoders = encoders
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose encoded columns hold integer codes."""
        X = X.copy()
        for col, le in self.encoders.items():
            values = X[col]
            # Only rows holding a category known to the encoder can go through
            # le.transform(); every other row keeps the UNKNOWN_CODE default.
            known = values.isin(le.classes_).to_numpy()
            codes = np.full(len(values), self.UNKNOWN_CODE, dtype=np.int64)
            if known.any():
                codes[known] = le.transform(values[known])
            X[col] = codes
        return X


In [34]:
from sklearn.pipeline import Pipeline

# Chain the categorical encoding with standardisation so that both steps are
# fitted, applied and persisted as a single object.
feature_engineer_scaler = Pipeline(steps=[
    ('feature_engineering', FeatureEngineer()),
    ('scaler', StandardScaler()),
])

feature_engineer_scaler.fit(X_cleaned, y_train)
X_engineered_scaled = feature_engineer_scaler.transform(X_cleaned)

# Column statistics of the cleaned frame, recorded with the run below.
n_label_encoded_cols = X_cleaned.select_dtypes(include=['object', 'category']).shape[1]
n_numeric_cols = X_cleaned.select_dtypes(include=['number']).shape[1]
total_cols = X_cleaned.shape[1]

with mlflow.start_run(run_name="Logistic_Regression_Feature_Engineering_Scaler") as run:
    mlflow.sklearn.log_model(
        sk_model=feature_engineer_scaler,
        artifact_path="feature_engineer_scaler_model",
        registered_model_name="FeatureEngineerScaler",
    )

    mlflow.log_param("n_label_encoded_columns", n_label_encoded_cols)
    mlflow.log_param("n_numeric_columns_scaled", n_numeric_cols)
    mlflow.log_param("total_columns", total_cols)

Registered model 'FeatureEngineerScaler' already exists. Creating a new version of this model...
2025/04/26 17:51:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeatureEngineerScaler, version 2
Created version '2' of model 'FeatureEngineerScaler'.


🏃 View run Logistic_Regression_Feature_Engineering_Scaler at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0/runs/d128ab90994e4693bf10780426a3c05b
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0


# Feature Selection

In [36]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, mutual_info_classif

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the ``k`` features with the highest mutual information with the target.

    Parameters
    ----------
    k : int, default=20
        Number of features to keep (forwarded to ``SelectKBest``).
    random_state : int or None, default=42
        Seed forwarded to ``mutual_info_classif``. The estimate is randomised,
        so a fixed seed makes the selected feature set reproducible across
        runs. Pass ``None`` for unseeded behaviour.

    Attributes
    ----------
    selector : SelectKBest or None
        The fitted selector; ``None`` until ``fit`` has been called.
    """

    def __init__(self, k: int = 20, random_state=42):
        self.k = k
        self.random_state = random_state
        self.selector = None

    def fit(self, X, y):
        """Score every feature of ``X`` against ``y`` and remember the best ``k``."""
        # Local import keeps this cell self-contained; partial() stays picklable,
        # unlike a lambda, which matters when the fitted object is serialised.
        from functools import partial

        score_func = partial(mutual_info_classif, random_state=self.random_state)
        self.selector = SelectKBest(score_func=score_func, k=self.k)
        self.selector.fit(X, y)
        return self

    def _fitted_selector(self):
        """Return the fitted selector, or raise NotFittedError if fit() has not run."""
        if self.selector is None:
            # NotFittedError subclasses both ValueError and AttributeError, so code
            # that caught the previous AttributeError on None keeps working.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "FeatureSelector is not fitted yet; call fit() before using it."
            )
        return self.selector

    def transform(self, X):
        """Reduce ``X`` to the columns selected during ``fit``."""
        return self._fitted_selector().transform(X)

    def get_support(self):
        """Return the selector's mask of the features kept during ``fit``."""
        return self._fitted_selector().get_support()


In [41]:
# Fit the selector on the scaled training matrix and keep its 100 best features.
feature_selector = FeatureSelector(k=100).fit(X_engineered_scaled, y_train)
X_selected = feature_selector.transform(X_engineered_scaled)

# Feature counts before and after selection, recorded with the run below.
n_features_in = X_engineered_scaled.shape[1]
n_features_out = X_selected.shape[1]

with mlflow.start_run(run_name="Logistic_Regression_Feature_Selection") as run:
    mlflow.sklearn.log_model(
        sk_model=feature_selector,
        artifact_path="feature_selector_model",
        registered_model_name="FeatureSelector",
    )

    mlflow.log_param("k_best_features", feature_selector.k)
    mlflow.log_param("input_features_before_selection", n_features_in)
    mlflow.log_param("output_features_after_selection", n_features_out)

Registered model 'FeatureSelector' already exists. Creating a new version of this model...
2025/04/26 18:44:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FeatureSelector, version 2
Created version '2' of model 'FeatureSelector'.


🏃 View run Logistic_Regression_Feature_Selection at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0/runs/353ea56d06bb4a688ccb91829dc03520
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0


# Training

In [43]:
from sklearn.linear_model import LogisticRegression

# 'saga' is a stochastic solver: fixing random_state makes the fitted
# coefficients (and every metric derived from them) reproducible across runs.
model = LogisticRegression(max_iter=1000, solver='saga', random_state=42)
model.fit(X_selected, y_train)

with mlflow.start_run(run_name="Logistic_Regression_Model") as run:
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="logistic_regression_model",
        registered_model_name="LogisticRegressionModel",
    )

    # Record the hyperparameters next to the model, as the other stages do.
    mlflow.log_param("solver", model.solver)
    mlflow.log_param("max_iter", model.max_iter)
    mlflow.log_param("random_state", model.random_state)

Registered model 'LogisticRegressionModel' already exists. Creating a new version of this model...
2025/04/26 19:01:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegressionModel, version 3
Created version '3' of model 'LogisticRegressionModel'.


🏃 View run Logistic_Regression_Model at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0/runs/2270d0202c4a4949a37775a3871bb109
🧪 View experiment at: https://dagshub.com/Givi-Modebadze/Fraud_Detection_ML.mlflow/#/experiments/0


# Prediction

In [44]:
import mlflow.sklearn

# Load every stage from the model registry instead of hard-coded run IDs.
# Run IDs change on each execution, so fixed IDs would silently load the
# artifacts of an old session after "Restart & Run All"; the 'latest' alias
# resolves to the version registered most recently under each name.
data_cleaner = mlflow.sklearn.load_model('models:/DataCleaner/latest')
feature_engineer_scaler = mlflow.sklearn.load_model('models:/FeatureEngineerScaler/latest')
feature_selector = mlflow.sklearn.load_model('models:/FeatureSelector/latest')
log_reg = mlflow.sklearn.load_model('models:/LogisticRegressionModel/latest')

# Push the held-out split through the same fitted stages, in training order.
X_test_cleaned = data_cleaner.transform(X_test)
X_test_engineered_scaled = feature_engineer_scaler.transform(X_test_cleaned)
X_test_selected = feature_selector.transform(X_test_engineered_scaled)

# Column 1 of predict_proba is the probability of the positive class (isFraud == 1).
y_prob = log_reg.predict_proba(X_test_selected)[:, 1]

y_pred = log_reg.predict(X_test_selected)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [47]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve of the positive-class probabilities on the held-out split.
roc_score = roc_auc_score(y_true=y_test, y_score=y_prob)
print("ROC AUC Score:", roc_score)

ROC AUC Score: 0.8080556153733282
