In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
%pip install mlflow
%pip install dagshub

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
df = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_curve, roc_auc_score
from sklearn.feature_selection import RFE
import xgboost as xgb
import warnings
import time
from scipy import stats
import category_encoders as ce

In [5]:
y = df['isFraud']
X = df.drop('isFraud', axis=1)

In [6]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_validation, y_train, y_validation = train_test_split(
    X_temp, y_temp, test_size=0.125, random_state=42, stratify=y_temp
)

In [7]:
class Cleaning_Data(BaseEstimator, TransformerMixin):
    def _init_(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if 'TransactionID' in X.columns:
            X = X.drop('TransactionID', axis=1)

        null_procentage = (X.isnull().sum() / len(X)) * 100
        bad_cols = null_procentage[null_procentage >= 50].index
        X = X.drop(bad_cols, axis=1)

        cat_col = X.select_dtypes(include=['object']).columns
        num_col = X.select_dtypes(include=['float64', 'int64']).columns

        # Fill categorical columns with mode 
        for col in cat_col:
            mode_val = X[col].mode()[0]
            X[col] = X[col].fillna(mode_val)

        # Fill numerical columns with median 
        for col in num_col:
            median_val = X[col].median()
            X[col] = X[col].fillna(median_val)

        return X

In [8]:
class Feature_Engineering(BaseEstimator, TransformerMixin):
    def __init__(self):
       
        self.woe_encoder = None  # Will store the trained WOE encoder
        self.categorical_features = None  # Will store names of categorical columns

    def fit(self, X, y):
        
        
        self.categorical_features = list(X.select_dtypes(include=['object']).columns)
        
        # Initialize and train WOE encoder on these columns
        self.woe_encoder = ce.WOEEncoder(cols=self.categorical_features)
        self.woe_encoder.fit(X, y)
        
        return self  

    def transform(self, X):
        
        
        return self.woe_encoder.transform(X)

In [9]:
class Correlation(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.05):
        self.threshold = threshold
        self.selected_features = []

    def _compute_feature_correlations(self, X, y):
        correlations = []
        for feature in X.columns:
            corr = np.abs(X[feature].corr(y))
            correlations.append((feature, corr))
        return pd.DataFrame(correlations, columns=["Feature", "Correlation"])

    def _select_features(self, correlation_df):
        filtered = correlation_df[correlation_df['Correlation'] > self.threshold]
        return filtered.sort_values('Correlation', ascending=False)['Feature'].tolist()

    def fit(self, X, y):
        correlation_df = self._compute_feature_correlations(X, y)
        self.selected_features = self._select_features(correlation_df)
        return self

    def transform(self, X):
        if not self.selected_features:
            raise ValueError("No features selected. Did you forget to call fit()?")
        return X[self.selected_features]

In [18]:
from sklearn.ensemble import AdaBoostClassifier

#ada_model = AdaBoostClassifier(
 #   n_estimators=100,
  #  learning_rate=0.8,
   # random_state=42
#)

ada_model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,  
    random_state=42
)

pipeline = Pipeline([
    ('clean', Cleaning_Data()),
    ('feature', Feature_Engineering()),
    ('select', Correlation()),
    ('model', ada_model)
])

pipeline.fit(X_train, y_train)

y_train_pred_proba = pipeline.predict_proba(X_train)[:, 1]
y_val_pred_proba = pipeline.predict_proba(X_validation)[:, 1]

val_auc = roc_auc_score(y_validation, y_val_pred_proba)

train_auc = roc_auc_score(y_train, y_train_pred_proba)

y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_pred_proba)


In [19]:
print(f"Validation AUC: {val_auc:.4f}")

print(f"Train AUC: {train_auc:.4f}")

print(f"Test ROC AUC: {test_auc:.4f}")

Validation AUC: 0.8073
Train AUC: 0.8044
Test ROC AUC: 0.8056


In [20]:
import dagshub
import mlflow


dagshub.init(repo_owner='Lodia15', repo_name='machineLearningLodia-HW2', mlflow=True)
mlflow.set_experiment("AdaBoost_Training")

#ada_model = AdaBoostClassifier(
 #       n_estimators=100,
  #      learning_rate=0.8,
   #     random_state=42
    #)

ada_model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,  
    random_state=42
)
    
pipeline = Pipeline([
    ('clean', Cleaning_Data()),
    ('feature', Feature_Engineering()),
    ('select', Correlation()),
    ('model', ada_model)
])

with mlflow.start_run(run_name="AdaBoost_Experiment"):
    pipeline.fit(X_train, y_train)
    
    
    y_test_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_test_pred_proba)  

    mlflow.log_metric("Test ROC AUC", auc)


    mlflow.log_param("model_type", "AdaBoost")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("learning_rate", 0.1)
    
    mlflow.log_metric("Test ROC AUC", auc)
    
    mlflow.sklearn.log_model(pipeline, artifact_path="pipeline_model")

print(f"Logged AUC: {auc:.4f}")



🏃 View run AdaBoost_Experiment at: https://dagshub.com/Lodia15/machineLearningLodia-HW2.mlflow/#/experiments/2/runs/bf2f31b4aa02453fa6446649de84ecae
🧪 View experiment at: https://dagshub.com/Lodia15/machineLearningLodia-HW2.mlflow/#/experiments/2
Logged AUC: 0.8056
