In [23]:
import pandas as pd
import xgboost as xgb
import numpy as np
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, make_scorer
from sklearn.preprocessing import minmax_scale, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold,TimeSeriesSplit, StratifiedKFold
from xgboost import plot_importance
from hyperopt import hp, fmin, tpe, Trials, space_eval

import time

In [8]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / (1024 ** 2) 
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / (1024 ** 2)
    print(f"start_mem Memory usage of df_train: {start_mem:.2f} MB")
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [57]:
df_train_trans = pd.read_csv('../input/train_transaction.csv')
#df_test_trans = pd.read_csv('../input/test_transaction.csv')

df_train_identity = pd.read_csv('../input/train_identity.csv')
#df_test_identity = pd.read_csv('../input/test_identity.csv')

sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')

df_train = df_train_trans.merge(df_train_identity, how='left', on='TransactionID')
#df_test = df_test_trans.merge(df_test_identity, how='left', on='TransactionID')

print(df_train.shape)
#print(df_test.shape)

# Standardizing column names
df_train.columns = df_train.columns.str.replace('-', '_')
#df_test.columns = df_test.columns.str.replace('-', '_')

del df_train_trans, df_train_identity


(590540, 434)


In [58]:
for col in df_train.select_dtypes(include='object').columns:
    lbl = LabelEncoder()
    df_train[col] = lbl.fit_transform(df_train[col])  # Apply label encoding

df_train.fillna(-999, inplace=True)


In [59]:
# for col in df_train.columns:
#     df_train[col] = df_train[col].fillna((df_train[col].min() - 2))
#     df_train[col] = (minmax_scale(df_train[col], feature_range=(0,1)))

pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(df_train)
df_train_pca = pd.DataFrame(X_train_pca)

In [60]:
# Reduce memory usage
df_train_pca = reduce_mem_usage(df_train_pca)

start_mem Memory usage of df_train: 135.16 MB
Mem. usage decreased to 57.44 Mb (57.5% reduction)


In [61]:
if 'TransactionDT' in df_train_pca.columns:
    print("Column exists!")
else:
    print("Column does not exist.")

Column does not exist.


In [62]:
X_train = df_train.sort_values('TransactionDT').drop(['isFraud', 
                                                      'TransactionDT', 
                                                      #'Card_ID'
                                                     ],
                                                     axis=1)
y_train = df_train.sort_values('TransactionDT')['isFraud'].astype(bool)

del df_train

In [None]:
def objective(params):
    time1 = time.time()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 7
    count=1
    tss = TimeSeriesSplit(n_splits=FOLDS)

    score_mean = 0
    for tr_idx, val_idx in tss.split(X_train, y_train):
        clf = xgb.XGBClassifier(
            n_estimators=600, 
            random_state=4, 
            tree_method='hist', 
            device='cuda', 
            eval_metric='auc',
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict_proba(X_vl)[:, 1]  # Get the predicted probabilities for the positive class
        score = roc_auc_score(y_vl, y_pred)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)


space = {
    'max_depth': hp.quniform('max_depth', 7, 23, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
}

In [64]:
# Set algoritm parameters
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=27)

# Print best parameters
best_params = space_eval(space, best)

                                                      
############## New Run ################
params = {'max_depth': 8, 'gamma': '0.699', 'subsample': '0.60', 'reg_alpha': '0.389', 'reg_lambda': '0.181', 'learning_rate': '0.109', 'colsample_bytree': '0.675'}
1 CV - score: 0.8977                                  
2 CV - score: 0.8998                                  
3 CV - score: 0.9179                                  
4 CV - score: 0.8836                                  
5 CV - score: 0.928                                   
6 CV - score: 0.9226                                  
7 CV - score: 0.9133                                  
Total Time Run: 2.16                                  
Mean ROC_AUC: 0.9089630425826669                      
                                                                                  
############## New Run ################
params = {'max_depth': 16, 'gamma': '0.132', 'subsample': '0.40', 'reg_alpha': '0.096', 'reg_lambda': '0.167', 'learning_r

In [66]:
print("BEST PARAMS: ", best_params)
best_params['max_depth'] = int(best_params['max_depth'])

clf = xgb.XGBClassifier(
    n_estimators=300,
    **best_params,
    tree_method='hist', 
    device='cuda', 
)

clf.fit(X_train, y_train)

BEST PARAMS:  {'colsample_bytree': 0.8504927104377381, 'gamma': 0.6969634040343395, 'learning_rate': 0.010993668306876906, 'max_depth': 19, 'reg_alpha': 0.06396091853149047, 'reg_lambda': 0.2590808554804609, 'subsample': 0.5}


In [67]:
feature_important = clf.get_booster().get_score(importance_type="weight")
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

# Top 10 features
data.head(20)

Unnamed: 0,score
TransactionID,27160.0
card1,25252.0
card2,19973.0
TransactionAmt,19888.0
addr1,15827.0
card5,8915.0
P_emaildomain,8634.0
D15,8621.0
C13,8312.0
id_02,7135.0


In [None]:
# If you have a validation set:
y_val_pred = clf.predict(X_val)  # Predict class labels for validation data
y_val_pred_proba = clf.predict_proba(X_val)[:, 1]  # Predict probabilities for ROC-AUC

# Evaluate model on validation data
val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_pred_proba)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

# Print validation metrics
print("\nFinal Model Performance on Validation Data:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"ROC AUC Score: {val_roc_auc:.4f}")
print("\nConfusion Matrix:")
print(val_conf_matrix)

# Print Classification Report
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

# Confusion Matrix visualization
cm = confusion_matrix(y_val, y_val_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()  