**Table of contents**<a id='toc0_'></a>    
- [Feature selection](#toc1_1_)    
    - [Principle Component Analysis](#toc1_1_1_)    
    - [Feature importance from tree-based models (e.g., Random Forest)](#toc1_1_2_)    
- [model](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import pandas as pd
import numpy as np
import re
from IPython.display import display, HTML
import sys
import os

import warnings
warnings.filterwarnings("ignore")
sys.path.append('../src/')

import utility
from utility import *
from sklearn.preprocessing import LabelEncoder


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

display(HTML("<style>.container { width:50% !important; }</style>"))
#display(HTML('<style>.output { max-width:800px !important; }</style>')) # control output width
display(HTML("<style>.container { width:130% !important; }</style>"))

pd.set_option('display.max_columns', None)   # Show number of columns, None means show all columns
pd.set_option('display.max_rows', 50)        # show top and bottom 15, middle with ......
pd.set_option('display.width', None)         # Set the width of the display
pd.set_option('display.max_colwidth', None)  # Show full content of each column

In [2]:
credit_card_fraud =  combine_csv_file('https://raw.githubusercontent.com/KevinJianLin/credit_Card_Fraud_Detection/refs/heads/main/Data/df_',6)
data_set = credit_card_fraud.copy()
data_set.head(5)
data_profile = data_profiling(data_set)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
target_variable = 'Class' 
X = data_set[data_profile.float_column]
y = data_set[target_variable]
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, random_state=42)



## <a id='toc1_1_'></a>[Feature selection](#toc0_)

### <a id='toc1_1_1_'></a>[Principle Component Analysis](#toc0_)

### <a id='toc1_1_2_'></a>[Feature importance from tree-based models (e.g., Random Forest)](#toc0_)
Permutation importance
SHAP (SHapley Additive exPlanations) values
Recursive feature elimination
Lasso regularization

# <a id='toc2_'></a>[model](#toc0_)

In [4]:
preprocessor = ColumnTransformer(transformers = [
                                   #         ('text', TfidfVectorizer(max_features=500), text_col),
                                               ('float_col',StandardScaler(),data_profile.float_column),
                                          #('int_col',StandardScaler(), data_profile.int_column[3:-1]),
                                          # ('int_col',StandardScaler(), data_profile.int_column[0:3]),
                                        #    ('cat_col', OneHotEncoder(), cat_col),
                                            ]) 

In [5]:
best_model ={}
results = {}

# Loop through models and apply GridSearchCV
for model_name, model_infor in model_parameters_classification.items():


    # Create a pipeline for each model
    pipeline = Pipeline([('preprocessor',preprocessor),
                         ('classifier',model_infor['model'])])
    
    # Initialize GridSearchCV with model and parameter grid
    grid_search = GridSearchCV(estimator=pipeline,
                                param_grid = model_infor['params'],
                                cv = 5, # 5 fold cv
                                scoring = 'accuracy', # f1 etc
                                verbose =0, # not to display output
                                n_jobs = -1 # use all of cores
                                )
    model_trained = grid_search.fit(X_train, y_train)

    best_model[model_name] = model_trained.best_estimator_
    #print(f"Best Parameters for {model_name}: {grid_search.best_params_}")

    y_pred       = model_trained.predict(X_test)
    y_pred_proba = model_trained.predict_proba(X_test)[:, 1] # Extract probabilities for class 1



    # Basic Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred) 
    recall = recall_score(y_test, y_pred) # 
    f1 = f1_score(y_test, y_pred) # 

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Probabilistic Metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    logloss = log_loss(y_test, y_pred_proba)

    # Advanced Metrics
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    results[model_name] = {'accuracy': accuracy,
                           'precision': precision,
                           'recall':recall,
                           'f1': f1,
                           'logloss': logloss,
                           'mcc': mcc,
                           'kappa': kappa,
                           'balanced_acc': balanced_acc,
                           'Best Params': grid_search.best_params_
                           }

    results_df = pd.DataFrame(results).T  # Transpose for readability


    # print(f"Classification Report for {model_name}:\n")
    # print(classification_report(y_test, y_pred, target_names = label_encoder.classes_))

    #print(model_infor['model'])
results_df   

Unnamed: 0,accuracy,precision,recall,f1,logloss,mcc,kappa,balanced_acc,Best Params
decisiontree_classifier,0.998821,0.70904,0.537473,0.611449,0.040194,0.616758,0.61087,0.768546,"{'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2}"
randomforest_classifier,0.999124,0.857143,0.591006,0.69962,0.013427,0.711346,0.699196,0.795418,"{'classifier__bootstrap': True, 'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__max_features': None, 'classifier__max_samples': 0.2, 'classifier__n_estimators': 10}"
adaboost_classifier,0.999054,0.80758,0.593148,0.683951,0.170893,0.691665,0.683488,0.796452,"{'classifier__algorithm': 'SAMME', 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100}"
catboost_classifier,0.999202,0.851541,0.650964,0.737864,0.003797,0.744152,0.737471,0.825384,"{'classifier__depth': 4, 'classifier__l2_leaf_reg': 3, 'classifier__learning_rate': 0.01}"
xgboost,0.999268,0.868493,0.678801,0.762019,0.003714,0.767465,0.761658,0.839312,"{'classifier__colsample_bytree': 1, 'classifier__eta': 0.1, 'classifier__gamma': 0, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__subsample': 1.0}"
lgb_classifier,0.999213,0.832461,0.680942,0.749117,0.008741,0.752519,0.748726,0.840353,"{'classifier__bagging_fraction': 0.5, 'classifier__colsample_bytree': None, 'classifier__feature_fraction': 0.5, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 5, 'classifier__min_child_samples': 10, 'classifier__num_leaves': 15}"
mlp_classifier,0.999372,0.844548,0.779443,0.81069,0.003415,0.81103,0.810376,0.889598,"{'classifier__activation': 'tanh', 'classifier__alpha': 1e-05, 'classifier__hidden_layer_sizes': (100,), 'classifier__learning_rate_init': 0.01, 'classifier__max_iter': 200, 'classifier__solver': 'adam'}"
