**Table of contents**<a id='toc0_'></a>    
- [Feature selection](#toc1_1_)    
    - [Principle Component Analysis](#toc1_1_1_)    
    - [Feature importance from tree-based models (e.g., Random Forest)](#toc1_1_2_)    
- [model](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import pandas as pd
import numpy as np
import re
from IPython.display import display, HTML
import sys
import os

import warnings
warnings.filterwarnings("ignore")
sys.path.append('../src/')

import utility
from utility import *
from sklearn.preprocessing import LabelEncoder


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

display(HTML("<style>.container { width:50% !important; }</style>"))
#display(HTML('<style>.output { max-width:800px !important; }</style>')) # control output width
display(HTML("<style>.container { width:130% !important; }</style>"))

pd.set_option('display.max_columns', None)   # Show number of columns, None means show all columns
pd.set_option('display.max_rows', 50)        # show top and bottom 15, middle with ......
pd.set_option('display.width', None)         # Set the width of the display
pd.set_option('display.max_colwidth', None)  # Show full content of each column

In [2]:
term_deposit_subscription = pd.read_csv('https://raw.githubusercontent.com/KevinJianLin/term_deposit_subscription/refs/heads/main/data/train.csv')
data_set = term_deposit_subscription.copy()
data_set.head(5)
data_profile = data_profiling(data_set)
#data_profile()
# data_profile.float_column
# data_profile.int_column
# data_profile.rest_columns
# data_profile.cat_col
target_variable = 'subscribed' 

# Prepare the feature matrix and target
X = data_set[data_profile.int_column]
y = data_set[target_variable]
label_encoder = LabelEncoder()

# Transform 'yes'/'no' to 1/0
y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,admin.,married,unknown,no,1933,no,no,telephone,19,nov,44,2,-1,0,unknown,no
1,40576,31,unknown,married,secondary,no,3,no,no,cellular,20,jul,91,2,-1,0,unknown,no
2,15320,27,services,married,secondary,no,891,yes,no,cellular,18,jul,240,1,-1,0,unknown,no
3,43962,57,management,divorced,tertiary,no,3287,no,no,cellular,22,jun,867,1,84,3,success,yes
4,29842,31,technician,married,secondary,no,119,yes,no,cellular,4,feb,380,1,-1,0,unknown,no


In [3]:
data_preprocessor_1 = ColumnTransformer(transformers = [
                                   #         ('text', TfidfVectorizer(max_features=500), text_col),
                                            ('float_col',StandardScaler(),data_profile.int_column),
                                        #    ('cat_col', OneHotEncoder(), cat_col),
                                            ]) 

label_encoder = LabelEncoder()
data_set[target_variable] = label_encoder.fit_transform(data_set[target_variable])


## <a id='toc1_1_'></a>[Feature selection](#toc0_)

### <a id='toc1_1_1_'></a>[Principle Component Analysis](#toc0_)

### <a id='toc1_1_2_'></a>[Feature importance from tree-based models (e.g., Random Forest)](#toc0_)
Permutation importance
SHAP (SHapley Additive exPlanations) values
Recursive feature elimination
Lasso regularization

### 

# <a id='toc2_'></a>[model](#toc0_)

In [4]:
best_model ={}
results = {}

# Loop through models and apply GridSearchCV
for model_name, model_infor in model_parameters_classification.items():


    # Create a pipeline for each model
    pipeline = Pipeline([('preprocessor',data_preprocessor_1),
                         ('classifier',model_infor['model'])])
    
    # Initialize GridSearchCV with model and parameter grid
    grid_search = GridSearchCV(estimator=pipeline,
                                param_grid = model_infor['params'],
                                cv = 5, # 5 fold cv
                                scoring = 'accuracy', # f1 etc
                                verbose =0, # not to display output
                                n_jobs = -1 # use all of cores
                                )
    model_trained = grid_search.fit(X_train, y_train)

    best_model[model_name] = model_trained.best_estimator_
    #print(f"Best Parameters for {model_name}: {grid_search.best_params_}")

    y_pred       = model_trained.predict(X_test)
    y_pred_proba = model_trained.predict_proba(X_test)[:, 1] # Extract probabilities for class 1



    # Basic Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred) 
    recall = recall_score(y_test, y_pred) # 
    f1 = f1_score(y_test, y_pred) # 

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Probabilistic Metrics
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    logloss = log_loss(y_test, y_pred_proba)

    # Advanced Metrics
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    results[model_name] = {'accuracy': accuracy,
                           'precision': precision,
                           'recall':recall,
                           'f1': f1,
                           'logloss': logloss,
                           'mcc': mcc,
                           'kappa': kappa,
                           'balanced_acc': balanced_acc,
                           'Best Params': grid_search.best_params_
                           }

    results_df = pd.DataFrame(results).T  # Transpose for readability


    # print(f"Classification Report for {model_name}:\n")
    # print(classification_report(y_test, y_pred, target_names = label_encoder.classes_))

    #print(model_infor['model'])
results_df   

NameError: name 'accuracy_score' is not defined