In [36]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ==== VIF (variance inflation factor) helper ====
def compute_vif(df):
    """Compute the variance inflation factor (VIF) of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix whose columns can all be converted to float.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature`` (the column label) and ``VIF`` (its inflation
        factor), with one row per column of ``df`` in the original order.

    Raises
    ------
    ValueError
        If a column cannot be converted to float.
    """
    # Build the design matrix once: ``df.values`` is re-materialised on every
    # access, and the explicit float cast keeps bool/int/object-typed numeric
    # columns from reaching statsmodels as an object array.
    exog = df.to_numpy(dtype=float)
    return pd.DataFrame({
        "feature": df.columns,
        "VIF": [variance_inflation_factor(exog, i) for i in range(exog.shape[1])],
    })

# ==== Classify features by type ====
def get_feature_types(df):
    """Bucket the columns of ``df`` by their number of distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple of four lists
        ``(continuous, binary, onehot, rare)`` column labels, each in the
        column order of ``df``:

        * ``continuous`` - more than 20 distinct values
        * ``binary``     - exactly 2 distinct values
        * ``onehot``     - between 3 and 20 distinct values
        * ``rare``       - the least frequent value covers less than 1% of the
          counted rows; independent of the three buckets above, so a column
          may appear here as well as in one of them

        Columns with fewer than 2 distinct values land in none of the first
        three buckets.
    """
    continuous, binary, onehot, rare = [], [], [], []
    for col in df.columns:
        n_distinct = df[col].nunique()
        if n_distinct > 20:
            continuous.append(col)
        elif n_distinct == 2:
            binary.append(col)
        elif 3 <= n_distinct <= 20:
            onehot.append(col)

        # Share of rows held by the least common value of this column.
        smallest_share = df[col].value_counts(normalize=True).min()
        if smallest_share < 0.01:
            rare.append(col)
    return continuous, binary, onehot, rare
# For logistic regression feature selection
#Step 1: Variance Threshold by feature type
def variance_threshold_by_type(df, binary_threshold=0.01, onehot_threshold=0.0005):
    """Drop near-constant columns, using a variance cut-off per feature type.

    Columns are bucketed with ``get_feature_types``. Binary and one-hot style
    columns are filtered with ``VarianceThreshold``; continuous columns are
    passed through unchanged. Columns flagged as rare are only reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature matrix.
    binary_threshold : float, default 0.01
        Variance cut-off applied to the binary columns.
    onehot_threshold : float, default 0.0005
        Variance cut-off applied to the 3-20 distinct-value columns.

    Returns
    -------
    pandas.DataFrame
        Continuous, surviving binary and surviving one-hot columns (in that
        order), indexed exactly like ``df``.

    Raises
    ------
    ValueError
        Propagated from ``VarianceThreshold`` when a non-empty bucket has no
        column above its threshold.
    """
    continuous, binary, onehot, rare = get_feature_types(df)

    print("Continuous:", continuous)
    print("Binary:", binary)
    print("One-hot:", onehot)
    print("Rare:", rare)

    def _filter_by_variance(columns, threshold):
        # Keep the columns of `columns` whose variance exceeds `threshold`.
        if not columns:
            # VarianceThreshold rejects a zero-column input; an empty frame
            # that still carries df's index concatenates cleanly below.
            return df[[]]
        selector = VarianceThreshold(threshold=threshold)
        values = selector.fit_transform(df[columns])
        kept = [col for col, keep in zip(columns, selector.get_support()) if keep]
        # Re-attach the original index: without it the frame gets a fresh
        # RangeIndex and the axis=1 concat misaligns rows for any frame whose
        # index is not 0..n-1 (e.g. after a shuffled train/validation split).
        return pd.DataFrame(values, columns=kept, index=df.index)

    # Binary columns: threshold typically in the 1%-5% range.
    df_binary = _filter_by_variance(binary, binary_threshold)

    # One-hot encoded columns: threshold typically 0.0005-0.005.
    df_onehot = _filter_by_variance(onehot, onehot_threshold)

    # Continuous columns are kept as they are.
    df_cont = df[continuous]

    return pd.concat([df_cont, df_binary, df_onehot], axis=1)
#Step 2: Calculate correlations
def compute_correlations(df, target):
    """Rank the columns of ``df`` by absolute correlation with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix.
    target : pandas.Series
        Values to correlate each column against (aligned on the index by
        ``DataFrame.corrwith``).

    Returns
    -------
    pandas.Series
        Absolute correlation per column, sorted from strongest to weakest.
    """
    return df.corrwith(target).abs().sort_values(ascending=False)
# Step 3: Remove high VIF features
def remove_high_vif(df, threshold=10):
    """Iteratively drop the column with the largest VIF until all are acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; it is copied, never modified.
    threshold : float, default 10
        Columns are removed, one per iteration, while the largest VIF
        exceeds this value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the removed columns.
    """
    remaining = df.copy()
    vif = compute_vif(remaining)
    max_vif = vif["VIF"].max()
    # VIFs are recomputed after every removal because dropping one column
    # changes the inflation factor of all the others.
    while max_vif > threshold:
        col_to_remove = vif.loc[vif["VIF"].idxmax(), "feature"]
        print(f"Removing {col_to_remove} with VIF={max_vif:.2f}")
        remaining = remaining.drop(columns=[col_to_remove])
        vif = compute_vif(remaining)
        max_vif = vif["VIF"].max()
    return remaining
#Step 4: Select K best features
def statistical_selection(df, target, k=20, random_state=42):
    """Select features that rank in the top ``k`` by chi-squared or mutual information.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. ``chi2`` requires non-negative values.
    target : array-like
        Class labels, one per row of ``df``.
    k : int or "all", default 20
        Number of features each scorer keeps. Values larger than the number
        of columns are clamped to it.
    random_state : int or None, default 42
        Seed passed to ``mutual_info_classif`` so repeated runs pick the same
        features.

    Returns
    -------
    list
        Union of the two top-``k`` sets, in the column order of ``df``.
        Empty when ``df`` has no columns.
    """
    n_features = df.shape[1]
    if n_features == 0:
        # Nothing to score; SelectKBest cannot be fitted on zero columns.
        return []

    # Depending on the scikit-learn version, k > n_features is either
    # rejected or triggers a warning; clamping avoids both.
    if k != "all":
        k = min(k, n_features)

    def _seeded_mutual_info(X, y):
        # mutual_info_classif is stochastic unless a seed is supplied.
        return mutual_info_classif(X, y, random_state=random_state)

    selector_chi2 = SelectKBest(score_func=chi2, k=k).fit(df, target)
    selector_mi = SelectKBest(score_func=_seeded_mutual_info, k=k).fit(df, target)

    chosen = set(df.columns[selector_chi2.get_support()])
    chosen |= set(df.columns[selector_mi.get_support()])

    # Walk the original columns instead of returning list(set): set iteration
    # order is not stable across runs, which made the output order vary.
    return [col for col in df.columns if col in chosen]
# Step 5: L1 Regularization
def l1_feature_selection(df, target, C=0.1):
    """Keep the features that receive a non-zero weight in an L1 logistic model.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; it is standardised before fitting.
    target : array-like
        Class labels, one per row of ``df``.
    C : float, default 0.1
        Inverse regularisation strength handed to ``LogisticRegression``.

    Returns
    -------
    selected : list
        Labels of the columns whose fitted coefficient is not zero.
    coef : pandas.Series
        Fitted coefficient of every column, indexed by column label.
    """
    X_scaled = StandardScaler().fit_transform(df)

    model = LogisticRegression(penalty="l1", solver="liblinear", C=C)
    model.fit(X_scaled, target)

    # coef_ has one row per fitted decision function; the first row is used.
    coef = pd.Series(model.coef_[0], index=df.columns)
    nonzero_mask = coef != 0
    selected = coef.index[nonzero_mask].tolist()

    return selected, coef
# Feature Selection for tree-based models
#Step 1: Variance Threshold by feature type
def tree_model_variance(df):
    """Variance-filter ``df`` for tree models.

    Delegates to ``variance_threshold_by_type`` with its default settings and
    returns that function's result unchanged.
    """
    filtered = variance_threshold_by_type(df)
    return filtered
#Step 2: Feature Selection 
def tree_model_feature_selection(df, target, k=30):
    """Run the tree-model feature selection: variance filter, then statistical ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix.
    target : array-like
        Class labels, one per row of ``df``.
    k : int, default 30
        Forwarded to ``statistical_selection``.

    Returns
    -------
    pandas.DataFrame
        The variance-filtered frame restricted to the selected columns.
    """
    variance_filtered = tree_model_variance(df)
    keep = statistical_selection(variance_filtered, target, k=k)
    return variance_filtered[keep]


In [37]:
def logistic_regression_feature_selection(df, target):
    """Run the five-step feature selection used for logistic regression.

    Steps, each announced on stdout:

    1. variance filtering per feature type (``variance_threshold_by_type``)
    2. report of the 20 strongest absolute correlations with ``target``
       (informational only; nothing is dropped here)
    3. removal of high-VIF columns (``remove_high_vif``)
    4. chi-squared / mutual-information ranking (``statistical_selection``, k=20)
    5. L1-regularised logistic regression (``l1_feature_selection``)

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix.
    target : pandas.Series
        Class labels, one per row of ``df``.

    Returns
    -------
    list
        Labels of the features that survive all five steps.
    """
    print("\n=== B1: Variance Threshold ===")
    variance_filtered = variance_threshold_by_type(df)

    print("\n=== B2: Correlation with TARGET ===")
    corr = compute_correlations(variance_filtered, target)
    print(corr.head(20))

    print("\n=== B3: Remove high VIF ===")
    low_vif = remove_high_vif(variance_filtered)

    print("\n=== B4: Chi2 + Mutual Information ===")
    selected_stat = statistical_selection(low_vif, target, k=20)
    stat_filtered = low_vif[selected_stat]

    print("\n=== B5: L1 Logistic Regression ===")
    # The coefficient series is not needed here, only the surviving labels.
    final_features, _ = l1_feature_selection(stat_filtered, target)

    # Message text restored from its mis-decoded (mojibake) form.
    print("\n🎉 FEATURES CUỐI CÙNG:", final_features)

    return final_features


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../raw_data/train.csv')

In [7]:
import importlib
import pipeline
importlib.reload(pipeline)
import sys
sys.path.append("..")
from processing.function import *
from pipeline import *
def load_preprocessing_pipeline(filepath: str):
    """
    Read a previously saved preprocessing pipeline back from disk.

    Parameters:
    -----------
    filepath : str
        Location of the serialized pipeline file

    Returns:
    --------
    pipeline : object
        Whatever ``joblib.load`` deserializes from ``filepath``
        (expected to be an sklearn.pipeline.Pipeline)
    """
    # A distinct local name avoids shadowing the imported ``pipeline`` module.
    loaded_pipeline = joblib.load(filepath)
    print(f"Preprocessing pipeline loaded from {filepath}")
    return loaded_pipeline
# Location of the serialized preprocessing pipeline, relative to this notebook.
pipeline_path = '../models/preprocessing_pipeline.pkl'

try:
    preprocessing_pipeline = load_preprocessing_pipeline(pipeline_path)
except FileNotFoundError as exc:
    # Re-raise with an actionable message. Chaining with ``from`` marks the
    # original error as the direct cause instead of a failure in this handler.
    raise FileNotFoundError(
        f"Preprocessing pipeline not found at '{pipeline_path}'. "
        "Please ensure the pipeline is saved at this location."
    ) from exc

Preprocessing pipeline loaded from ../models/preprocessing_pipeline.pkl


In [59]:

# Work on a copy, then hold out 20% of the rows for validation; stratifying on
# TARGET keeps the class proportions the same in both splits.
df = df.copy()
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['TARGET'],
)

In [60]:
# Display the size of the training split as (n_rows, n_columns).
train_df.shape

(172206, 75)

In [64]:
# Fit the preprocessing steps on the training split, then apply the fitted
# steps to the validation split without refitting.
# NOTE(review): fit_transform presumably refits the pipeline object that was
# loaded from disk, replacing any fitted state it carried - confirm intended.
train_processed = preprocessing_pipeline.fit_transform(train_df)
val_processed = preprocessing_pipeline.transform(val_df)

Label encoding: CODE_GENDER (2 categories)
Label encoding: FLAG_OWN_CAR (2 categories)
Label encoding: FLAG_OWN_REALTY (2 categories)
One-hot encoding: NAME_CONTRACT_TYPE (2 categories)
One-hot encoding: NAME_INCOME_TYPE (8 categories)
One-hot encoding: NAME_FAMILY_STATUS (6 categories)
One-hot encoding: NAME_HOUSING_TYPE (6 categories)
Ordinal encoding: NAME_EDUCATION_TYPE with order ['Lower secondary', 'Secondary / secondary special', 'Incomplete higher', 'Higher education', 'Academic degree']
frequency map for ORGANIZATION_TYPE: {'Business Entity Type 3': 0.22190864429810808, 'XNA': 0.18050474431785188, 'Self-employed': 0.12360777208691916, 'Other': 0.054382541839424876, 'Medicine': 0.03664796813119171, 'Business Entity Type 2': 0.03456906263428684, 'Government': 0.033663170853512656, 'School': 0.029075641963694644, 'Trade: type 7': 0.02505719893615786, 'Kindergarten': 0.022519540550271187, 'Construction': 0.021392982822898157, 'Business Entity Type 1': 0.01944763829367153, 'Transpo