In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import spearmanr
from collections import defaultdict

In [2]:
# Function to compute a rank-based Gini coefficient (rows with nulls are dropped)
def gini_with_nulls(feature, target):
    """Compute a Gini-style score of ``target`` ordered by ``feature``.

    Rows where either ``feature`` or ``target`` is null are removed before the
    calculation, so despite the function name nulls do not contribute to the
    score.

    Parameters
    ----------
    feature : array-like
        Values used to order the observations (ascending).
    target : array-like
        Values accumulated along that ordering.

    Returns
    -------
    float or int
        ``2 * (sum(cumsum(t)) / sum(t) - (n + 1) / 2) / n`` where ``t`` is the
        target sorted by ascending feature and ``n`` the number of complete
        rows. Returns 0 when no complete rows remain or when the targets sum
        to zero.
    """
    df = pd.DataFrame({'feature': feature, 'target': target}).dropna()

    # Nothing left to rank (empty input or no complete rows). Without this
    # guard the cumulative-sum lookup below raises IndexError.
    if df.empty:
        return 0

    values = df['feature'].values
    target_values = df['target'].values

    # Order the targets by ascending feature value and accumulate them.
    sorted_indices = np.argsort(values)
    sorted_target = target_values[sorted_indices]
    cum_target = np.cumsum(sorted_target)
    cum_target_sum = cum_target[-1]

    # The score is undefined when the targets sum to zero; report 0.
    if cum_target_sum == 0:
        return 0

    n_obs = len(sorted_target)
    gini = (np.sum(cum_target) / cum_target_sum - (n_obs + 1) / 2) / n_obs
    return gini * 2

In [3]:
# Step 1: Univariate Analysis
def univariate_analysis(df, num_vars, cat_vars):
    """Summarise fill rates and quartiles per variable, dropping near-empty ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column named in ``num_vars`` and ``cat_vars``.
    num_vars, cat_vars : list of str
        Column names to profile; the two lists are concatenated.

    Returns
    -------
    pandas.DataFrame
        One row per variable with columns ``fill_rate`` (share of non-null
        values), ``zero_fill_rate`` (share of values that are null or equal
        to 0) and the ``25%``/``50%``/``75%`` quantiles of the non-null
        values. Quantiles are NaN for non-numeric columns. Variables whose
        ``zero_fill_rate`` is 0.95 or higher are removed.
    """
    stat_columns = ['fill_rate', 'zero_fill_rate', '25%', '50%', '75%']
    results = {}

    for var in num_vars + cat_vars:
        column = df[var]
        non_null_fill = column.notnull().mean()
        zero_fill = (column.isnull() | (column == 0)).mean()

        # Quantiles are only defined for numeric data; calling quantile() on a
        # string/object column raises, so those variables get NaN instead.
        if pd.api.types.is_numeric_dtype(column):
            percentiles = column.dropna().quantile([0.25, 0.50, 0.75]).to_dict()
        else:
            percentiles = {}

        results[var] = {
            'fill_rate': non_null_fill,
            'zero_fill_rate': zero_fill,
            '25%': percentiles.get(0.25, np.nan),
            '50%': percentiles.get(0.50, np.nan),
            '75%': percentiles.get(0.75, np.nan)
        }

    # Naming the columns explicitly keeps the frame well-formed (and the
    # filter below valid) even when no variables were supplied.
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=stat_columns)
    results_df = results_df[results_df['zero_fill_rate'] < 0.95]  # Drop nearly empty variables
    return results_df

In [4]:
# Step 2: Bivariate Analysis - score every variable against the target
def bivariate_analysis(df, num_vars, cat_vars, target):
    """Return a one-column frame of ``gini_with_nulls`` scores per variable.

    Each column named in ``num_vars`` followed by ``cat_vars`` is scored
    against ``df[target]``. The result is indexed by variable name and has a
    single column called ``gini``.
    """
    scores = {
        name: gini_with_nulls(df[name], df[target])
        for name in num_vars + cat_vars
    }
    return pd.DataFrame.from_dict(scores, orient='index', columns=['gini'])

In [5]:
# Step 3: Truncation Testing
def truncation_selection(df, num_vars, target, trunc_levels=(98, 95, 90, 85)):
    """Cap each numeric variable at the percentile that maximises its Gini score.

    For every variable, each percentile in ``trunc_levels`` is tried as an
    upper clipping threshold. The clipped version with the highest
    ``gini_with_nulls`` score against ``df[target]`` is kept; on ties the
    earlier level wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    num_vars : list of str
        Numeric columns to truncate.
    target : str
        Name of the target column.
    trunc_levels : sequence of numbers, optional
        Percentiles (0-100) to try, in order of preference on ties.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected clipped columns substituted.
        Variables with no row where both the variable and the target are
        non-null are left unchanged.
    """
    levels = list(trunc_levels)
    df_truncated = df.copy()

    for var in num_vars:
        column = df[var]

        # With no usable rows there is nothing to take a percentile of or to
        # score, so leave the column untouched and avoid the crash.
        has_usable_rows = (column.notna() & df[target].notna()).any()
        if not levels or not has_usable_rows:
            continue

        # All thresholds come from the same non-null values, so compute them
        # once per variable and not once per level.
        thresholds = np.percentile(column.dropna(), levels)

        # NOTE(review): selection maximises the signed score, so for a variable
        # whose score is negative this prefers the level closest to zero and
        # not the largest magnitude - confirm this is intended.
        best_gini = -np.inf
        best_data = column

        for threshold in thresholds:
            truncated_data = column.clip(upper=threshold)
            gini_score = gini_with_nulls(truncated_data, df[target])

            if gini_score > best_gini:
                best_gini = gini_score
                best_data = truncated_data

        df_truncated[var] = best_data

    return df_truncated