$Loan$ $Eligibility$ $Prediction$

In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging
from logging.handlers import TimedRotatingFileHandler
warnings.filterwarnings('ignore')

In [None]:
def setup_logger(name: str, log_filename: str | Path, level = logging.INFO) -> logging.Logger:
    ''' Setup a dedicated timedrotatingfilehandler logging system that logs information to both file and console

    Args: 
        name : logger name (e.g. EDA, preprocessing, feature_engineering)
        log_filename: Log output file
        level: Logging level (e.g. INFO, WARNING, ERROR, DEBUG)

    Examples:
        log = setup_logger(name="EDA",log_filename="logs/EDA_pipeline.log", level=logging.INFO)
        log.info("Dedicated logging system setup successful")
    '''
    log = logging.getLogger(name)
    # prevent adding handlers multiple times if handlers already exist
    if log.handlers:
        return log
    
    formatter = logging.Formatter(
        "%(asctime)s - %(levelname)s : %(message)s",
        datefmt='%Y-%m-%d %H:%M:%S'
        )
    # Time rotating file handler
    file_handler = TimedRotatingFileHandler(
        filename=log_filename,
        when='midnight',
        interval=1,
        backupCount=7
    )
    file_handler.suffix = "_%Y%m%d"
    file_handler.setFormatter(formatter)
    
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)

    log.propagate = False # don't propagate to root logger
    log.setLevel(level)

    log.addHandler(file_handler)
    log.addHandler(console_handler)
    
    return log

In [None]:
# Notebook-wide logger: records go to ../logs/notebook_eda.log and to the console.
log = setup_logger(name='notebook_eda', log_filename='../logs/notebook_eda.log')

`Basic Descriptive Summary`

In [None]:
# Load the raw dataset. Every failure is logged and then re-raised so that
# later cells never run with `df` undefined.
try:
    df = pd.read_csv('../data/raw/LEP.csv')
except FileNotFoundError:
    log.error('File not found! Check filepath and try again')
    raise
except Exception as e:
    log.error(f'Error parsing CSV file: {e}',exc_info=True)
    raise

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Descriptive statistics for the numeric features, one row per feature.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# The first numeric column is skipped (it is treated as an identifier in this notebook).
summary = df[numeric_cols[1:]].describe().T
summary['range'] = summary['max'] - summary['min']
# Coefficient of variation is dispersion relative to the mean: std / mean.
summary['cv'] = (summary['std'] / summary['mean']).round(4)
summary

In [None]:
# The three most frequent Education values with their row counts, as a plain dict.
df['Education'].value_counts().head(3).to_dict()

In [None]:
# Per-column profile of the non-numeric features.
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
summary_data = []
for col in categorical_cols:
    # value_counts() ignores NaN and sorts by descending frequency; compute it
    # (and the mode) once per column instead of once per field.
    counts = df[col].value_counts()
    modes = df[col].mode()
    summary_data.append({
        'column' : col,
        'unique' : df[col].nunique(),
        'most_frequent' : modes.iloc[0] if not modes.empty else None,
        # Guard on the counts themselves: an all-NaN column has rows but no counts.
        'most_frequent_count' : counts.iloc[0] if not counts.empty else 0,
        'top_5_values' : counts.head().to_dict(),
    })
summary_data

In [None]:
# Print the DataFrame's structural overview (pandas `DataFrame.info`).
df.info()

In [None]:
# Row count of the dataset.
log.info(f'Number of observations: {df.shape[0]}')

In [None]:
# Column count of the dataset (every column is counted, whatever its role).
log.info(f'Number of features: {df.shape[1]}')

In [None]:
# Summary statistics for every column whose dtype is not `object`, one row per column.
df.describe(exclude='object').T

In [None]:
# Summary statistics for the non-numeric columns, one row per column.
df.describe(exclude=np.number).T

`Numerical Columns`

In [None]:
# Gather the names of all numeric-dtype columns and log them between two rules.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
for line in ('NUMERICAL COLUMNS', '=' * 30, numeric_columns, '=' * 30):
    log.info(line)

In [None]:
# One log line per numeric column showing its smallest and largest value.
rule = '=' * 50
log.info(rule)
for position, name in enumerate(numeric_columns, start=1):
    smallest, largest = df[name].min(), df[name].max()
    log.info(f'{position}. {name:<20} | Min: {smallest:<7} | Max: {largest}')
log.info(rule)

`Categorical Columns`

In [None]:
# Gather the names of all non-numeric columns and log them between two rules.
log.info('CATEGORICAL COLUMNS')
categorical_cols = df.select_dtypes(exclude='number').columns.tolist()
for line in ('=' * 30, categorical_cols, '=' * 30):
    log.info(line)

In [None]:
# One log line per categorical column: distinct-value count plus up to six sample values.
rule = '=' * 30
log.info(rule)
for position, name in enumerate(categorical_cols, start=1):
    distinct = df[name].unique()
    log.info(f'{position}. {name:<15} | Unique : {df[name].nunique()} | Examples: {distinct[:6]}')
log.info(rule)

`Missing Values`

In [None]:
# Count nulls per column, keep only the affected columns (largest first),
# and express each count as a percentage of all rows.
null_counts = df.isna().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)
missing_pct = missing / len(df) * 100

missing_df = pd.DataFrame(
    {
        'missing_values': missing,
        'missing_pct': missing_pct.round(2),
    }
)

for line in ('MISSING VALUES', '=' * 30, missing_df, '=' * 30):
    log.info(line)

`Duplicate Values`

In [None]:
# Rows that repeat an earlier row in full, and how many of them there are.
duplicates = df.loc[df.duplicated()]
log.info('DUPLICATED VALUES')
log.info('=' * 30)
log.info(f'Number of duplicated values : {duplicates.shape[0]}')
log.info('=' * 30)

`Handling Outlier Values`

In [None]:
# Tukey's 1.5*IQR rule: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are counted as outliers.
log.info('OUTLIER SUMMARY')
log.info('=' * 35)
# ignore customerID column - just an identifier
for i, col in enumerate(numeric_columns[1:], start=1):
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    too_low = df[col] < lower_bound
    too_high = df[col] > upper_bound
    outliers = df[too_low | too_high]
    log.info(f'{i}. {col:<20} | Number of outliers : {len(outliers):<2} | Range ({lower_bound} - {upper_bound})')
log.info('=' * 35)

$Visualizations$

`Univariate Analysis`

In [None]:
# import matplotlib
# matplotlib.use('agg')

In [None]:
# Notebook-wide plotting look: matplotlib's seaborn dark-grid style sheet
# plus seaborn's 'notebook' scaling context.
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_context('notebook')

In [None]:
# Histogram (with KDE) of every numeric feature on a three-column grid.
n_cols = len(numeric_cols[1:])  # number of features to plot (first numeric column skipped)
n_rows = max(1, (n_cols + 2) // 3)  # ceil(n_cols / 3), but never an empty grid

# squeeze=False always yields a 2-D array of Axes, so flattening works for any grid size.
fig, axes = plt.subplots(n_rows, 3, figsize=(15,10), squeeze=False)
axes = axes.flatten()

for idx, col in enumerate(numeric_cols[1:]):
    try:
        sns.histplot(data=df, x=col, kde=True, ax=axes[idx],color='purple', alpha=0.7)
        axes[idx].set_title(f'Distribution of {col}')
        axes[idx].set_ylabel('Frequency')
        axes[idx].grid(True, alpha=0.3)
    except Exception as e:
        # Keep going: one unplottable column should not lose the other panels.
        log.exception(f'Error plotting distribution: {e}')

# Hide the unused panels of the last row.
for idx in range(n_cols, len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()
plt.show()  # render before closing, otherwise the figure is never displayed
plt.close()


In [None]:
log.info('='*50)
log.info('NUMERIC COLUMNS - VISUALIZATIONS')
log.info('='*50)

# Histogram (with KDE) per numeric feature; the first numeric column is skipped
# because this notebook treats it as an identifier.
plot_cols = numeric_columns[1:]
# Keep the original two rows, but grow the grid when there are more than six
# features: plt.subplot raises ValueError for a position beyond the grid.
grid_rows = max(2, (len(plot_cols) + 2) // 3)

plt.figure(figsize=(18, 5 * grid_rows))
for i, col in enumerate(plot_cols, 1):
    plt.subplot(grid_rows, 3, i)
    sns.histplot(data=df, x=col, color='purple', alpha=0.7,kde=True)
    plt.title(f'Distribution of {col}', fontsize=13, fontweight='bold')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)

plt.tight_layout()  # stop titles and axis labels of neighbouring panels overlapping
plt.show()
log.info('Distribution of numeric columns plotted!')
plt.close()

In [None]:
log.info('='*50)
log.info('BOXPLOTS - OUTLIER DETECTION')
log.info('='*50)

# One boxplot per numeric feature; the first numeric column is skipped
# because this notebook treats it as an identifier.
plot_cols = numeric_columns[1:]
# Keep the original two rows, but grow the grid when there are more than six
# features: plt.subplot raises ValueError for a position beyond the grid.
grid_rows = max(2, (len(plot_cols) + 2) // 3)

plt.figure(figsize=(18, 5 * grid_rows))
for i, col in enumerate(plot_cols, 1):
    plt.subplot(grid_rows, 3, i)
    sns.boxplot(data=df, y=col, color='gold', linecolor='black')
    plt.title(f'Boxplot - {col}', fontsize=13, fontweight='bold')
    plt.grid(True, alpha=0.3)

plt.tight_layout()  # stop titles and axis labels of neighbouring panels overlapping
plt.show()
log.info('Boxplots of numeric columns plotted!')
plt.close()

In [None]:
log.info('='*50)
log.info('CATEGORICAL COLUMNS')
log.info('='*50)

# One count plot per categorical column, each bar labelled with its count.
# Keep the original two rows, but grow the grid when there are more than six
# columns: plt.subplot raises ValueError for a position beyond the grid.
grid_rows = max(2, (len(categorical_cols) + 2) // 3)

plt.figure(figsize=(15, 5 * grid_rows))
for i, col in enumerate(categorical_cols, 1):
    plt.subplot(grid_rows, 3, i)
    ax = sns.countplot(data=df, x=col, gap=0.4, width=0.5, saturation=0.8, color='green')
    for container in ax.containers:
        ax.bar_label(container, label_type='edge')
    ax.set_title(f'{col}')
    ax.set_ylabel('Frequency')
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
log.info('Categorical columns plotting successful')
plt.close()

`Multivariate Analysis`

In [None]:
for line in ('=' * 50, 'CORRELATION HEATMAP', '=' * 50):
    log.info(line)

# Spearman rank correlation between the numeric columns, drawn as an annotated heatmap.
corr = df.corr(method='spearman', numeric_only=True)

plt.figure(figsize=(15, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='Blues')
plt.title('Correlation heatmap', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
log.info('Heatmap successfully plotted')
plt.close()

In [None]:
for line in ('=' * 30, 'TARGET VARIABLE', '=' * 30):
    log.info(line)

# Class balance of the target: one log line per Loan_Status value with its row count.
group_values = df['Loan_Status'].value_counts().to_dict()
for group in group_values:
    log.info(f'{group} : {group_values[group]}')

`Confidence Interval`

In [None]:
from scipy import stats
def calculate_ci(data, confidence=0.95):
    """
    Calculate a Student-t confidence interval for the mean of a numerical feature.

    Missing values (NaN) are ignored, so callers do not have to drop them first.

    Args:
        data: array-like numerical data
        confidence: confidence level strictly between 0 and 1 (default 0.95 for 95% CI)

    Returns:
        dict with mean, lower_bound, upper_bound, margin_of_error, sample_size.
        sample_size counts the non-missing values. With fewer than two values
        the interval is undefined and the bounds and margin are NaN.

    Raises:
        ValueError: if confidence is not strictly between 0 and 1
            (e.g. 95 passed instead of 0.95).
    """
    if not 0 < confidence < 1:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    values = np.asarray(data, dtype=float).ravel()
    values = values[~np.isnan(values)]
    n = int(values.size)

    mean = values.mean() if n else np.nan
    if n < 2:
        # The standard error needs at least two observations.
        margin = np.nan
    else:
        se = stats.sem(values)  # standard error
        # two-sided critical value of the t distribution with n - 1 degrees of freedom
        margin = se * stats.t.ppf((1 + confidence) / 2, n - 1)

    return {
        'mean': mean,
        'lower_bound': mean - margin,
        'upper_bound': mean + margin,
        'margin_of_error': margin,
        'sample_size': n
    }

log.info('='*60)
log.info('CONFIDENCE INTERVALS FOR NUMERICAL FEATURES')
log.info('='*60)

# 95% CI of the mean for every numeric column except the identifier
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'Customer_ID']  # exclude ID

for col in numeric_cols:
    ci = calculate_ci(df[col].dropna())
    log.info(f"\n{col}:")
    log.info(f"  Mean: {ci['mean']:.2f}")
    log.info(f"  95% CI: [{ci['lower_bound']:.2f}, {ci['upper_bound']:.2f}]")
    # plus-minus sign (the previous text was a mis-decoded "±")
    log.info(f"  Margin of Error: ±{ci['margin_of_error']:.2f}")


In [None]:
from scipy.stats import mannwhitneyu, ttest_ind
def compare_groups_ttest(df, numeric_col, grouping_col, group1_val, group2_val, random_state=0):
    """
    Compare a numeric column between two groups with a two-sided test.

    H0 : Average of group1 = Average of group2
    H1 : Average of group1 != Average of group2

    Each group is checked for normality with Shapiro-Wilk (on at most 5000
    sampled values). If both groups look normal, Welch's t-test is used;
    otherwise the Mann-Whitney U test is used. Groups with fewer than three
    values cannot be tested for normality and take the non-parametric path.

    Args:
        df: DataFrame holding both columns
        numeric_col: name of the numeric column to compare
        grouping_col: name of the column that defines the groups
        group1_val: value of grouping_col selecting the first group
        group2_val: value of grouping_col selecting the second group
        random_state: seed for the normality-check sample, so repeated runs agree

    Returns:
        dict with test_type, group1_mean, group2_mean, group1_std, group2_std,
        statistic, p_value, significant (p_value < 0.05) and effect_size
        (absolute mean difference over the root mean of the two variances).

    Raises:
        ValueError: if either group has no non-missing values.
    """
    group1 = df.loc[df[grouping_col] == group1_val, numeric_col].dropna()
    group2 = df.loc[df[grouping_col] == group2_val, numeric_col].dropna()

    if group1.empty or group2.empty:
        raise ValueError(
            f"No non-missing '{numeric_col}' values for "
            f"{grouping_col} == {group1_val!r} or {grouping_col} == {group2_val!r}"
        )

    def _looks_normal(values):
        # Shapiro-Wilk needs at least three observations.
        if len(values) < 3:
            return False
        sample = values.sample(min(5000, len(values)), random_state=random_state)
        return stats.shapiro(sample)[1] >= 0.05

    if _looks_normal(group1) and _looks_normal(group2):
        statistic, p_value = ttest_ind(group1, group2, equal_var=False)
        test_type = "Welch's t-test"
    else:
        statistic, p_value = mannwhitneyu(group1, group2, alternative='two-sided')
        test_type = 'mannwhitneyu (non-parametric)'

    mean1, mean2 = group1.mean(), group2.mean()
    std1, std2 = group1.std(), group2.std()

    return {
        'test_type' : test_type,
        'group1_mean' : mean1,
        'group2_mean' : mean2,
        'group1_std' : std1,
        'group2_std' : std2,
        'statistic' : statistic,
        'p_value' : p_value,
        'significant' : p_value < 0.05,
        'effect_size' : abs(mean1 - mean2) / np.sqrt((std1**2 + std2**2) / 2)
    }

target_value = 'Loan_Status'
# Compare every numeric feature between approved and rejected loans.
# The identifier is excluded by name rather than by slicing off the first
# element: if numeric_cols has already had 'Customer_ID' removed, slicing
# would silently drop a real feature.
for col in [c for c in numeric_cols if c != 'Customer_ID']:
    result = compare_groups_ttest(df, col, target_value, 'Y', 'N')
    # Y = loan approved
    # N = loan rejected
    log.info(f"\n{col} - {result['test_type']}")
    log.info(f"Approved mean: {result['group1_mean']:.2f} +/- ({result['group1_std']:.2f})")
    log.info(f"Rejected mean: {result['group2_mean']:.2f} +/- ({result['group2_std']:.2f})")
    log.info(f"Test statistic: {result['statistic']:.2f}")
    # four decimals: with two, small p-values print as "0.00" and 0.049 as "0.05"
    log.info(f"P-Value : {result['p_value']:.4f}")
    log.info(f"Effect size (Cohen's d) : {result['effect_size']:.3f}")

    if result['significant']:
        log.info(f"SIGNIFICANT difference: p_value = {result['p_value']:.4f}  < 0.05")
    else:
        log.info(f"NO significant difference: p_value = {result['p_value']:.4f} >= 0.05")

In [None]:
from scipy.stats import chi2_contingency
def chi_square_test(df, cat_cols, target_col):
    """Perform a chi-square test of independence between one feature and the target.

    H0 : Categorical feature is independent of target
    H1 : Categorical feature is associated with target

    Args:
        df: DataFrame holding both columns
        cat_cols: name of the categorical column to test
        target_col: name of the target column

    Returns:
        dict with chi2, p_value, dof, cramers_v (effect size), significant
        (p_value < 0.05) and contigency_table (the cross-tabulation used).
        cramers_v is NaN when either variable has a single level, because
        the statistic is undefined for such a table.
    """
    contigency_table = pd.crosstab(df[cat_cols], df[target_col])
    chi2, p_value, dof, expected = chi2_contingency(contigency_table)

    # Cramer's V for effect size: sqrt(chi2 / (n * (min(rows, cols) - 1)))
    n = contigency_table.sum().sum()
    min_dim = min(contigency_table.shape) - 1
    if min_dim > 0 and n > 0:
        cramers_v = np.sqrt(chi2 / (n * min_dim))
    else:
        # A one-row or one-column table would divide by zero.
        cramers_v = float('nan')

    return {
        'chi2' : chi2,
        'p_value' : p_value,
        'dof' : dof,
        'cramers_v' : cramers_v,
        'significant' : p_value < 0.05,
        'contigency_table' : contigency_table
    }

log.info('='*60)
log.info('CHI-SQUARE TESTS: CATEGORICAL FEATURES vs TARGET')
log.info('='*60)


# Test every categorical column except the target itself.
cat_cols = [col for col in categorical_cols if col != target_value]

for col in cat_cols:
    result = chi_square_test(df, col, target_value)

    log.info(f"\n{col}")
    log.info(f"Chi square : {result['chi2']:.4f}")
    log.info(f"p_value : {result['p_value']:.4f}")
    log.info(f"Cramer's V : {result['cramers_v']:.4f}")

    # four decimals, matching the p_value line above
    if result['significant']:
        log.info(f"SIGNIFICANT difference: p_value = {result['p_value']:.4f}  < 0.05")
    else:
        log.info(f"NO significant difference: p_value = {result['p_value']:.4f} >= 0.05")

    # Logged rather than printed, so the table also reaches the log file.
    log.info(f"\nContingency Table\n{result['contigency_table']}")

In [None]:
from scipy.stats import kruskal
def multi_group_comparison(df, grouping_col, numeric_col, random_state=0):
    """
    Compare a numerical feature across multiple categorical groups.

    H0 : All group means are equal
    H1 : At least one group mean is different

    Each group is checked for normality with Shapiro-Wilk (on at most 5000
    sampled values). If every group looks normal, one-way ANOVA is used;
    otherwise the Kruskal-Wallis H-test is used. Groups with fewer than three
    values cannot be tested for normality and take the non-parametric path.
    Groups with no non-missing values are left out of the test.

    Args:
        df: DataFrame holding both columns
        grouping_col: name of the column that defines the groups
        numeric_col: name of the numeric column to compare
        random_state: seed for the normality-check sample, so repeated runs agree

    Returns:
        dict with test_type, statistic, p_value, significant (p_value < 0.05)
        and groupby_means (mean of numeric_col per group).

    Raises:
        ValueError: if fewer than two groups have non-missing values.
    """
    grouped = df.groupby(grouping_col)[numeric_col]
    groups = [values.dropna() for _, values in grouped]
    groups = [g for g in groups if not g.empty]

    if len(groups) < 2:
        raise ValueError(
            f"Need at least two groups of '{grouping_col}' with non-missing "
            f"'{numeric_col}' values, found {len(groups)}"
        )

    def _looks_normal(values):
        # Shapiro-Wilk needs at least three observations.
        if len(values) < 3:
            return False
        sample = values.sample(min(5000, len(values)), random_state=random_state)
        return stats.shapiro(sample)[1] >= 0.05

    if all(_looks_normal(g) for g in groups):
        statistic, p_value = stats.f_oneway(*groups)
        test_type = 'one-way ANOVA test'
    else:
        statistic, p_value = kruskal(*groups)
        test_type = 'Kruskal-Wallis H-test'

    return {
        'test_type' : test_type,
        'statistic' : statistic,
        'p_value' : p_value,
        'significant' : p_value < 0.05,
        'groupby_means' : grouped.mean().to_dict()
    }

log.info('='*60)
log.info('MULTI-GROUP COMPARISONS (e.g., Income across Education Levels)')
log.info('='*60)

# Both columns must be checked explicitly: `'Education' and X` evaluates to X,
# so the previous condition never tested for the Education column at all.
if 'Education' in df.columns and 'Applicant_Income' in numeric_columns:
    result = multi_group_comparison(df, 'Education', 'Applicant_Income')

    log.info(f"\nIncome Across Education levels : {result['test_type']}")
    log.info(f"Test statistics : {result['statistic']:.2f}")
    # four decimals: with two, small p-values print as "0.00" and 0.049 as "0.05"
    log.info(f"p_value : {result['p_value']:.4f}")


    log.info('\nGroup Means')
    for group, mean in result['groupby_means'].items():
        log.info(f'{group} : {mean:.2f}')

    if result['significant']:
        log.info(f"SIGNIFICANT difference: p_value = {result['p_value']:.4f}  < 0.05")
    else:
        log.info(f"NO significant difference: p_value = {result['p_value']:.4f} >= 0.05")
