In [1]:
import pandas as pd

df = pd.read_parquet('final_dataset_enhanced.parquet', engine='fastparquet')
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,estimated_housing_payment,dti_adjusted,ltv_proxy,composite_risk_score,credit_history_length_months,ZHVI_missing,ZHVI_lag_1_missing,ZHVI_lag_3_missing,ZHVI_lag_6_missing,ZHVI_lag_12_missing
0,30000,0,22.35,1151.16,3,19,5,1,100000.0,1,...,2292.48191,57.969783,0.052345,0.4362,82,0,0,0,0,0
1,40000,1,16.14,975.71,2,13,10,1,45000.0,2,...,421.048347,61.757956,0.380004,0.3996,113,0,0,0,0,0
2,20000,0,7.56,622.68,0,2,1,1,100000.0,0,...,1577.879733,37.854557,0.050701,0.151233,237,0,0,0,0,0
3,4500,0,11.31,147.99,1,7,1,5,38500.0,0,...,1155.6257,40.659502,0.015576,0.166133,179,0,0,0,0,0
4,8425,0,27.27,345.18,4,24,3,1,450000.0,2,...,2262.750343,18.404001,0.014893,0.4579,253,0,0,0,0,0


In [2]:
df.shape

(1306387, 151)

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE

In [7]:
# Quick check to ensure no missing values remain
assert df.isnull().sum().sum() == 0, "Warning: Missing values found!"
print(" Dataset loaded and complete (0 NaN).")

 Dataset loaded and complete (0 NaN).


In [4]:
# 2. Temporal Preparation & Type Cleaning
if 'issue_d' in df.columns:
    # Convert to datetime if not already done
    df['issue_d'] = pd.to_datetime(df['issue_d'])
    
    # Chronological sort (Important for financial data)
    df = df.sort_values('issue_d')
    
    print(f"Time period covered: {df['issue_d'].min()} to {df['issue_d'].max()}")
    
    # Save the date aside for reference
    df_dates = df['issue_d']
    df_model = df.drop(columns=['issue_d'])
else:
    print(" Column 'issue_d' not found.")
    df_model = df.copy()

#  Keep only numeric columns 
# This automatically eliminates any remaining date or text columns
X_numeric = df_model.select_dtypes(include=['number'])

# Check which columns were dropped (for information)
dropped_cols = set(df_model.columns) - set(X_numeric.columns)
if len(dropped_cols) > 0:
    print(f" Non-numeric columns dropped automatically: {list(dropped_cols)}")

# 3. Split 80% Train / 20% Test (Chronological)
split_idx = int(len(X_numeric) * 0.80)

# Separate Target and Features
X = X_numeric.drop(columns=['loan_status_binary'])
y = X_numeric['loan_status_binary']

X_train = X.iloc[:split_idx].copy()
X_test = X.iloc[split_idx:].copy()
y_train = y.iloc[:split_idx].copy()
y_test = y.iloc[split_idx:].copy()

print(f"\nTrain set : {X_train.shape}")
print(f"Test set  : {X_test.shape}")

Time period covered: 2007-06-01 00:00:00 to 2018-12-01 00:00:00
 Non-numeric columns dropped automatically: ['earliest_cr_line']

Train set : (1045109, 148)
Test set  : (261278, 148)


In [5]:
from sklearn.ensemble import IsolationForest

# Outliers detection
print("Training Isolation Forest on the Train Set")

iso = IsolationForest(contamination=0.01, random_state=42, n_jobs=-1)
iso.fit(X_train)

print("Creating anomaly features")

# Calculating scores (without modifying X_train just yet)
train_scores = iso.decision_function(X_train)
test_scores = iso.decision_function(X_test)

# Calculating flags 
train_flags = iso.predict(X_train)
test_flags = iso.predict(X_test)

# Now we can add columns without problem
X_train['anomaly_score'] = train_scores
X_test['anomaly_score'] = test_scores

X_train['is_outlier'] = train_flags
X_test['is_outlier'] = test_flags

print("Features 'anomaly_score' and 'is_outlier' added successfully.")
print(f"Average anomaly score (Train): {X_train['anomaly_score'].mean():.4f}")

Training Isolation Forest on the Train Set
Creating anomaly features
Features 'anomaly_score' and 'is_outlier' added successfully.
Average anomaly score (Train): 0.1065


In [6]:
# 4. Safety Clipping (Handling Data Errors only)
# Decision Trees handle skewness well, but we remove impossible/error values
# by clipping at the 99.9th percentile.

cols_financial = ['annual_inc', 'revol_bal', 'tot_cur_bal', 'total_acc', 'loan_amnt', 'installment']

print("Applying Safety Clipping (99.9%)")

for col in cols_financial:
    if col in X_train.columns:
        # Calculate limit on Train
        upper_limit = X_train[col].quantile(0.999)
        
        # Apply to Train and Test
        X_train.loc[X_train[col] > upper_limit, col] = upper_limit
        X_test.loc[X_test[col] > upper_limit, col] = upper_limit
        
        print(f"  - {col}: clipped at {upper_limit:.2f}")

print(" Extreme anomalies handled")

Applying Safety Clipping (99.9%)
  - annual_inc: clipped at 550000.00
  - revol_bal: clipped at 265443.78
  - tot_cur_bal: clipped at 1198550.51
  - total_acc: clipped at 79.00
  - loan_amnt: clipped at 40000.00
  - installment: clipped at 1321.87
 Extreme anomalies handled


  X_train.loc[X_train[col] > upper_limit, col] = upper_limit
  X_test.loc[X_test[col] > upper_limit, col] = upper_limit


In [7]:
# 5. Drop Highly Correlated Features (> 0.95)
# Removing redundancy speeds up training and makes the model lighter
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

print(f"\nDropping {len(to_drop)} redundant features")
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

print(f"Features remaining: {X_train.shape[1]}")


Dropping 45 redundant features
Features remaining: 105


In [8]:
# Infinite Value Correction 
# Division by zero during Feature Engineering may have created 'inf' values.

print("Checking and correcting infinite values")

def replace_infinite_values(df):
    # Count infinite values
    inf_count = np.isinf(df.select_dtypes(include=np.number)).sum().sum()
    
    if inf_count > 0:
        print(f"   {inf_count} infinite values detected. Replacing...")
        
        # Replace +/- inf with NaN
        df = df.replace([np.inf, -np.inf], np.nan)
        
        # Replace NaN with the column's max value
        for col in df.columns:
            if df[col].isnull().any():
                # Fill with the maximum observed value
                fill_val = df[col].max()
                df[col] = df[col].fillna(fill_val)
                print(f"    - Column {col} corrected (replaced by {fill_val:.2f})")
    else:
        print("No infinite values found.")
        
    return df

# Apply to Train and Test sets
X_train = replace_infinite_values(X_train)
X_test = replace_infinite_values(X_test)

print("Data ready for Scaling.")

Checking and correcting infinite values
   22 infinite values detected. Replacing...
    - Column dti_adjusted corrected (replaced by 874209.42)
   280 infinite values detected. Replacing...
    - Column dti_adjusted corrected (replaced by 5476278.84)
Data ready for Scaling.


In [9]:
# 6. Scaling (RobustScaler)
# Critical for SMOTE to calculate distances correctly
scaler = RobustScaler()

# Fit on Train, Transform on Train and Test
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

print(" Data Scaled.")

 Data Scaled.


In [10]:
from imblearn.over_sampling import SMOTE

# 7. SMOTE (On train only)
print(f"\nTarget distribution before SMOTE:\n{y_train.value_counts(normalize=True)}")

# Strategy 0.5 is good for Trees 
smote = SMOTE(sampling_strategy=0.5, random_state=42)

print("Running SMOTE")
X_train_final, y_train_final = smote.fit_resample(X_train_scaled, y_train)

print(f"Target distribution after SMOTE:\n{y_train_final.value_counts(normalize=True)}")
print(f"Final Train shape: {X_train_final.shape}")


Target distribution before SMOTE:
loan_status_binary
0    0.803019
1    0.196981
Name: proportion, dtype: float64
Running SMOTE
Target distribution after SMOTE:
loan_status_binary
0    0.666667
1    0.333333
Name: proportion, dtype: float64
Final Train shape: (1258863, 105)


In [20]:
# 8. Save for Modeling
import os
if not os.path.exists('data'):
    os.makedirs('data')

# Save features (X) in Parquet for speed
X_train_final.to_parquet('C:\\ESILV A4\\ESILV A4 DIA\\Machine Learning\\Projet ML\\X_train_tree_ready.parquet', engine='fastparquet')
X_test_scaled.to_parquet('C:\\ESILV A4\\ESILV A4 DIA\\Machine Learning\\Projet ML\\X_test_tree_ready.parquet', engine='fastparquet')

# Save targets (y) in CSV 
y_train_final.to_csv('C:\\ESILV A4\\ESILV A4 DIA\\Machine Learning\\Projet ML\\y_train_tree_ready.csv', index=False)
y_test.to_csv('C:\\ESILV A4\\ESILV A4 DIA\\Machine Learning\\Projet ML\\y_test_tree_ready.csv', index=False)

print("\n Data ready for training")
print("Files created ")


 Data ready for training
Files created 
