In [1]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:


# Load data
df = pd.read_csv("../data/processed/accepted_cleaned.csv")
print(df.columns)

df['interest_rate'] = pd.to_numeric(df['int_rate'].astype(str).str.replace('%', ''), errors='coerce') / 100

# Add log_total_balance if missing
if 'log_total_balance' not in df.columns and 'total_balance' in df.columns:
    df['log_total_balance'] = np.log1p(df['total_balance'])

# Drop rows with missing values
required_columns = [
    'credit_utilization', 'credit_history_years', 'total_delinquency',
    'recent_inquiries', 'credit_mix', 'log_total_balance',
    'fico_score', 'interest_rate'
]
df = df.dropna(subset=required_columns)

# Features and targets
X = df[['credit_utilization', 'credit_history_years', 'total_delinquency',
        'recent_inquiries', 'credit_mix', 'log_total_balance']]
y = df[['fico_score', 'interest_rate']]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Multi-task Ridge regression
model = MultiOutputRegressor(Ridge(alpha=1.0))
model.fit(X_scaled, y)
y_pred = model.predict(X_scaled)

# Evaluation
for i, target in enumerate(['fico_score', 'interest_rate']):
    print(f"\nTarget: {target}")
    print(f"MAE: {mean_absolute_error(y.iloc[:, i], y_pred[:, i]):.4f}")
    print(f"R²: {r2_score(y.iloc[:, i], y_pred[:, i]):.4f}")


Index(['fico_score', 'credit_utilization', 'credit_history_years', 'int_rate',
       'total_delinquency', 'recent_inquiries', 'credit_mix', 'total_balance'],
      dtype='object')

Target: fico_score
MAE: 20.3531
R²: 0.3737

Target: interest_rate
MAE: 0.0350
R²: 0.1360


In [3]:

# Load raw data
df = pd.read_csv("../data/accepted_2007_to_2018Q4.csv", low_memory=False)

# --- Select required columns ---
keep_cols = [
    'fico_range_low', 'fico_range_high', 'int_rate', 'revol_util',
    'earliest_cr_line', 'issue_d', 'inq_last_6mths', 'delinq_2yrs',
    'open_acc', 'loan_amnt', 'term', 'grade', 'sub_grade',
    'annual_inc', 'dti', 'emp_length', 'purpose', 'home_ownership'
]
df = df[keep_cols].copy()

# --- Clean and engineer features ---
df['fico_score'] = (df['fico_range_low'] + df['fico_range_high']) / 2
df['interest_rate'] = pd.to_numeric(df['int_rate'].astype(str).str.replace('%', ''), errors='coerce') / 100
df['credit_utilization'] = pd.to_numeric(df['revol_util'].astype(str).str.replace('%', ''), errors='coerce') / 100

# Dates and credit history
df['issue_d'] = pd.to_datetime(df['issue_d'], errors='coerce')
df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], errors='coerce')
df['credit_history_years'] = ((df['issue_d'] - df['earliest_cr_line']).dt.days / 365).round(2)

# Rename for consistency
df.rename(columns={
    'inq_last_6mths': 'recent_inquiries',
    'delinq_2yrs': 'total_delinquency',
    'open_acc': 'credit_mix'
}, inplace=True)

# Employment length (clean and encode as number of years)
df['emp_length'] = df['emp_length'].str.extract('(\d+)').astype(float)

# Log transform income
df['log_annual_inc'] = np.log1p(df['annual_inc'])

# Log transform loan amount
df['log_loan_amnt'] = np.log1p(df['loan_amnt'])

# Log transform DTI
df['log_dti'] = np.log1p(df['dti'])

# Drop rows with key missing values
final_cols = [
    'fico_score', 'interest_rate', 'credit_utilization', 'credit_history_years',
    'total_delinquency', 'recent_inquiries', 'credit_mix', 'log_loan_amnt',
    'log_annual_inc', 'log_dti', 'term', 'grade', 'sub_grade', 'purpose', 'home_ownership'
]
df = df.dropna(subset=final_cols)

# Save for next step
df.to_csv("../data/processed/accepted_enriched.csv", index=False)
print(f"Saved {len(df)} enriched records to accepted_enriched.csv")


  df['issue_d'] = pd.to_datetime(df['issue_d'], errors='coerce')
  df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], errors='coerce')
  result = getattr(ufunc, method)(*inputs, **kwargs)


Saved 2257158 enriched records to accepted_enriched.csv


In [4]:



# Load enriched dataset
df = pd.read_csv("../data/processed/accepted_enriched.csv")

# Define feature sets
numerical_features = [
    'credit_utilization', 'credit_history_years',
    'total_delinquency', 'recent_inquiries', 'credit_mix',
    'log_loan_amnt', 'log_annual_inc', 'log_dti'
]
categorical_features = ['term', 'grade', 'sub_grade', 'purpose', 'home_ownership']



# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
])

# Ridge regression inside multi-output wrapper
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', MultiOutputRegressor(Ridge(alpha=1.0)))
])

# Drop rows with infinite values (from log transforms, etc.)
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with any missing values in features or targets
X = df[numerical_features + categorical_features]
df = df.dropna(subset=numerical_features + categorical_features + ['fico_score', 'interest_rate'])

# Recreate X and y after dropping
X = df[numerical_features + categorical_features]
y = df[['fico_score', 'interest_rate']]

model.fit(X, y)

# Predict
y_pred = model.predict(X)

# Evaluation
print("Multi-Task Ridge (Enhanced Features)")
for i, target in enumerate(['fico_score', 'interest_rate']):
    mae = mean_absolute_error(y.iloc[:, i], y_pred[:, i])
    r2 = r2_score(y.iloc[:, i], y_pred[:, i])
    print(f"\nTarget: {target}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")


Multi-Task Ridge (Enhanced Features)

Target: fico_score
MAE: 18.8510
R²: 0.4607

Target: interest_rate
MAE: 0.0070
R²: 0.9558
