# 2. Logistic Regression

We load and display the output of the data preparation section, then continue to our first model, which is based on logistic regression. (See the notebook "loan_default_data_preparation" for details on the data generation and distributions.)


In [2]:
import pandas as pd
import numpy as np
import pickle
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Read the pickled bundle produced by the data preparation step.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
with open('data/loan_data.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

# Pull the individual pieces out of the bundle in a single unpacking step
df, metadata_df, numeric_features, categorical_features, target_feature = (
    loaded_data[key]
    for key in (
        'df',
        'metadata_df',
        'numeric_features',
        'categorical_features',
        'target_feature',
    )
)

# Metadata describing the synthetic data distribution:
# left-align both the body cells and the header cells for readability
styled_df = (
    metadata_df.style
    .set_properties(**{'text-align': 'left'})
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])
)

# Render the styled table in the notebook
display(styled_df)




Unnamed: 0,Feature,Skewness Description,Mean/Mode Assumptions and Distribution Details
0,Income,"Positively skewed. Higher concentration of individuals with lower incomes, with fewer high-income earners.","Mean: €4000, Distribution: Log-normal with σ = 0.5"
1,Loan_Amount,"Positively skewed. Loan amounts are concentrated towards lower to moderate values, with fewer large loan amounts.","Mean: €175,000, Distribution: Log-normal with σ = 0.5"
2,Credit_Score,"Negatively skewed. Most people have good to excellent credit scores, with fewer individuals having very low credit scores.","Mean: 700, Mode: 750, Distribution: Normal (reversed) with μ = 100 and σ = 50"
3,Employment_Status,"Binary distribution. Higher proportion of the population being employed, heavily weighted towards employment.","Mode: Employed, Distribution: Binary with p = 0.9"
4,Debt_to_Income,"Positively skewed. More individuals with lower ratios, but significant instances of high debt relative to income.","Mean: 0.6, Distribution: Beta (α = 2, β = 5) scaled to [0, 1.2]"
5,Loan_Term,Discrete uniform. Distributed across specific intervals with peaks at common loan terms like 20 and 30 years.,"Mode: 20 years, Distribution: Discrete with p = [0.1, 0.2, 0.3, 0.1, 0.3]"
6,Age,"Approximately normal. Centered around peak working ages (30-45 years), with fewer young and old applicants.","Mean: 35, Distribution: Normal with μ = 35 and σ = 10, clipped to [18, 75]"
7,Home_Ownership,"Binary distribution. Higher proportion of the population owning homes, heavily weighted towards ownership.","Mode: Own, Distribution: Binary with p = 0.7"
8,Default,"Positively skewed. Default rates are typically low, with a small percentage representing defaults.","Mean: 0.15, Distribution: Top 15% of risk scores"


In [3]:
# Column order: numeric inputs first, then categorical inputs, then the target
all_features = [*numeric_features, *categorical_features, target_feature]

# Restrict the dataframe to the modelling columns only
df_filtered = df[all_features]

# Preview the first rows
df_filtered.head()

Unnamed: 0,Income,Loan_Amount,Credit_Score,Debt_to_Income,Age,Employment_Status,Home_Ownership,Loan_Term,Default
0,5127.670373,183807.505965,848.470271,0.331069,39,Employed,Own,30,0
1,3732.813394,169479.480329,850.0,0.193608,21,Employed,Rent,30,0
2,5529.728012,281654.558991,850.0,0.463659,44,Employed,Own,25,0
3,8566.072041,376606.751901,850.0,0.433488,44,Employed,Rent,15,0
4,3558.067903,246708.985583,850.0,0.222574,35,Employed,Own,20,0


### 2.2 Form logistic regression model 
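We prepare our data (encoding the categorical features, standardizing the numeric features, and splitting into training, validation and test sets) and then train our model.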

In [4]:
random_seed = 42

# Separate features (X) and target variable (y).
# `drop` returns a new frame, so the column assignments below do not touch df_filtered.
X = df_filtered.drop(columns=[target_feature])
y = df_filtered[target_feature]

# Encode each categorical feature with its OWN LabelEncoder and keep the fitted
# encoders, so the integer codes can be mapped back to the original categories
# later (e.g. feature_encoders[col].inverse_transform(...)). Re-fitting a single
# shared encoder would overwrite the mapping of every previous column.
feature_encoders = {}
for col in categorical_features:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

# Encode the target variable; `le` is the encoder fitted on the target
le = LabelEncoder()
y = le.fit_transform(y)

# Standardize the numeric features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split, so validation and test statistics leak into the
# scaling. Consider fitting the scaler on the training split only.
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

# Summary statistics of the prepared feature matrix
X.describe()

Unnamed: 0,Income,Loan_Amount,Credit_Score,Debt_to_Income,Age,Employment_Status,Home_Ownership,Loan_Term
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2.042633e-16,1.628225e-16,3.669021e-16,2.586731e-16,-2.006066e-15,0.10184,0.29772,2.09972
std,1.00001,1.00001,1.00001,1.00001,1.00001,0.302441,0.45726,1.131778
min,-1.703831,-1.695929,-13.29898,-1.782191,-1.930608,0.0,0.0,0.0
25%,-0.6985857,-0.6925444,0.3191564,-0.7797536,-0.7193243,0.0,0.0,1.0
50%,-0.2190669,-0.2197733,0.3191564,-0.1279533,-0.05862394,0.0,0.0,2.0
75%,0.4488692,0.4417927,0.3191564,0.6476407,0.7121931,0.0,1.0,3.0
max,13.71765,9.325957,0.3191564,4.06083,4.235928,1.0,1.0,4.0


In [5]:
# The two fractions below give a 60% / 20% / 20% train / validation / test split
test_fraction = 0.2   # share of the full dataset held out for testing
val_fraction = 0.25   # share of the remaining 80%, i.e. 20% of the full dataset

# First cut: hold out the test set, stratified on the target
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=random_seed,
    stratify=y,
)

# Second cut: divide the remainder into training and validation sets,
# again stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=val_fraction,
    random_state=random_seed,
    stratify=y_train_val,
)

In [6]:
# Fit a logistic regression classifier (scikit-learn default settings) on the training set;
# `fit` returns the estimator itself, so construction and training can be chained
model = LogisticRegression().fit(X_train, y_train)
model

### 2.3 Evaluate logistic regression model 
We evaluate our model on the validation set.

In [7]:
# Hard class predictions for the validation set
y_pred = model.predict(X_val)

# Class probabilities for the validation set; column 1 is the positive class (default)
val_proba = model.predict_proba(X_val)
y_pred_proba = val_proba[:, 1]

# Score the hard predictions against the validation labels
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
report = classification_report(y_true=y_val, y_pred=y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:\n', report)

Accuracy: 0.8627
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92      8388
           1       0.67      0.29      0.40      1612

    accuracy                           0.86     10000
   macro avg       0.78      0.63      0.66     10000
weighted avg       0.84      0.86      0.84     10000

