In [None]:
import pandas as pd

# Load the training data and discard the "id" column when the file has one.
# errors="ignore" turns the drop into a no-op if there is no such column.
df = pd.read_csv("IDS_2025_project/train.csv").drop(columns=["id"], errors="ignore")

# Preview the first rows
df.head()


Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [11]:
# Groupby explorations – average loan_paid_back by grade_subgrade
#
# Despite the "default_" prefix, the Series holds the mean of loan_paid_back
# per group, so larger values mean a larger share of repaid loans.

# Step 1: mean of the target within each grade_subgrade
default_by_grade = df.groupby("grade_subgrade")["loan_paid_back"].mean()
# Step 2: order the groups from the highest mean to the lowest
default_by_grade = default_by_grade.sort_values(ascending=False)

print("Average loan_paid_back by grade_subgrade:")
display(default_by_grade)


Average loan_paid_back by grade_subgrade:


grade_subgrade
A4    0.957084
A3    0.955470
A2    0.952924
A1    0.952500
A5    0.944962
B3    0.940040
B2    0.937430
B5    0.934204
B4    0.931758
B1    0.916341
C1    0.860090
C2    0.851165
C5    0.846259
C4    0.843987
C3    0.836000
D1    0.731886
D2    0.720957
D4    0.714733
D5    0.713000
D3    0.695972
E5    0.669461
E2    0.662743
E1    0.652010
E4    0.649577
E3    0.641837
F5    0.639314
F4    0.637037
F1    0.624503
F2    0.617721
F3    0.604093
Name: loan_paid_back, dtype: float64

In [12]:
# Groupby explorations – average loan_paid_back by loan_purpose
#
# Despite the "default_" prefix, the Series holds the mean of loan_paid_back
# per group, so larger values mean a larger share of repaid loans.

# Group the target column by the loan_purpose labels, average, sort descending
paid_back = df["loan_paid_back"]
default_by_purpose = paid_back.groupby(df["loan_purpose"]).mean().sort_values(ascending=False)

print("Average loan_paid_back by loan_purpose:")
display(default_by_purpose)


Average loan_paid_back by loan_purpose:


loan_purpose
Home                  0.823224
Business              0.813104
Other                 0.802377
Car                   0.800630
Debt consolidation    0.796911
Vacation              0.796071
Medical               0.778085
Education             0.777053
Name: loan_paid_back, dtype: float64

In [13]:
# Groupby explorations – average loan_paid_back by employment_status
#
# Despite the "default_" prefix, the Series holds the mean of loan_paid_back
# per group, so larger values mean a larger share of repaid loans.

# Aggregate the target with "mean" per employment_status, highest mean first
default_by_employment = (
    df.groupby("employment_status")["loan_paid_back"].agg("mean").sort_values(ascending=False)
)

print("Average loan_paid_back by employment_status:")
display(default_by_employment)


Average loan_paid_back by employment_status:


employment_status
Retired          0.997204
Self-employed    0.898457
Employed         0.894145
Student          0.263515
Unemployed       0.077619
Name: loan_paid_back, dtype: float64

In [15]:
# Feature preparation – define target and one-hot encode categorical features
#
# NOTE(review): the recorded output of this cell lists an `income_bucket`
# column and 12 feature columns, but no cell visible here creates that column
# (execution count 14 is missing). Confirm the cell that builds it still
# exists and runs before this one, otherwise the output will not reproduce.

# Target variable
target_col = "loan_paid_back"

# Separate the feature matrix from the target series
X = df.drop(columns=target_col)
y = df[target_col]

# Find the categorical columns (object / category dtype)
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
print("Categorical columns:", categorical_cols.tolist())

# One-hot encode the categorical features; drop_first=True leaves out the
# first level of every encoded column
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Shape before encoding:", X.shape)
print("Shape after encoding:", X_encoded.shape)

X_encoded.head()


Categorical columns: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade', 'income_bucket']
Shape before encoding: (593994, 12)
Shape after encoding: (593994, 57)


Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender_Male,gender_Other,marital_status_Married,marital_status_Single,marital_status_Widowed,...,grade_subgrade_E4,grade_subgrade_E5,grade_subgrade_F1,grade_subgrade_F2,grade_subgrade_F3,grade_subgrade_F4,grade_subgrade_F5,income_bucket_medium,income_bucket_high,income_bucket_very_high
0,29367.99,0.084,736,2528.42,13.67,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1,22108.02,0.166,636,4593.1,12.92,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,49566.2,0.097,694,17005.15,9.76,True,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
3,46858.25,0.065,533,4682.48,16.1,False,False,False,True,False,...,False,False,True,False,False,False,False,False,True,False
4,25496.7,0.053,665,12184.43,10.21,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train/validation split
# Hold out 20% of the rows for validation; stratify=y keeps the distribution
# of the target the same in both parts, and the seed fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)

# transform() returns a plain array, so re-wrap it with the original
# row index and column labels.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), index=X_train.index, columns=X_train.columns
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val), index=X_val.index, columns=X_val.columns
)

X_train_scaled.head()



Train shape: (475195, 57)
Validation shape: (118799, 57)


Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender_Male,gender_Other,marital_status_Married,marital_status_Single,marital_status_Widowed,...,grade_subgrade_E4,grade_subgrade_E5,grade_subgrade_F1,grade_subgrade_F2,grade_subgrade_F3,grade_subgrade_F4,grade_subgrade_F5,income_bucket_medium,income_bucket_high,income_bucket_very_high
100143,1.653729,-0.783203,-0.810693,-0.094691,0.708702,1.045745,-0.07956,1.069645,-0.973669,-0.106466,...,-0.11674,-0.101557,-0.097172,-0.09384,-0.093291,-0.096674,-0.100526,-0.576505,-0.578174,1.73234
560097,-0.724602,0.019975,-0.684357,-0.945836,1.017512,-0.956256,-0.07956,1.069645,-0.973669,-0.106466,...,-0.11674,-0.101557,-0.097172,-0.09384,-0.093291,-0.096674,-0.100526,1.734589,-0.578174,-0.577254
356847,-0.328959,-1.162887,-0.160962,-1.064718,-0.182859,1.045745,-0.07956,1.069645,-0.973669,-0.106466,...,-0.11674,-0.101557,-0.097172,-0.09384,-0.093291,-0.096674,-0.100526,1.734589,-0.578174,-0.577254
150444,0.876141,1.275854,-0.630212,-1.14004,-0.083244,-0.956256,-0.07956,-0.93489,1.027043,-0.106466,...,-0.11674,-0.101557,-0.097172,-0.09384,-0.093291,-0.096674,-0.100526,-0.576505,-0.578174,1.73234
550361,0.77762,0.954582,0.48877,-0.630108,1.361186,1.045745,-0.07956,1.069645,-0.973669,-0.106466,...,-0.11674,-0.101557,-0.097172,-0.09384,-0.093291,-0.096674,-0.100526,-0.576505,-0.578174,1.73234


In [None]:
# LogisticRegression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Logistic-regression baseline on the standardized features; fit() returns
# the estimator itself, so the fitted model is bound directly.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Second column of predict_proba = probability of the class at log_reg.classes_[1]
y_val_proba_lr = log_reg.predict_proba(X_val_scaled)[:, 1]

# Validation ROC AUC computed from those probabilities
auc_lr = roc_auc_score(y_true=y_val, y_score=y_val_proba_lr)
print("Logistic Regression AUC:", auc_lr)


Logistic Regression AUC: 0.9103395267930965


In [None]:
# RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Random forest: 200 trees, no depth limit, fixed seed; n_jobs=-1 requests
# all available processors for fitting and prediction.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, n_jobs=-1, random_state=42)
rf.fit(X_train_scaled, y_train)

# Second column of predict_proba = probability of the class at rf.classes_[1]
y_val_proba_rf = rf.predict_proba(X_val_scaled)[:, 1]

# Validation ROC AUC computed from those probabilities
auc_rf = roc_auc_score(y_true=y_val, y_score=y_val_proba_rf)
print("Random Forest AUC:", auc_rf)


Random Forest AUC: 0.9074912782191117


In [19]:
# Model comparison – LogisticRegression vs RandomForest

# Validation AUCs from the two model cells, keyed by display name
results = {"LogisticRegression": auc_lr, "RandomForest": auc_rf}

print("Validation AUC scores:")
# One "name: score" line per model, scores rounded to four decimals
print("\n".join(f"{name}: {auc:.4f}" for name, auc in results.items()))


Validation AUC scores:
LogisticRegression: 0.9103
RandomForest: 0.9075
