In [1]:
import pandas as pd

# Load dataset
data_path = r"C:\Users\GURUDEEP SINGH\OneDrive\Documents\Loan Default\Loan_default.csv"
data = pd.read_csv(data_path)

# Basic data inspection
print(data.head())
print(data.info())
print(data.describe())


       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes           Yes       Other   
1      Full-time    

In [2]:
# Check for non-numeric values in numeric columns
print(data.apply(lambda s: pd.to_numeric(s, errors='coerce')).isnull().sum())


LoanID            255347
Age                    0
Income                 0
LoanAmount             0
CreditScore            0
MonthsEmployed         0
NumCreditLines         0
InterestRate           0
LoanTerm               0
DTIRatio               0
Education         255347
EmploymentType    255347
MaritalStatus     255347
HasMortgage       255347
HasDependents     255347
LoanPurpose       255347
HasCoSigner       255347
Default                0
dtype: int64


In [3]:
# Convert columns to numeric, coercing errors to NaN
data_cleaned = data.apply(lambda s: pd.to_numeric(s, errors='coerce'))

# Check the first few rows to understand the changes
print(data_cleaned.head())

# Handle NaN values - for simplicity, we can drop them (you can use other strategies too)
data_cleaned = data_cleaned.dropna()


   LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0     NaN   56   85994       50587          520              80   
1     NaN   69   50432      124440          458              15   
2     NaN   46   84208      129188          451              26   
3     NaN   32   31713       44799          743               0   
4     NaN   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio  Education  \
0               4         15.23        36      0.44        NaN   
1               1          4.81        60      0.68        NaN   
2               3         21.17        24      0.31        NaN   
3               3          7.07        24      0.23        NaN   
4               4          6.51        48      0.73        NaN   

   EmploymentType  MaritalStatus  HasMortgage  HasDependents  LoanPurpose  \
0             NaN            NaN          NaN            NaN          NaN   
1             NaN            NaN          NaN 

In [4]:
# One-hot encoding categorical features
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Verify changes
print(data.head())


       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio  ...  \
0               4         15.23        36      0.44  ...   
1               1          4.81        60      0.68  ...   
2               3         21.17        24      0.31  ...   
3               3          7.07        24      0.23  ...   
4               4          6.51        48      0.73  ...   

   EmploymentType_Unemployed  MaritalStatus_Married  MaritalStatus_Single  \
0                      False                  False                 False   
1                      False                   True       

In [5]:
# Create new features
data['CreditScore_Bucket'] = data['CreditScore'].apply(lambda x: 'High' if x > 700 else ('Medium' if x > 600 else 'Low'))
data['Income_to_LoanAmount'] = data['Income'] / data['LoanAmount']
data['DTI_Bucket'] = data['DTIRatio'].apply(lambda x: 'Low' if x < 0.3 else ('Medium' if x < 0.6 else 'High'))

# One-hot encoding new features
data = pd.get_dummies(data, columns=['CreditScore_Bucket', 'DTI_Bucket'], drop_first=True)

# Verify changes
print(data.head())


       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio  ...  \
0               4         15.23        36      0.44  ...   
1               1          4.81        60      0.68  ...   
2               3         21.17        24      0.31  ...   
3               3          7.07        24      0.23  ...   
4               4          6.51        48      0.73  ...   

   LoanPurpose_Business  LoanPurpose_Education  LoanPurpose_Home  \
0                 False                  False             False   
1                 False                  False             False   
2       

In [6]:
from sklearn.model_selection import train_test_split

X = data.drop(['Default', 'LoanID'], axis=1)  # Excluding 'LoanID' as it's likely not useful for predictions
y = data['Default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the splits
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(204277, 29) (51070, 29) (204277,) (51070,)


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression Model
pd_model = LogisticRegression(max_iter=500)
pd_model.fit(X_train_scaled, y_train)

# Predictions and evaluation
pd_preds = pd_model.predict(X_test_scaled)
print(pd_preds[:10])  # Output the first 10 predictions


[0 0 0 0 0 0 0 0 0 0]


In [9]:
from sklearn.metrics import classification_report, accuracy_score

# PD Evaluation
print("Accuracy:", accuracy_score(y_test, pd_preds))
print("Classification Report:")
print(classification_report(y_test, pd_preds))


Accuracy: 0.8859212845114549
Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45170
           1       0.61      0.03      0.06      5900

    accuracy                           0.89     51070
   macro avg       0.75      0.52      0.50     51070
weighted avg       0.86      0.89      0.84     51070



In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Train the LGD model
lgd_model = LinearRegression()
lgd_model.fit(X_train_scaled, y_train)

# Predictions and evaluation
lgd_preds = lgd_model.predict(X_test_scaled)
print("LGD Predictions:", lgd_preds[:10])
print("LGD Mean Squared Error:", mean_squared_error(y_test, lgd_preds))


LGD Predictions: [ 0.00358436  0.05707864  0.09771599  0.15331234  0.16131584  0.17950595
  0.07219976  0.06697962 -0.00976321  0.12647127]
LGD Mean Squared Error: 0.09343924154565102
