### Importing necessary libraries

In [385]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

### Loading the dataset

In [386]:
df = pd.read_csv("loan_application_data.csv")
df

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,$5849.0
1,1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,$6091.0
2,2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,$3000.0
3,3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,$4941.0
4,4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,$6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,LP002586,Female,Yes,1,Graduate,No,3326,913.0,105.0,84.0,1.0,Semiurban,Y,$4239.0
496,496,LP002587,Male,Yes,0,Not Graduate,No,2600,1700.0,107.0,360.0,1.0,Rural,Y,$4300.0
497,497,LP002588,Male,Yes,0,Graduate,No,4625,2857.0,111.0,12.0,,Urban,Y,$7482.0
498,498,LP002600,Male,Yes,1,Graduate,Yes,2895,0.0,95.0,360.0,1.0,Semiurban,Y,$2895.0


From above we got to know that the dataset which we're using has:
- 500 rows
- 15 columns

### Getting basic information about the data

In [387]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         500 non-null    int64  
 1   Loan_ID            500 non-null    object 
 2   Gender             491 non-null    object 
 3   Married            497 non-null    object 
 4   Dependents         488 non-null    object 
 5   Education          500 non-null    object 
 6   Self_Employed      473 non-null    object 
 7   ApplicantIncome    500 non-null    int64  
 8   CoapplicantIncome  500 non-null    float64
 9   LoanAmount         482 non-null    float64
 10  Loan_Amount_Term   486 non-null    float64
 11  Credit_History     459 non-null    float64
 12  Property_Area      500 non-null    object 
 13  Loan_Status        500 non-null    object 
 14  Total_Income       500 non-null    object 
dtypes: float64(4), int64(2), object(9)
memory usage: 58.7+ KB


There are missing values in the dataset so first step would be to drop the missing values

### Statistical evaluation of data

In [388]:
df.describe(include = "all")

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
count,500.0,500,491,497,488.0,500,473,500.0,500.0,482.0,486.0,459.0,500,500,500
unique,,500,2,2,4.0,2,2,,,,,,3,2,457
top,,LP002602,Male,Yes,0.0,Graduate,No,,,,,,Semiurban,Y,$6000.0
freq,,1,400,322,288.0,393,407,,,,,,188,345,4
mean,249.5,,,,,,,5493.644,1506.30784,144.020747,342.54321,0.843137,,,
std,144.481833,,,,,,,6515.668972,2134.432188,82.344919,63.834977,0.364068,,,
min,0.0,,,,,,,150.0,0.0,17.0,12.0,0.0,,,
25%,124.75,,,,,,,2874.5,0.0,100.0,360.0,1.0,,,
50%,249.5,,,,,,,3854.0,1125.5,126.5,360.0,1.0,,,
75%,374.25,,,,,,,5764.0,2253.25,161.5,360.0,1.0,,,


### Dropping the `Unnamed` and `Loan_ID` column

In [389]:
df = df.drop(columns = ["Loan_ID", "Unnamed: 0"])
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,$5849.0
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,$6091.0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,$3000.0
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,$4941.0
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,$6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Female,Yes,1,Graduate,No,3326,913.0,105.0,84.0,1.0,Semiurban,Y,$4239.0
496,Male,Yes,0,Not Graduate,No,2600,1700.0,107.0,360.0,1.0,Rural,Y,$4300.0
497,Male,Yes,0,Graduate,No,4625,2857.0,111.0,12.0,,Urban,Y,$7482.0
498,Male,Yes,1,Graduate,Yes,2895,0.0,95.0,360.0,1.0,Semiurban,Y,$2895.0


### Count of null values in numeric columns

In [390]:
df.isnull().sum()

Gender                9
Married               3
Dependents           12
Education             0
Self_Employed        27
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           18
Loan_Amount_Term     14
Credit_History       41
Property_Area         0
Loan_Status           0
Total_Income          0
dtype: int64

In [391]:
df.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
Total_Income          object
dtype: object

### Converting the numeric values to numbers

In [392]:
df["Total_Income"] = (
    df["Total_Income"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)

In [393]:
num_cols_to_num = [
    "Dependents", "Total_Income"
]

for col in num_cols_to_num:
    df[col] = pd.to_numeric(df[col], errors = "coerce")

In [394]:
df.dtypes

Gender                object
Married               object
Dependents           float64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
Total_Income         float64
dtype: object

### Handling numeric missing values
Replacing the `null` values with `median` values

In [395]:
num_cols = [
    "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"
]

for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

### Handling categorical missing values
Replacing the `null` values with `median` values

In [396]:
cat_cols = [
    "Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"
]

for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

### Changing `Loan_Status` from categorical value to numeric value

In [397]:
df['Loan_Status'] = df['Loan_Status'].map({'Y': 0, 'N': 1})

### Checking unique values in categorical data

In [398]:
for col in cat_cols:
    print(col, df[col].unique())

Gender ['Male' 'Female']
Married ['No' 'Yes']
Dependents [0. 1. 2.]
Education ['Graduate' 'Not Graduate']
Self_Employed ['No' 'Yes']
Property_Area ['Urban' 'Rural' 'Semiurban']


### Checking for total null values

In [399]:
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
Total_Income         0
dtype: int64

### Checking data types count

In [400]:
df.dtypes.value_counts()

float64    6
object     5
int64      2
Name: count, dtype: int64

In [401]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             500 non-null    object 
 1   Married            500 non-null    object 
 2   Dependents         500 non-null    float64
 3   Education          500 non-null    object 
 4   Self_Employed      500 non-null    object 
 5   ApplicantIncome    500 non-null    int64  
 6   CoapplicantIncome  500 non-null    float64
 7   LoanAmount         500 non-null    float64
 8   Loan_Amount_Term   500 non-null    float64
 9   Credit_History     500 non-null    float64
 10  Property_Area      500 non-null    object 
 11  Loan_Status        500 non-null    int64  
 12  Total_Income       500 non-null    float64
dtypes: float64(6), int64(2), object(5)
memory usage: 50.9+ KB


# Model training

### Splitting the features and labels

In [402]:
X = df.drop("Loan_Status", axis = 1)
y = df["Loan_Status"]

### One-hot encoding

In [414]:
X = pd.get_dummies(X)

### Splitting the data into training and testing split

In [403]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
    stratify = y
)

In [416]:
X.dtypes.unique()

array([dtype('float64'), dtype('int64'), dtype('bool')], dtype=object)

### Scaling the features

In [None]:
from sklearn.preprocessing import StandardScaler

# Select numeric columns from X_train (which now has dummy variables)
num_cols = X_train.select_dtypes(include = ["int64", "float64"]).columns

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

### Converting the `bool` to `int`

In [None]:
bool_cols = X_train.select_dtypes(include="bool").columns
X_train[bool_cols] = X_train[bool_cols].astype(int)
X_test[bool_cols] = X_test[bool_cols].astype(int)

### Locking the features by converting them to list

In [417]:
feature_names = X_train.columns.to_list()
print("Features:", len(feature_names))

Features: 18


### Training the Logistic model

In [418]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter = 1000,
    class_weight = "balanced"
)

model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


### Verifying the alignment

In [419]:
print("Model coef size:", model.coef_.shape[1])
print("Feature names:", len(feature_names))

assert model.coef_.shape[1] == len(feature_names)

Model coef size: 18
Feature names: 18


### Feature importance

In [421]:
feature_importance = pd.Series(
    model.coef_[0],
    index = feature_names
).sort_values()

feature_importance

Credit_History            -1.335368
Property_Area_Semiurban   -0.299293
Married_Yes               -0.097641
Dependents                -0.088464
ApplicantIncome           -0.081325
Total_Income              -0.069032
Education_Graduate        -0.040818
Self_Employed_No          -0.032935
Gender_Male               -0.026523
Gender_Female              0.026523
Self_Employed_Yes          0.032935
CoapplicantIncome          0.035423
Education_Not Graduate     0.040818
Married_No                 0.097641
Property_Area_Urban        0.100847
Loan_Amount_Term           0.214661
Property_Area_Rural        0.215936
LoanAmount                 0.309566
dtype: float64

### Predictions

In [422]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [423]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

confusion_matrix(y_test, y_pred)

Accuracy: 0.72
Precision: 0.5454545454545454
Recall: 0.5806451612903226
F1-score: 0.5625
ROC-AUC: 0.7928938756428238


array([[54, 15],
       [13, 18]])

# Input (Test)

In [424]:
sample_input = {
    "Gender": "Male",
    "Married": "Yes",
    "Dependents": 1,
    "Education": "Graduate",
    "Self_Employed": "No",
    "ApplicantIncome": 5000,
    "CoapplicantIncome": 2000,
    "LoanAmount": 150,
    "Loan_Amount_Term": 360,
    "Credit_History": 1,
    "Property_Area": "Urban",
    "Total_Income": 7000,
    "Loan_Income_Ratio": 150 / 7000
}

In [425]:
def predict_credit_risk(input_data, model, scaler, feature_columns):
    # Convert input to DataFrame
    input_df = pd.DataFrame([input_data])

    # One-hot encode
    input_df = pd.get_dummies(input_df)

    # Align columns with training data
    input_df = input_df.reindex(columns=feature_columns, fill_value=0)

    # Scale
    input_scaled = scaler.transform(input_df)

    # Predict
    risk_prob = model.predict_proba(input_scaled)[0][1]
    risk_class = model.predict(input_scaled)[0]

    result = {
        "Risk_Class": "High Risk" if risk_class == 1 else "Low Risk",
        "Risk_Probability": round(risk_prob, 3)
    }

    return result

In [426]:
result = predict_credit_risk(
    sample_input,
    model,
    scaler,
    X.columns
)

result



{'Risk_Class': 'Low Risk', 'Risk_Probability': np.float64(0.357)}

In [427]:
print(result)

{'Risk_Class': 'Low Risk', 'Risk_Probability': np.float64(0.357)}


In [428]:
import joblib

# Save model
joblib.dump(model, "credit_risk_model.pkl")

# Save scaler
joblib.dump(scaler, "scaler.pkl")

# Save feature names
joblib.dump(feature_names, "feature_names.pkl")

['feature_names.pkl']