In [140]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [141]:
df = pd.read_csv('datasets/loan.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Features

| Feature | คำแปล / ความหมาย | คำอธิบายเพิ่มเติม |
|---------|-----------------|-----------------|
| **Loan_ID** | รหัสเงินกู้ | หมายเลขอ้างอิงเฉพาะของแต่ละรายการเงินกู้ ใช้เพื่อระบุตัวตนไม่ให้ซ้ำกัน |
| **Gender** | เพศของผู้กู้ | ระบุว่า “ชาย” หรือ “หญิง” |
| **Married** | สถานะการสมรส | บอกว่าผู้กู้ “แต่งงานแล้ว” หรือ “โสด” |
| **Dependents** | จำนวนสมาชิกในครอบครัวที่อยู่ในความดูแล | เช่น บุตร หรือผู้สูงอายุที่ต้องดูแล ระบุเป็นจำนวนคน |
| **Education** | ระดับการศึกษา | ระบุระดับวุฒิการศึกษาของผู้กู้ เช่น “Graduate” (จบปริญญา) หรือ “Not Graduate” (ยังไม่จบปริญญา) |
| **Self_Employed** | สถานะการทำงานอิสระ | บอกว่าผู้กู้เป็น “พนักงานประจำ” หรือ “ประกอบอาชีพอิสระ” |
| **ApplicantIncome** | รายได้ต่อเดือนของผู้กู้หลัก | รายได้เฉลี่ยต่อเดือนของผู้ยื่นกู้ (บาท หรือหน่วยเงินอื่น ๆ) |
| **CoapplicantIncome** | รายได้ต่อเดือนของผู้ร่วมกู้ | รายได้ของผู้ร่วมกู้ (เช่น คู่สมรส หรือญาติ) |
| **LoanAmount** | จำนวนเงินที่ขอกู้ | ระบุจำนวนเงินกู้ที่ผู้ยื่นขอ |
| **Loan_Amount_Term** | ระยะเวลาผ่อนชำระเงินกู้ (เป็นวัน) | เช่น 360 วัน หรือ 480 วัน หมายถึงระยะเวลาทั้งหมดที่ต้องชำระคืน |
| **Credit_History** | ประวัติการชำระหนี้ | ใช้ดูว่าผู้กู้เคยมีการชำระเงินกู้ตรงเวลาหรือไม่ (1 = มีประวัติชำระดี, 0 = ไม่มีประวัติหรือชำระไม่ดี) |
| **Property_Area** | พื้นที่ที่ตั้งทรัพย์สิน | เช่น “Urban” (ในเมือง), “Rural” (ชนบท), หรือ “Semiurban” (กึ่งเมือง) |
| **Loan_Status** | สถานะของเงินกู้ | แสดงผลการอนุมัติ เช่น “Y” = อนุมัติ, “N” = ไม่อนุมัติ |


### View Basic Info

In [142]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


### Is there any duplicates?

In [143]:
print(f"Dataset size before dropping {df.shape}")
df = df.drop_duplicates()
print(f"Dataset size after dropping {df.shape}")

Dataset size before dropping (614, 13)
Dataset size after dropping (614, 13)


In [144]:
df = df.drop("Loan_ID", axis=1)

### Check Features

In [145]:
for col in df.columns:
    print(f"Column: {col}")
    print(df[col].unique())
    print("-" * 30)

Column: Gender
['Male' 'Female' nan]
------------------------------
Column: Married
['No' 'Yes' nan]
------------------------------
Column: Dependents
['0' '1' '2' '3+' nan]
------------------------------
Column: Education
['Graduate' 'Not Graduate']
------------------------------
Column: Self_Employed
['No' 'Yes' nan]
------------------------------
Column: ApplicantIncome
[ 5849  4583  3000  2583  6000  5417  2333  3036  4006 12841  3200  2500
  3073  1853  1299  4950  3596  3510  4887  2600  7660  5955  3365  3717
  9560  2799  4226  1442  3750  4166  3167  4692  3500 12500  2275  1828
  3667  3748  3600  1800  2400  3941  4695  3410  5649  5821  2645  4000
  1928  3086  4230  4616 11500  2708  2132  3366  8080  3357  3029  2609
  4945  5726 10750  7100  4300  3208  1875  4755  5266  1000  3333  3846
  2395  1378  3988  2366  8566  5695  2958  6250  3273  4133  3620  6782
  2484  1977  4188  1759  4288  4843 13650  4652  3816  3052 11417  7333
  3800  2071  5316  2929  3572  7451  50

In [146]:
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [147]:
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Loan_Amount_Term', 'Loan_Status']
continuous_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

### What are missing columns?

In [148]:
def find_missing_columns(df, category_names, continuous_names):
    missing_cols = df.columns[df.isna().any()].tolist()
    
    result = {
        'category': [col for col in missing_cols if col in category_names],
        'continuous': [col for col in missing_cols if col in continuous_names]
    }
    return result

In [149]:
missing_list = find_missing_columns(df, categorical_features, continuous_features)
missing_list

{'category': ['Gender',
  'Married',
  'Dependents',
  'Self_Employed',
  'Loan_Amount_Term',
  'Credit_History'],
 'continuous': ['LoanAmount']}

In [150]:
missing_categorical = missing_list['category']
missing_continuous = missing_list['continuous']
print(missing_categorical)
print(missing_continuous)

['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']
['LoanAmount']


# Handling Missing Categorical Features by Mode

In [151]:
df[categorical_features].isna().sum()

Gender              13
Married              3
Dependents          15
Education            0
Self_Employed       32
Credit_History      50
Property_Area        0
Loan_Amount_Term    14
Loan_Status          0
dtype: int64

In [152]:
from sklearn.impute import SimpleImputer

In [153]:
cat_impulter = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [154]:
df[missing_categorical] = cat_impulter.fit_transform(df[missing_categorical])

### Checking if any missing categorical data again

In [155]:
df[categorical_features].isna().sum()

Gender              0
Married             0
Dependents          0
Education           0
Self_Employed       0
Credit_History      0
Property_Area       0
Loan_Amount_Term    0
Loan_Status         0
dtype: int64

### Check missing columns value after fill

In [156]:
for col in missing_categorical:
    print(f"Column: {col}")
    print(df[col].unique())
    print("-" * 30)

Column: Gender
['Male' 'Female']
------------------------------
Column: Married
['No' 'Yes']
------------------------------
Column: Dependents
['0' '1' '2' '3+']
------------------------------
Column: Self_Employed
['No' 'Yes']
------------------------------
Column: Loan_Amount_Term
[360.0 120.0 240.0 180.0 60.0 300.0 480.0 36.0 84.0 12.0]
------------------------------
Column: Credit_History
[1.0 0.0]
------------------------------


# Handling Missing Continuous Features by Mean

In [157]:
df[continuous_features].isna().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
dtype: int64

In [158]:
cont_impulter = SimpleImputer(strategy='mean', missing_values=np.nan)

In [159]:
df[missing_continuous] = cont_impulter.fit_transform(df[missing_continuous])

### Checking if any missing continuous data again

In [160]:
df[continuous_features].isna().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
dtype: int64

### Check missing values after fill

In [161]:
for col in missing_continuous:
    print(f"Column: {col}")
    print(df[col].unique())
    print("-" * 30)

Column: LoanAmount
[146.41216216 128.          66.         120.         141.
 267.          95.         158.         168.         349.
  70.         109.         200.         114.          17.
 125.         100.          76.         133.         115.
 104.         315.         116.         112.         151.
 191.         122.         110.          35.         201.
  74.         106.         320.         144.         184.
  80.          47.          75.         134.          96.
  88.          44.         286.          97.         135.
 180.          99.         165.         258.         126.
 312.         136.         172.          81.         187.
 113.         176.         130.         111.         167.
 265.          50.         210.         175.         131.
 188.          25.         137.         160.         225.
 216.          94.         139.         152.         118.
 185.         154.          85.         259.         194.
  93.         370.         182.         650.         

## Check whole df info again

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    object 
 9   Credit_History     614 non-null    object 
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(2), int64(1), object(9)
memory usage: 57.7+ KB


In [163]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


---

# Outlier Handling

### Interquartile Range

In [164]:
def cap_outliers_iqr(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    n_outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
    print(f"{col}: {n_outliers} outliers")

    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

### Replacing Outlier with maximum cap

In [165]:
for col in continuous_features:
    cap_outliers_iqr(df, col)

ApplicantIncome: 50 outliers
CoapplicantIncome: 18 outliers
LoanAmount: 41 outliers


In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    float64
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    object 
 9   Credit_History     614 non-null    object 
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(3), object(9)
memory usage: 57.7+ KB


---

## Scalier
will scale these columns
- ApplicantIncome
- CoapplicantIncome
- LoanAmount

In [167]:
from sklearn.preprocessing import StandardScaler

In [168]:
columns_to_scale = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

In [169]:
scaler = StandardScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [170]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,0.497164,-0.874587,0.150494,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,-0.013767,0.054395,-0.179896,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,-0.652632,-0.874587,-1.292433,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,-0.820924,0.578025,-0.323449,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,0.558104,-0.874587,0.053377,360.0,1.0,Urban,Y


---

## One-Hot Encoding

In [171]:
df_encoded = pd.get_dummies(df, columns=categorical_features)
df_encoded.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,...,Loan_Amount_Term_60.0,Loan_Amount_Term_84.0,Loan_Amount_Term_120.0,Loan_Amount_Term_180.0,Loan_Amount_Term_240.0,Loan_Amount_Term_300.0,Loan_Amount_Term_360.0,Loan_Amount_Term_480.0,Loan_Status_N,Loan_Status_Y
0,0.497164,-0.874587,0.150494,False,True,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
1,-0.013767,0.054395,-0.179896,False,True,False,True,False,True,False,...,False,False,False,False,False,False,True,False,True,False
2,-0.652632,-0.874587,-1.292433,False,True,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,-0.820924,0.578025,-0.323449,False,True,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,True
4,0.558104,-0.874587,0.053377,False,True,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True


In [172]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ApplicantIncome          614 non-null    float64
 1   CoapplicantIncome        614 non-null    float64
 2   LoanAmount               614 non-null    float64
 3   Gender_Female            614 non-null    bool   
 4   Gender_Male              614 non-null    bool   
 5   Married_No               614 non-null    bool   
 6   Married_Yes              614 non-null    bool   
 7   Dependents_0             614 non-null    bool   
 8   Dependents_1             614 non-null    bool   
 9   Dependents_2             614 non-null    bool   
 10  Dependents_3+            614 non-null    bool   
 11  Education_Graduate       614 non-null    bool   
 12  Education_Not Graduate   614 non-null    bool   
 13  Self_Employed_No         614 non-null    bool   
 14  Self_Employed_Yes        6

# Save into new csv

In [173]:
df_encoded.to_csv("datasets/loan_encoded.csv", index=False)

### Also save the scaler

In [174]:
import joblib
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']