# Data Preprocessing

In [2]:
import pandas as pd 
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

## Data Cleaning

In [3]:
# Both CSV files sit in the same folder of the practice-datasets repository.
DATASET_BASE_URL = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main"

train_url = f"{DATASET_BASE_URL}/home_loan_train.csv"
test_url = f"{DATASET_BASE_URL}/home_loan_test.csv"

In [4]:
# Read the test CSV (comma is read_csv's default separator) and preview two rows.
test_data = pd.read_csv(test_url)
test_data.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban


In [5]:
# Read the training CSV (comma is read_csv's default separator) and preview two rows.
train_data = pd.read_csv(train_url)
train_data.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [6]:
# Work on a deep copy so the loaded training frame itself is never modified.
df = train_data.copy(deep=True)

In [7]:
# Correct datatypes: the "3+" dependents label becomes the number 3 so that
# the column can be cast to float alongside ApplicantIncome.
df = (
    df.assign(Dependents=df["Dependents"].replace("3+", 3))
      .astype({"Dependents": "float64", "ApplicantIncome": "float64"})
)

In [8]:
# Report every column that still contains missing values.
print("Missing Values")
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    # Only list the columns that actually have gaps.
    print(missing_values[missing_values > 0])
else:
    print("No missing values")

# Report rows that are exact duplicates of an earlier row.
print("\nDuplicate Rows:")
duplicates = df.duplicated().sum()
print(f"Number of duplicated Rows: {duplicates}")

if duplicates > 0:
    # `duplicates` is a scalar count, so len(duplicates) would raise TypeError;
    # the share of duplicates is measured against the number of rows in df.
    print(f"Percentage of duplicates: {(duplicates / len(df)) * 100:.2f}%")
else:
    print("There are no duplicate rows.")

Missing Values
Gender              13
Married              3
Dependents          15
Self_Employed       32
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
dtype: int64

Duplicate Rows:
Number of duplicated Rows: 0
There are no duplicate rows.


In [9]:
# Handling missing values: rows without a loan amount are dropped, not imputed.
df = df.dropna(subset=["LoanAmount"])

In [10]:
# Rows without a loan term are dropped as well.
df = df.dropna(subset=["Loan_Amount_Term"])

In [11]:
# Impute the remaining gaps: these three text columns take their most frequent
# value (mode), the two numeric columns take their median.
for column in ("Gender", "Married", "Self_Employed"):
    most_frequent = df[column].mode().iloc[0]
    df[column] = df[column].fillna(most_frequent)

for column in ("Dependents", "Credit_History"):
    df[column] = df[column].fillna(df[column].median())


In [12]:
# Integer version of Loan_Status: 1 where the status is 'Y', 0 otherwise.
df["Loan_Status_Int"] = df["Loan_Status"].eq("Y").astype(int)

In [13]:
def map_credit_history_category(c):
    """
    Turn a credit-history value into a text label.

    Returns "Good" when ``c`` equals 1 and "Bad" for every other value.
    """
    return "Good" if c == 1 else "Bad"

In [14]:
# Label each credit-history value element-wise with the helper defined above.
df["Credit_History_Label"] = df["Credit_History"].map(map_credit_history_category)

In [15]:
# Per-column count of missing values.
df.isna().sum()

Loan_ID                 0
Gender                  0
Married                 0
Dependents              0
Education               0
Self_Employed           0
ApplicantIncome         0
CoapplicantIncome       0
LoanAmount              0
Loan_Amount_Term        0
Credit_History          0
Property_Area           0
Loan_Status             0
Loan_Status_Int         0
Credit_History_Label    0
dtype: int64

In [16]:
# Print the index range, per-column non-null counts and dtypes of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 578 entries, 1 to 613
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Loan_ID               578 non-null    object 
 1   Gender                578 non-null    object 
 2   Married               578 non-null    object 
 3   Dependents            578 non-null    float64
 4   Education             578 non-null    object 
 5   Self_Employed         578 non-null    object 
 6   ApplicantIncome       578 non-null    float64
 7   CoapplicantIncome     578 non-null    float64
 8   LoanAmount            578 non-null    float64
 9   Loan_Amount_Term      578 non-null    float64
 10  Credit_History        578 non-null    float64
 11  Property_Area         578 non-null    object 
 12  Loan_Status           578 non-null    object 
 13  Loan_Status_Int       578 non-null    int64  
 14  Credit_History_Label  578 non-null    object 
dtypes: float64(6), int64(1), obj

## Preprocessing

Encode Gender

In [17]:
# Gender is a binary category, so label (or binary) encoding is sufficient.

le = LabelEncoder().fit(df["Gender"])
df["gender_encoded"] = le.transform(df["Gender"])

# Show the encoded values next to the original labels.
df.loc[:, ["gender_encoded", "Gender"]]

Unnamed: 0,gender_encoded,Gender
1,1,Male
2,1,Male
3,1,Male
4,1,Male
5,1,Male
...,...,...
609,0,Female
610,1,Male
611,1,Male
612,1,Male


In [18]:
# List the distinct values in the Education column.
df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)