In [27]:
# ================================
# STEP 1: Import required libraries
# ================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


# ================================
# STEP 2: Load the dataset
# ================================

df = pd.read_csv(
    r"C:/Users/mohdd/OneDrive/Documents/Desktop/GenAI/Data_Cleaning/loan_data.csv"
)

# View first 5 rows
print(df.head())


# ================================
# STEP 3: Basic dataset information
# ================================

print("\nShape of dataset:")
print(df.shape)     # rows, columns

print("\nColumn names:")
print(df.columns)

print("\nDataset info:")
df.info()


# ================================
# STEP 4: Check missing values
# ================================

print("\nMissing values in each column:")
print(df.isnull().sum())


# ================================
# STEP 5: Clean text columns
# (make everything lowercase to avoid duplicates)
# ================================

df["Gender"] = df["Gender"].str.lower()
df["Married"] = df["Married"].str.lower()


# ================================
# STEP 6: Fill missing values
# ================================

# Fill missing Gender with most common value
df["Gender"] = df["Gender"].fillna(df["Gender"].mode()[0])

# Fill missing Married with most common value
df["Married"] = df["Married"].fillna(df["Married"].mode()[0])

print("\nMissing values after filling:")
print(df.isnull().sum())


# ================================
# STEP 7: Select categorical columns
# ================================

df_cat = df[["Gender", "Married"]]




  Loan_ID  Gender Married Dependents     Education Self_Employed  \
0   LP001    Male     Yes          0      Graduate            No   
1   LP002  Female      No          1      Graduate            No   
2   LP003    Male     Yes          2  Not Graduate           Yes   
3   LP004    Male     Yes          0      Graduate            No   
4   LP005  Female     NaN          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  Loan_Amount  Credit_History  \
0           5000.0                0.0        150.0             1.0   
1           3000.0             1500.0        100.0             1.0   
2           4000.0             1800.0        120.0             0.0   
3           6000.0                0.0          NaN             1.0   
4           3500.0                0.0        110.0             1.0   

  Property_Area Loan_Status  
0         Urban           Y  
1     Semiurban           Y  
2         Rural           N  
3         Urban           Y  
4     Semiurban     

In [28]:
# ================================
# STEP 8: One-Hot Encoding
# ================================

encoder = OneHotEncoder(sparse_output=False)

encoded_data = encoder.fit_transform(df_cat)


# ================================
# STEP 9: Convert encoded data to DataFrame
# ================================

encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(["Gender", "Married"])
)

print("\nOne-Hot Encoded Data:")
print(encoded_df.head())


# ================================
# STEP 10: Combine encoded data with original dataset
# (Optional but useful)
# ================================

df_final = pd.concat([df.drop(["Gender", "Married"], axis=1), encoded_df], axis=1)

print("\nFinal cleaned dataset:")
print(df_final.head())



One-Hot Encoded Data:
   Gender_female  Gender_male  Married_no  Married_yes
0            0.0          1.0         0.0          1.0
1            1.0          0.0         1.0          0.0
2            0.0          1.0         0.0          1.0
3            0.0          1.0         0.0          1.0
4            1.0          0.0         0.0          1.0

Final cleaned dataset:
  Loan_ID Dependents     Education Self_Employed  ApplicantIncome  \
0   LP001          0      Graduate            No           5000.0   
1   LP002          1      Graduate            No           3000.0   
2   LP003          2  Not Graduate           Yes           4000.0   
3   LP004          0      Graduate            No           6000.0   
4   LP005          0      Graduate            No           3500.0   

   CoapplicantIncome  Loan_Amount  Credit_History Property_Area Loan_Status  \
0                0.0        150.0             1.0         Urban           Y   
1             1500.0        100.0             1.0 

In [24]:
pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

Unnamed: 0,Gender_female,Gender_male,Married_female,Married_male
0,0.0,1.0,0.0,1.0
1,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,1.0
4,1.0,0.0,1.0,0.0
5,0.0,1.0,0.0,1.0
6,0.0,1.0,0.0,1.0
7,0.0,1.0,0.0,1.0
8,1.0,0.0,1.0,0.0
9,0.0,1.0,0.0,1.0
