#### **1. Import Libraries and Load Data**

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Preprocessing liberaries
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_regression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# statistical libraries
from scipy import stats
from scipy.stats import zscore, skew

# set style forbetter visualiztions
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

print("Libraries imported successfully!")

Libraries imported successfully!


In [20]:
# load in the dataset

train_df = pd.read_csv("clean_train_home_loan.csv")
train_df.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N


#### **2. EDA-Based Data Quality Assessment**

Based on EDA findings, let's assess specific issues identified

In [23]:
# Creating a copy for preprocessing
df = train_df.copy()

# 1. Checking for missing values (EDA showed on missing values)
print("\n1. Missing Values:")
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found (as expected from EDA)")

# 2. Checking for duplicates
print("\n2. Duplicate Rows:")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")

# 3. Checking skewness for variables indified in EDA as a right-skewed
print("\n3. Skewness Analysis (EDA identified right-skewed variable):")
skewed_vars = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
for var in skewed_vars:
    if var in df.columns:
        skewness = skew(df[var])
        print(f"{var}: skewness = {skewness:.3f} ({'right-skewed' if skewness > 0.5 else 'approximately normal'})")

# 4. Check correlation with target (EDA evidence)
print("\n4. Correlation with Quality (EDA Evidence):")
df.drop(columns='Loan_ID', inplace=True)
correlations = df.corr()['Loan_Status'].sort_values(key=abs, ascending=False)
print(correlations)



1. Missing Values:
No missing values found (as expected from EDA)

2. Duplicate Rows:
Number of duplicate rows: 0

3. Skewness Analysis (EDA identified right-skewed variable):
ApplicantIncome: skewness = 1.037 (right-skewed)
CoapplicantIncome: skewness = 1.010 (right-skewed)
LoanAmount: skewness = 0.682 (right-skewed)

4. Correlation with Quality (EDA Evidence):


ValueError: could not convert string to float: 'Male'

In [22]:
df.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849.0,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y
