In [2]:
# Import necessary libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('loan_approval.csv')  # Replace with the correct file path
print(df.head())  # Display the first 5 rows

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [3]:
# Check dataset info
print(df.info())

# Check for missing values
print(df.isnull().sum())

# Check summary statistics for numerical columns
print(df.describe())

# Check unique values in categorical columns
print(df['Gender'].unique())  # Example for Gender column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education  

In [4]:
# Check missing values again
print(df.isnull().sum())

# Handle missing values
# For numerical columns, impute with median
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)

# For categorical columns, impute with mode
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

# Verify if missing values are handled
print(df.isnull().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

In [5]:
# Detect outliers in numerical columns (e.g., LoanAmount)
Q1 = df['LoanAmount'].quantile(0.25)
Q3 = df['LoanAmount'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap outliers to the upper and lower bounds
df['LoanAmount'] = df['LoanAmount'].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))

In [6]:
# Encode categorical variables
df = pd.get_dummies(df, columns=['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)

# Check the updated dataset
print(df.head())

    Loan_ID Dependents  ApplicantIncome  CoapplicantIncome  LoanAmount  \
0  LP001002          0             5849                0.0       128.0   
1  LP001003          1             4583             1508.0       128.0   
2  LP001005          0             3000                0.0        66.0   
3  LP001006          0             2583             2358.0       120.0   
4  LP001008          0             6000                0.0       141.0   

   Loan_Amount_Term  Credit_History Loan_Status  Gender_Male  Married_Yes  \
0             360.0             1.0           Y         True        False   
1             360.0             1.0           N         True         True   
2             360.0             1.0           Y         True         True   
3             360.0             1.0           Y         True         True   
4             360.0             1.0           Y         True        False   

   Education_Not Graduate  Self_Employed_Yes  Property_Area_Semiurban  \
0                  

In [7]:
# Save the cleaned dataset
df.to_csv('cleaned_loan_prediction.csv', index=False)

## 2.1 Descriptive Statistics

In [8]:
# Import necessary libraries
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('cleaned_loan_prediction.csv')

# Display descriptive statistics for numerical columns
print(df.describe())

# Display frequency distribution for categorical columns
print(df['Loan_Status'].value_counts())  # Target variable
print(df['Credit_History'].value_counts())  # Example for another categorical column

       ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
count       614.000000         614.000000  614.000000         600.00000   
mean       5403.459283        1621.245798  137.365635         342.00000   
std        6109.041673        2926.248369   55.779749          65.12041   
min         150.000000           0.000000    9.000000          12.00000   
25%        2877.500000           0.000000  100.250000         360.00000   
50%        3812.500000        1188.500000  128.000000         360.00000   
75%        5795.000000        2297.250000  164.750000         360.00000   
max       81000.000000       41667.000000  261.500000         480.00000   

       Credit_History  
count      614.000000  
mean         0.855049  
std          0.352339  
min          0.000000  
25%          1.000000  
50%          1.000000  
75%          1.000000  
max          1.000000  
Loan_Status
Y    422
N    192
Name: count, dtype: int64
Credit_History
1.0    525
0.0     89
Name: count, dt

## Step 2.2: Hypothesis Testing

In [9]:
# Import necessary libraries
from scipy.stats import chi2_contingency

# Create a contingency table for Credit_History and Loan_Status
contingency_table = pd.crosstab(df['Credit_History'], df['Loan_Status'])

# Perform Chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print("Chi-square Test Results:")
print(f"Chi2 Statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies Table:")
print(expected)

Chi-square Test Results:
Chi2 Statistic: 176.1145746235241
P-value: 3.4183499979091188e-40
Degrees of Freedom: 1
Expected Frequencies Table:
[[ 27.83061889  61.16938111]
 [164.16938111 360.83061889]]


## Step 2.3: t-test for ApplicantIncome

In [10]:
# Import necessary libraries
from scipy.stats import ttest_ind

# Separate ApplicantIncome for approved and rejected loans
income_approved = df[df['Loan_Status'] == 'Y']['ApplicantIncome']
income_rejected = df[df['Loan_Status'] == 'N']['ApplicantIncome']

# Perform t-test
t_stat, p_value = ttest_ind(income_approved, income_rejected)

print("t-test Results for ApplicantIncome:")
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")

t-test Results for ApplicantIncome:
t-statistic: -0.11650844828724542
p-value: 0.907287812130518


## Step 2.4: Multicollinearity Check

In [18]:
# Impute missing values in Loan_Amount_Term
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median(), inplace=True)

# Verify if missing values are handled
print(df['Loan_Amount_Term'].isnull().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median(), inplace=True)


In [19]:
# Import necessary libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Select independent variables for VIF calculation
independent_vars = df[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']]

# Calculate VIF for each variable
vif_data = pd.DataFrame()
vif_data['Variable'] = independent_vars.columns
vif_data['VIF'] = [variance_inflation_factor(independent_vars.values, i) for i in range(independent_vars.shape[1])]

print("VIF Results:")
print(vif_data)

VIF Results:
            Variable       VIF
0    ApplicantIncome  2.321698
1  CoapplicantIncome  1.456320
2         LoanAmount  9.008554
3   Loan_Amount_Term  9.639508
4     Credit_History  5.871926


## Step 1: Create a New Feature (Monthly Loan Payment)

In [20]:
# Create a new feature: Monthly Loan Payment
df['Monthly_Loan_Payment'] = df['LoanAmount'] / df['Loan_Amount_Term']

# Drop the original LoanAmount and Loan_Amount_Term columns
df.drop(['LoanAmount', 'Loan_Amount_Term'], axis=1, inplace=True)

# Verify the new feature
print(df[['Monthly_Loan_Payment']].head())

   Monthly_Loan_Payment
0              0.355556
1              0.355556
2              0.183333
3              0.333333
4              0.391667


In [22]:
# Select independent variables for VIF calculation
independent_vars = df[['ApplicantIncome', 'CoapplicantIncome', 'Monthly_Loan_Payment', 'Credit_History']]

# Calculate VIF for each variable
vif_data = pd.DataFrame()
vif_data['Variable'] = independent_vars.columns
vif_data['VIF'] = [variance_inflation_factor(independent_vars.values, i) for i in range(independent_vars.shape[1])]

print("Updated VIF Results:")
print(vif_data)

Updated VIF Results:
               Variable       VIF
0       ApplicantIncome  1.712168
1     CoapplicantIncome  1.305973
2  Monthly_Loan_Payment  1.916867
3        Credit_History  2.219233


## Step 3: Outlier Detection Using Cook’s Distance
Cook’s Distance measures the influence of each data point on the regression model. Points with a Cook’s Distance greater than a threshold (typically 4/n, where n is the number of observations) are considered influential outliers.

In [23]:
# Import necessary libraries
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Prepare the data for logistic regression
# Encode the target variable (Loan_Status) as binary (1 for 'Y', 0 for 'N')
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

# Define independent variables (features) and dependent variable (target)
X = df[['ApplicantIncome', 'CoapplicantIncome', 'Monthly_Loan_Payment', 'Credit_History']]
X = sm.add_constant(X)  # Add a constant for the intercept term
y = df['Loan_Status']

# Fit the logistic regression model
model = sm.Logit(y, X)
results = model.fit()

# Calculate Cook's Distance
influence = results.get_influence()
cooks_distance = influence.cooks_distance[0]

# Identify outliers (points with Cook's Distance > 4/n)
n = len(df)
outlier_threshold = 4 / n
outliers = cooks_distance > outlier_threshold

# Print the number of outliers and their indices
print(f"Number of outliers: {outliers.sum()}")
print(f"Outlier indices: {df.index[outliers].tolist()}")

# Add Cook's Distance to the dataframe for visualization
df['Cooks_Distance'] = cooks_distance

# Display rows with outliers
print(df[outliers])

Optimization terminated successfully.
         Current function value: 0.476154
         Iterations 6
Number of outliers: 23
Outlier indices: [9, 122, 155, 171, 183, 201, 262, 267, 284, 308, 326, 333, 369, 402, 417, 453, 487, 497, 527, 546, 581, 585, 600]
      Loan_ID Dependents  ApplicantIncome  CoapplicantIncome  Credit_History  \
9    LP001020          1            12841            10968.0             1.0   
122  LP001431          0             2137             8980.0             0.0   
155  LP001536         3+            39999                0.0             0.0   
171  LP001585         3+            51763                0.0             1.0   
183  LP001637          1            33846                0.0             1.0   
201  LP001677          2             4923                0.0             0.0   
262  LP001870          1             3481                0.0             1.0   
267  LP001882         3+             4333             1811.0             0.0   
284  LP001922          0

## Step 1: Remove Extreme Outliers

In [25]:
# Remove the row with the highest Cook's Distance
df_cleaned = df.drop(index=497)

# Verify the removal
print(f"Number of rows after removal: {len(df_cleaned)}")

Number of rows after removal: 613


In [26]:
# Prepare the cleaned data for logistic regression
X_cleaned = df_cleaned[['ApplicantIncome', 'CoapplicantIncome', 'Monthly_Loan_Payment', 'Credit_History']]
X_cleaned = sm.add_constant(X_cleaned)  # Add a constant for the intercept term
y_cleaned = df_cleaned['Loan_Status']

# Fit the logistic regression model on cleaned data
model_cleaned = sm.Logit(y_cleaned, X_cleaned)
results_cleaned = model_cleaned.fit()

# Calculate Cook's Distance for the cleaned data
influence_cleaned = results_cleaned.get_influence()
cooks_distance_cleaned = influence_cleaned.cooks_distance[0]

# Identify outliers in the cleaned data
outlier_threshold_cleaned = 4 / len(df_cleaned)
outliers_cleaned = cooks_distance_cleaned > outlier_threshold_cleaned

# Print the number of outliers and their indices
print(f"Number of outliers after removal: {outliers_cleaned.sum()}")
print(f"Outlier indices after removal: {df_cleaned.index[outliers_cleaned].tolist()}")

Optimization terminated successfully.
         Current function value: 0.474795
         Iterations 6
Number of outliers after removal: 24
Outlier indices after removal: [9, 68, 122, 155, 183, 201, 242, 262, 267, 284, 308, 326, 333, 369, 402, 409, 417, 453, 487, 527, 546, 581, 585, 600]


# **Model Building Process**

In [27]:
from sklearn.model_selection import train_test_split

# Define independent variables (features) and dependent variable (target)
X = df_cleaned[['ApplicantIncome', 'CoapplicantIncome', 'Monthly_Loan_Payment', 'Credit_History']]
y = df_cleaned['Loan_Status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the splits
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (490, 4)
Testing set shape: (123, 4)


In [28]:
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Print the model coefficients
print("Model Coefficients:")
print(model.coef_)

Model Coefficients:
[[ 1.27943698e-05 -4.52776624e-05 -5.56031282e-01  3.48525970e+00]]


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.7805
Precision: 0.7714
Recall: 0.9643
F1-Score: 0.8571
ROC-AUC Score: 0.6745
Confusion Matrix:
[[15 24]
 [ 3 81]]


## Step 1: Feature Engineering

In [30]:
# Create a new feature: Total Income
df_cleaned['Total_Income'] = df_cleaned['ApplicantIncome'] + df_cleaned['CoapplicantIncome']

# Normalize numerical features
from sklearn.preprocessing import StandardScaler

# Define numerical features to normalize
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'Monthly_Loan_Payment', 'Total_Income']

# Initialize the scaler
scaler = StandardScaler()

# Normalize the numerical features
df_cleaned[numerical_features] = scaler.fit_transform(df_cleaned[numerical_features])

# Verify the updated dataset
print(df_cleaned.head())

    Loan_ID Dependents  ApplicantIncome  CoapplicantIncome  Credit_History  \
0  LP001002          0         0.072724          -0.553427             1.0   
1  LP001003          1        -0.134512          -0.038017             1.0   
2  LP001005          0        -0.393640          -0.553427             1.0   
3  LP001006          0        -0.461900           0.252500             1.0   
4  LP001008          0         0.097442          -0.553427             1.0   

   Loan_Status  Gender_Male  Married_Yes  Education_Not Graduate  \
0            1         True        False                   False   
1            0         True         True                   False   
2            1         True         True                   False   
3            1         True         True                    True   
4            1         True        False                   False   

   Self_Employed_Yes  Property_Area_Semiurban  Property_Area_Urban  \
0              False                    False       

In [31]:
# Define independent variables (features) and dependent variable (target)
X = df_cleaned[['ApplicantIncome', 'CoapplicantIncome', 'Monthly_Loan_Payment', 'Credit_History', 'Total_Income']]
y = df_cleaned['Loan_Status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.7805
Precision: 0.7714
Recall: 0.9643
F1-Score: 0.8571
ROC-AUC Score: 0.6745
Confusion Matrix:
[[15 24]
 [ 3 81]]
