In [2]:
# Import necessary libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('loan_approval.csv')  # Replace with the correct file path
print(df.head())  # Display the first 5 rows

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [3]:
# Check dataset info
print(df.info())

# Check for missing values
print(df.isnull().sum())

# Check summary statistics for numerical columns
print(df.describe())

# Check unique values in categorical columns
print(df['Gender'].unique())  # Example for Gender column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education  

In [4]:
# Check missing values again
print(df.isnull().sum())

# Handle missing values
# For numerical columns, impute with median
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)

# For categorical columns, impute with mode
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

# Verify if missing values are handled
print(df.isnull().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

In [5]:
# Detect outliers in numerical columns (e.g., LoanAmount)
Q1 = df['LoanAmount'].quantile(0.25)
Q3 = df['LoanAmount'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap outliers to the upper and lower bounds
df['LoanAmount'] = df['LoanAmount'].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))

In [6]:
# Encode categorical variables
df = pd.get_dummies(df, columns=['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)

# Check the updated dataset
print(df.head())

    Loan_ID Dependents  ApplicantIncome  CoapplicantIncome  LoanAmount  \
0  LP001002          0             5849                0.0       128.0   
1  LP001003          1             4583             1508.0       128.0   
2  LP001005          0             3000                0.0        66.0   
3  LP001006          0             2583             2358.0       120.0   
4  LP001008          0             6000                0.0       141.0   

   Loan_Amount_Term  Credit_History Loan_Status  Gender_Male  Married_Yes  \
0             360.0             1.0           Y         True        False   
1             360.0             1.0           N         True         True   
2             360.0             1.0           Y         True         True   
3             360.0             1.0           Y         True         True   
4             360.0             1.0           Y         True        False   

   Education_Not Graduate  Self_Employed_Yes  Property_Area_Semiurban  \
0                  

In [7]:
# Save the cleaned dataset
df.to_csv('cleaned_loan_prediction.csv', index=False)