<a href="https://colab.research.google.com/github/Naidu-DS-2026/airline-data-analysis-pbi/blob/main/loan_approval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load Dataset
# The workbook is read from a hard-coded Colab path; adjust file_path when
# running anywhere else.
file_path = '/content/Copy of loan (1).xlsx' # Replace with the correct path
data = pd.read_excel(file_path)
# Display the first few rows
print("Dataset Preview:")
print(data.head())
# Display column information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Information:")
data.info()

Dataset Preview:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2  

In [None]:
# Check for missing values
print("\nMissing Values Before Handling:")
print(data.isnull().sum())

# Handle missing values.
# The filled Series is assigned back to the frame instead of calling
# data[column].fillna(..., inplace=True): the chained in-place form operates on
# an intermediate object, triggers a FutureWarning, and stops updating `data`
# in pandas 3.0.
for column in data.columns:
    if pd.api.types.is_numeric_dtype(data[column]):
        # Fill numerical columns with the median
        data[column] = data[column].fillna(data[column].median())
    else:
        # Fill categorical columns with the mode.
        # mode() is empty when every value is missing; leave such a column
        # untouched rather than failing on the [0] lookup.
        column_modes = data[column].mode()
        if not column_modes.empty:
            data[column] = data[column].fillna(column_modes.iloc[0])

# Verify that missing values are handled
print("\nMissing Values After Handling:")
print(data.isnull().sum())


Missing Values Before Handling:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Missing Values After Handling:
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)


In [None]:
# Encoding categorical variables
# Each listed column (the target Loan_Status included) is replaced by integer
# codes; the fitted encoder is kept in `label_encoders` under the column name.
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]
label_encoders = {}
for name in categorical_columns:
    encoder = LabelEncoder()
    # Values are cast to str before being handed to the encoder
    data[name] = encoder.fit_transform(data[name].astype(str))
    label_encoders[name] = encoder

print("\nDataset after encoding categorical variables:")
print(data.head())


Dataset after encoding categorical variables:
    Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
0  LP001002       1        0           0          0              0   
1  LP001003       1        1           1          0              0   
2  LP001005       1        1           0          0              1   
3  LP001006       1        1           0          1              0   
4  LP001008       1        0           0          0              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0       128.0             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History  Property_Area  Loan_Status  
0             1.0              2            1  
1        

In [None]:
# Define Features (X) and Target (y)
target_column = 'Loan_Status'
y = data[target_column]  # Target variable
# Everything except the identifier and the target is used as a feature
X = data.drop(columns=['Loan_ID', target_column])

print("\nFeatures and Target:")
print("Features (X):\n", X.head())
print("Target (y):\n", y.head())


Features and Target:
Features (X):
    Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0       1        0           0          0              0             5849   
1       1        1           1          0              0             4583   
2       1        1           0          0              1             3000   
3       1        1           0          1              0             2583   
4       1        0           0          0              0             6000   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0                0.0       128.0             360.0             1.0   
1             1508.0       128.0             360.0             1.0   
2                0.0        66.0             360.0             1.0   
3             2358.0       120.0             360.0             1.0   
4                0.0       141.0             360.0             1.0   

   Property_Area  
0              2  
1              0  
2              2  
3  

In [None]:
# Split the data into training and testing sets
# 20% of the rows are held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

train_rows, test_rows = X_train.shape[0], X_test.shape[0]
print(f"Training set size: {train_rows} rows")
print(f"Test set size: {test_rows} rows")


Training set size: 491 rows
Test set size: 123 rows


In [None]:
# Standardize features
# The scaler is fitted on the training rows only and then applied to both
# splits. It receives every column of X, not just the originally numeric ones.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("\nFeature Scaling Completed.")



Feature Scaling Completed.


In [None]:
# Train Random Forest Classifier
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print("\nModel Training Completed.")



Model Training Completed.


In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model: compute every metric first, then report them in order
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(report_text)

print("\nConfusion Matrix:")
print(confusion)



Accuracy: 0.75

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.42      0.54        43
           1       0.75      0.93      0.83        80

    accuracy                           0.75       123
   macro avg       0.75      0.67      0.68       123
weighted avg       0.75      0.75      0.73       123


Confusion Matrix:
[[18 25]
 [ 6 74]]


In [None]:
# Feature Importance
# Pair each column of X with the forest's importance score, highest first.
feature_names = X.columns
importances = model.feature_importances_
importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
important_features = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(important_features)



Feature Importances:
              Feature  Importance
9      Credit_History    0.262914
5     ApplicantIncome    0.202666
7          LoanAmount    0.185022
6   CoapplicantIncome    0.113861
8    Loan_Amount_Term    0.051443
10      Property_Area    0.049602
2          Dependents    0.048141
1             Married    0.023648
3           Education    0.021271
0              Gender    0.020718
4       Self_Employed    0.020713


In [None]:
# Save the trained model and the scaler
import pickle

# Output file name -> fitted object; written in this order
artifacts = {
    'loan_approval_model.pkl': model,
    'scaler.pkl': scaler,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("\nModel and Scaler Saved Successfully.")



Model and Scaler Saved Successfully.


In [None]:
import pickle
import numpy as np

# Load the trained model.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artifacts that this notebook wrote itself.
with open('loan_approval_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Load the scaler
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Category -> integer code tables. They must reproduce the codes LabelEncoder
# assigned during training, which numbers the string classes in sorted order.
# For Education that means 'Graduate' -> 0 and 'Not Graduate' -> 1 (visible in
# the encoded dataset preview); the reverse assignment would feed the model
# the opposite education level.
gender_mapping = {'Female': 0, 'Male': 1}
married_mapping = {'No': 0, 'Yes': 1}
education_mapping = {'Graduate': 0, 'Not Graduate': 1}
self_employed_mapping = {'No': 0, 'Yes': 1}
property_area_mapping = {'Rural': 0, 'Semiurban': 1, 'Urban': 2}
dependents_mapping = {'0': 0, '1': 1, '2': 2, '3+': 3}

# Collect user inputs
print("Enter the following details to predict loan approval status:")

# Categorical answers are stripped so stray spaces do not cause a KeyError in
# the mapping lookups below; the spelling and case must still match the prompt.
gender = input("Gender (Male/Female): ").strip()
married = input("Married (Yes/No): ").strip()
dependents = input("Number of Dependents (0/1/2/3+): ").strip()
education = input("Education (Graduate/Not Graduate): ").strip()
self_employed = input("Self Employed (Yes/No): ").strip()
applicant_income = float(input("Applicant Income: "))
coapplicant_income = float(input("Coapplicant Income: "))
loan_amount = float(input("Loan Amount: "))
# NOTE(review): the training rows show Loan_Amount_Term values such as 360,
# which looks like months rather than days - confirm the unit and adjust the
# prompt if so. Values must be entered in the same unit as the training data.
loan_amount_term = float(input("Loan Amount Term (in days): "))
credit_history = int(input("Credit History (1: Yes, 0: No): "))
property_area = input("Property Area (Urban/Semiurban/Rural): ").strip()

# Encode and preprocess the input.
# The order must match the column order of the training feature matrix:
# Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome,
# CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, Property_Area.
input_data = [
    gender_mapping[gender],
    married_mapping[married],
    dependents_mapping[dependents],
    education_mapping[education],
    self_employed_mapping[self_employed],
    applicant_income,
    coapplicant_income,
    loan_amount,
    loan_amount_term,
    credit_history,
    property_area_mapping[property_area],
]
# Convert to a single-row 2-D array and apply the training-time scaling
input_data = np.array(input_data).reshape(1, -1)
scaled_input_data = scaler.transform(input_data)

# Predict using the model (the prediction cell treats class 1 as approved)
prediction = model.predict(scaled_input_data)
output = "Approved" if prediction[0] == 1 else "Rejected"

# Display the result
print("\nLoan Approval Status Prediction: ", output)


Enter the following details to predict loan approval status:
Gender (Male/Female): Male
Married (Yes/No): Yes
Number of Dependents (0/1/2/3+): 0
Education (Graduate/Not Graduate): Graduate
Self Employed (Yes/No): No
Applicant Income: 1000000
Coapplicant Income: 100000
Loan Amount: 1000000
Loan Amount Term (in days): 365
Credit History (1: Yes, 0: No): 1
Property Area (Urban/Semiurban/Rural): Urban

Loan Approval Status Prediction:  Rejected


