In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
# Make sure you upload the correct .csv file to Colab
try:
    df = pd.read_csv('loan_approval_dataset.csv') # Make sure the filename matches yours
except FileNotFoundError:
    print("Error: Dataset file not found. Please upload it to your Colab session.")
    exit()

# Display first 5 rows and basic info
print("--- First 5 Rows ---")
print(df.head())
print("\n--- Dataset Info ---")
df.info()

--- First 5 Rows ---
   loan_id   no_of_dependents      education  self_employed   income_annum  \
0        1                  2       Graduate             No        9600000   
1        2                  0   Not Graduate            Yes        4100000   
2        3                  3       Graduate             No        9100000   
3        4                  3       Graduate             No        8200000   
4        5                  5   Not Graduate            Yes        9800000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   
2      29700000          20           506                    7100000   
3      30700000           8           467                   18200000   
4      24200000          20           382                   12400000   

    commercial_assets_value   luxury_assets_value   bank_asset_value  \
0    

In [2]:
# Check for missing values
print("\n--- Missing Values Before Cleaning ---")
print(df.isnull().sum())

# Fill missing values in categorical columns with the mode
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    if col in df.columns:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Fill missing values in numerical columns with the median
for col in ['LoanAmount', 'Loan_Amount_Term']:
     if col in df.columns:
        df[col].fillna(df[col].median(), inplace=True)

print("\n--- Missing Values After Cleaning ---")
print(df.isnull().sum())


--- Missing Values Before Cleaning ---
loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

--- Missing Values After Cleaning ---
loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64


In [3]:
# Drop the Loan_ID column as it is not needed for prediction
if 'Loan_ID' in df.columns:
    df = df.drop('Loan_ID', axis=1)

# Encode categorical columns into numbers
# We'll use LabelEncoder for this
encoder = LabelEncoder()
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']

for col in categorical_cols:
    if col in df.columns:
        df[col] = encoder.fit_transform(df[col])

print("\n--- Data After Encoding ---")
print(df.head())


--- Data After Encoding ---
   loan_id   no_of_dependents      education  self_employed   income_annum  \
0        1                  2       Graduate             No        9600000   
1        2                  0   Not Graduate            Yes        4100000   
2        3                  3       Graduate             No        9100000   
3        4                  3       Graduate             No        8200000   
4        5                  5   Not Graduate            Yes        9800000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   
2      29700000          20           506                    7100000   
3      30700000           8           467                   18200000   
4      24200000          20           382                   12400000   

    commercial_assets_value   luxury_assets_value   bank_asset_value 

In [5]:
print(df.columns)

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [11]:
# ==============================================================================
#           FINAL, ROBUST CODE FOR STEP 2: Preprocessing
# ==============================================================================
import pandas as pd
from sklearn.preprocessing import LabelEncoder

print("--- Step 2: Preprocessing Data ---")

# --- Part A: Clean Column Names ---
# Removes extra spaces from the start/end of column names
df.columns = df.columns.str.strip()
print("Cleaned column names successfully.")

# --- Part B: Clean Text Data ---
# Removes extra spaces from the start/end of text values inside the columns
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.strip()
print("Cleaned text data values successfully.")

# --- Part C: Handle Missing Values ---
# Fill missing text columns with the most frequent value (mode)
for col in ['gender', 'married', 'dependents', 'self_employed', 'credit_history']:
    if col in df.columns:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Fill missing number columns with the middle value (median)
for col in ['loan_amount', 'loan_amount_term']:
     if col in df.columns:
        df[col].fillna(df[col].median(), inplace=True)
print("Handled missing values successfully.")

# --- Part D: Encode All Text Columns to Numbers ---
# Automatically finds and converts any remaining text columns to numbers
encoder = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = encoder.fit_transform(df[col])
print("Encoded all text columns to numbers successfully.")

# --- Final Check ---
print("\n--- Final Data Types (all should be numbers) ---")
df.info()

--- Step 2: Preprocessing Data ---
Cleaned column names successfully.
Cleaned text data values successfully.
Handled missing values successfully.
Encoded all text columns to numbers successfully.

--- Final Data Types (all should be numbers) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   loan_id                   4269 non-null   int64
 1   no_of_dependents          4269 non-null   int64
 2   education                 4269 non-null   int64
 3   self_employed             4269 non-null   int64
 4   income_annum              4269 non-null   int64
 5   loan_amount               4269 non-null   int64
 6   loan_term                 4269 non-null   int64
 7   cibil_score               4269 non-null   int64
 8   residential_assets_value  4269 non-null   int64
 9   commercial_assets_value   4269 non-null   int64
 10  luxury

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [12]:
# ==============================================================================
#                      STEP 3: TRAIN THE MODEL
# ==============================================================================
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

print("--- Step 3: Defining Features & Splitting Data ---")

# --- Define Features (X) and Target (y) ---
# We use 'loan_status' as the target and drop the unnecessary 'loan_id'
X = df.drop(['loan_status', 'loan_id'], axis=1)
y = df['loan_status']

# --- Split the Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split into training and testing sets successfully.")


# --- Train the Decision Tree Model ---
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

print("\nDecision Tree model trained successfully! ✅")

--- Step 3: Defining Features & Splitting Data ---
Data split into training and testing sets successfully.

Decision Tree model trained successfully! ✅


In [13]:
from sklearn.metrics import classification_report

print("\n--- Step 4: Evaluating the Decision Tree Model ---")

# Use the trained model to make predictions on the test data
y_pred_dt = dt_model.predict(X_test)

# Print the full evaluation report
# Note: For this dataset, 0 means 'Approved' and 1 means 'Rejected'
print("Decision Tree Performance:")
print(classification_report(y_test, y_pred_dt, target_names=['Approved', 'Rejected']))


--- Step 4: Evaluating the Decision Tree Model ---
Decision Tree Performance:
              precision    recall  f1-score   support

    Approved       0.98      0.98      0.98       536
    Rejected       0.97      0.97      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("\n--- Bonus: Training a Logistic Regression Model ---")

# Create and train the model
# max_iter=1000 helps the model find a solution without giving an error
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Make predictions with the new model
y_pred_lr = lr_model.predict(X_test)

# Print the evaluation report for this new model
print("\nLogistic Regression Performance:")
print(classification_report(y_test, y_pred_lr, target_names=['Approved', 'Rejected']))


--- Bonus: Training a Logistic Regression Model ---

Logistic Regression Performance:
              precision    recall  f1-score   support

    Approved       0.80      0.91      0.85       536
    Rejected       0.80      0.61      0.69       318

    accuracy                           0.80       854
   macro avg       0.80      0.76      0.77       854
weighted avg       0.80      0.80      0.79       854



In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("\n--- Final Bonus: Applying SMOTE to Balance Data ---")

# Create the SMOTE object
smote = SMOTE(random_state=42)

# Apply SMOTE ONLY to the training data to create a new, balanced training set
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Shape of training data before SMOTE:", y_train.value_counts())
print("Shape of training data after SMOTE:", y_train_smote.value_counts())

# Train a new Logistic Regression model on the balanced data
lr_model_smote = LogisticRegression(max_iter=1000, random_state=42)
lr_model_smote.fit(X_train_smote, y_train_smote)

# Make predictions on the original, unbalanced test set
y_pred_smote = lr_model_smote.predict(X_test)

# Print the final evaluation report
print("\n--- Performance with SMOTE-balanced Data ---")
print(classification_report(y_test, y_pred_smote, target_names=['Approved', 'Rejected']))


--- Final Bonus: Applying SMOTE to Balance Data ---
Shape of training data before SMOTE: loan_status
0    2120
1    1295
Name: count, dtype: int64
Shape of training data after SMOTE: loan_status
0    2120
1    2120
Name: count, dtype: int64

--- Performance with SMOTE-balanced Data ---
              precision    recall  f1-score   support

    Approved       0.82      0.85      0.83       536
    Rejected       0.72      0.68      0.70       318

    accuracy                           0.78       854
   macro avg       0.77      0.76      0.77       854
weighted avg       0.78      0.78      0.78       854

