In [73]:
# Importing necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations

# Importing modules for model training and evaluation
from sklearn.model_selection import train_test_split  # To split the data into training and testing sets
from sklearn.model_selection import cross_val_score  # To perform cross-validation
from sklearn.model_selection import RandomizedSearchCV  # For hyperparameter tuning using randomized search

# Importing machine learning models
from sklearn.linear_model import LogisticRegression  # Logistic Regression for classification
from sklearn import svm  # Support Vector Machines for classification tasks
from sklearn.tree import DecisionTreeClassifier  # Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # Ensemble models: Random Forest and Gradient Boosting

# For data preprocessing
from sklearn.preprocessing import StandardScaler  # StandardScaler to standardize the dataset

# For evaluating model performance
from sklearn.metrics import accuracy_score  # To compute the accuracy of predictions

# For saving and loading trained models
import joblib  # To save trained machine learning models for future use


In [4]:
# Importing the required module to handle file uploads in Google Colab
from google.colab import files
uploaded = files.upload() # Allows you to upload a file from your local system into the Colab environment


Saving loan_data.csv to loan_data.csv


In [5]:
# Reading the 'loan_data.csv' file into a pandas DataFrame
df = pd.read_csv('loan_data.csv')


In [6]:
# Displaying the first few rows of the dataset
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [7]:
# Checking the shape of the dataset
#df.shape outputs (381, 13), it means the dataset has 381 rows and 13 columns.
df.shape

(381, 13)

In [8]:
# Displaying the last few rows of the dataset
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
376,LP002953,Male,Yes,3+,Graduate,No,5703,0.0,128.0,360.0,1.0,Urban,Y
377,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
378,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
379,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
380,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [9]:
# Displaying a summary of the dataset

df.info()

# Example observations from the output:
# - The dataset has 381 rows and 13 columns.
# - Columns like `Gender`, `Dependents`, `Self_Employed`, `Loan_Amount_Term`, and `Credit_History` have missing values.
# - Most columns are categorical (`object`), while a few are numeric (`int64`, `float64`).
# - `Loan_ID` is likely a unique identifier and may not be useful for modeling.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [10]:
#Handling Missing Value
df.isnull().sum()

# Explanation:
# - 'isnull()' creates a DataFrame of boolean values indicating where NaNs are present.
# - 'sum()' aggregates the count of missing values for each column.

Unnamed: 0,0
Loan_ID,0
Gender,5
Married,0
Dependents,8
Education,0
Self_Employed,21
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,11


In [11]:
# Checking the percentage of missing values for each column
df. isnull().mean()*100

Unnamed: 0,0
Loan_ID,0.0
Gender,1.312336
Married,0.0
Dependents,2.099738
Education,0.0
Self_Employed,5.511811
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,0.0
Loan_Amount_Term,2.887139


In [14]:
# Printing the column names of the DataFrame
print(df.columns)


Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [15]:
# Dropping the 'Loan_ID' column from the DataFrame because it is usually an identifier and doesn’t contribute to the predictive power of the model
df = df.drop(columns=['Loan_ID'])


In [16]:
# Displaying the first few rows of the updated dataset
# It helps to confirm that the 'Loan_ID' column has been successfully dropped.
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [17]:
# Dropping rows where any of the specified columns have missing values
df = df.dropna(subset=['Gender', 'Dependents', 'Loan_Amount_Term'])

In [18]:
# Checking the shape of the updated DataFrame
df.shape

(358, 12)

In [22]:
(358-30-11)

317

In [23]:
# Checking for missing values after data cleaning
df.isnull().sum()

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,20
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,30


In [24]:
# Checking the unique values in the 'Self_Employed' column
df['Self_Employed'].unique()
# In this case, 'Self_Employed' contains 'No', 'Yes', and NaN (missing values), which confirms it’s a categorical feature.

array(['No', 'Yes', nan], dtype=object)

In [26]:
# Finding the most frequent value (mode) in the 'Self_Employed' column
df['Self_Employed'].mode()[0]
#In this case, the mode of 'Self_Employed' is 'No', which is the most common category

'No'

In [27]:
# Checking the unique values in the 'Credit_History' column
df['Credit_History'].unique()
#In this case, 'Credit_History' contains 1 (indicating a positive credit history), 0 (indicating a negative credit history), and NaN (missing values).
# This confirms that 'Credit_History' is a numerical column, where NaN represents missing values.

array([ 1., nan,  0.])

In [29]:
# Finding the most frequent value (mode) in the 'Credit_History' column
df['Credit_History'].mode()[0]
#'[0]' is used to access the first mode value since 'mode()' returns a Series.
# - In this case, the mode of 'Credit_History' is `1.0`,  you can fill the missing values with 1.0 (indicating a positive credit history)

1.0

In [31]:
# Filling missing values in the 'Self_Employed' column with the mode value ('No')
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
# Filling missing values in the 'Credit_History' column with the mode value (1.0)
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])


In [32]:
# Checking for missing values after filling the missing data
df.isnull().sum()

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0


In [33]:
# Checking the information of the DataFrame after filling missing values
df.info()
#As per the output, all columns now have 358 non-null entries, indicating that missing values have been handled successfully.

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             358 non-null    object 
 1   Married            358 non-null    object 
 2   Dependents         358 non-null    object 
 3   Education          358 non-null    object 
 4   Self_Employed      358 non-null    object 
 5   ApplicantIncome    358 non-null    int64  
 6   CoapplicantIncome  358 non-null    float64
 7   LoanAmount         358 non-null    float64
 8   Loan_Amount_Term   358 non-null    float64
 9   Credit_History     358 non-null    float64
 10  Property_Area      358 non-null    object 
 11  Loan_Status        358 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 36.4+ KB


In [34]:
# Checking the unique values in the 'Gender' column
df['Gender'].unique()
# - In this case, the 'Gender' column contains two unique values: 'Male' and 'Female'.
# - This confirms that 'Gender' is a binary categorical feature, which might need encoding .

array(['Male', 'Female'], dtype=object)

In [35]:
# Checking the unique values in the 'Dependents' column
df['Dependents'].unique()
# - In this case, 'Dependents' contains values '1', '0', '2', and '3+', where '3+' indicates 3 or more dependents.
# - This confirms that 'Dependents' is a categorical variable that needs to be encoded or transformed before using it in a machine learning model.

array(['1', '0', '2', '3+'], dtype=object)

In [37]:
# Replacing '3+' with '4' in the 'Dependents' column
df['Dependents'] = df['Dependents'].replace('3+', '4')
# - This helps to standardize the values in the 'Dependents' column, making it easier to work with during modeling.


In [38]:
# Checking the unique values in the 'Dependents' column after replacing '3+' with '4'
df['Dependents'].unique()

array(['1', '0', '2', '4'], dtype=object)

In [39]:
# Checking the unique values in the 'Married' column
df['Married'].unique()
# - In this case, the 'Married' column contains two unique values: 'Yes' and 'No'.
# - This confirms that 'Married' is a binary categorical feature, which may need to be encoded into numerical values for machine learning models.

array(['Yes', 'No'], dtype=object)

In [41]:
# Checking the information of the DataFrame after preprocessing
df.info()

# Explanation:
# - 'info()' provides a summary of the DataFrame, including the number of non-null values in each column and their data types.
# - It confirms that all columns now have 358 non-null entries, indicating that missing values have been successfully handled.
# - The columns with numerical data types (e.g., int64, float64) and categorical data types (e.g., object) are listed.
# - The dataset is now clean and ready for encoding and further model building.

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             358 non-null    object 
 1   Married            358 non-null    object 
 2   Dependents         358 non-null    object 
 3   Education          358 non-null    object 
 4   Self_Employed      358 non-null    object 
 5   ApplicantIncome    358 non-null    int64  
 6   CoapplicantIncome  358 non-null    float64
 7   LoanAmount         358 non-null    float64
 8   Loan_Amount_Term   358 non-null    float64
 9   Credit_History     358 non-null    float64
 10  Property_Area      358 non-null    object 
 11  Loan_Status        358 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 36.4+ KB


In [42]:
# Defining a dictionary to encode categorical variables
encoding = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '4': 4},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Rural': 0, 'Semiurban': 2, 'Urban': 1},
    'Loan_Status': {'Y': 1, 'N': 0}
}

# Explanation:
# - The 'encoding' dictionary is used to map the categorical variables to numerical values.

In [44]:
# Replacing categorical values with encoded numerical values in the DataFrame
df.replace(encoding, inplace=True)

  df.replace(encoding, inplace=True)


In [45]:
# Checking the DataFrame information after encoding categorical variables
df.info()

# Explanation:
# - After applying the 'replace()' method with the 'encoding' dictionary, all categorical variables are now encoded as integers.
# - The DataFrame now consists of 358 non-null entries and contains columns with numerical data types ('int64' and 'float64').

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             358 non-null    int64  
 1   Married            358 non-null    int64  
 2   Dependents         358 non-null    int64  
 3   Education          358 non-null    int64  
 4   Self_Employed      358 non-null    int64  
 5   ApplicantIncome    358 non-null    int64  
 6   CoapplicantIncome  358 non-null    float64
 7   LoanAmount         358 non-null    float64
 8   Loan_Amount_Term   358 non-null    float64
 9   Credit_History     358 non-null    float64
 10  Property_Area      358 non-null    int64  
 11  Loan_Status        358 non-null    int64  
dtypes: float64(4), int64(8)
memory usage: 36.4 KB


In [49]:
# Separating features (X) and target variable (y)
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

# Explanation:
# - 'X' contains all the columns except the target column 'Loan_Status' which will be used as input features for model training.
# - 'y' contains the 'Loan_Status' column, which is the output label that the model will try to predict.

In [50]:
# Displaying the first few rows of the DataFrame after encoding categorical variables
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
1,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
2,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
3,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
4,1,1,0,0,0,2333,1516.0,95.0,360.0,1.0,1,1


In [51]:
# Standardizing the numerical columns using StandardScaler
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
# StandardScaler is used to scale numerical features so that they have a mean of 0 and a standard deviation of 1.
# This ensures that all numerical features are on the same scale, which can improve the performance of machine learning models.
scaler = StandardScaler()
# Applying the standardization to the numerical columns in X
X[num_cols] = scaler.fit_transform(X[num_cols])

In [52]:
# Displaying the first few rows of the feature set (X) after standardization
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1,1,0,0.71163,0.092069,0.80598,0.285826,1.0,0
1,1,1,0,1,1,-0.398856,-0.539332,-1.350425,0.285826,1.0,1
2,1,1,0,0,0,-0.691384,0.447965,0.527735,0.285826,1.0,1
3,1,0,0,1,0,1.705666,-0.539332,1.25813,0.285826,1.0,1
4,1,1,0,0,0,-0.866761,0.095418,-0.341784,0.285826,1.0,1


In [57]:
def evaluate_model(model):
   # Splitting the dataset into training (80%) and testing (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Training the model using the training data
    model.fit(X_train, y_train)
    # Predicting the target variable on the test data
    y_pred = model.predict(X_test)
    # Calculating the accuracy of the model by comparing the predicted values with the actual values in y_test
    accuracy = accuracy_score(y_test, y_pred)
    # Performing cross-validation (5-fold) to assess the model's performance across different subsets of the data
    cross_val = cross_val_score(model, X, y, cv=5)
    # Calculating the average cross-validation score
    avg_cross_val = np.mean(cross_val)
    # Printing the model name, accuracy, and average cross-validation score
    print(f"{model.__class__.__name__} - Accuracy: {accuracy: .2f} , Cross_Val_Score: {avg_cross_val: .2f}")
     # Returning the average cross-validation score to allow further analysis
    return avg_cross_val

In [74]:
# Defining a set of different machine learning models to evaluate the loan approval prediction task
models = {
    LogisticRegression(),  # Logistic Regression model for binary classification, often used for predicting probabilities of classes.
    svm.SVC(),             # Support Vector Classifier, a powerful classifier that works well for both linear and non-linear data.
    DecisionTreeClassifier(),  # Decision Tree Classifier, a model that splits data based on feature values to make predictions.
    RandomForestClassifier(),  # Random Forest Classifier, an ensemble model of decision trees that improves performance and reduces overfitting.
    GradientBoostingClassifier()  # Gradient Boosting Classifier, another ensemble model that builds trees sequentially to minimize errors.
}


In [58]:
# Create a dictionary comprehension to evaluate each model and store its performance
model_score = {model.__class__.__name__: evaluate_model(model) for model in models}


LogisticRegression - Accuracy:  0.85 , Cross_Val_Score:  0.84
DecisionTreeClassifier - Accuracy:  0.85 , Cross_Val_Score:  0.78
GradientBoostingClassifier - Accuracy:  0.85 , Cross_Val_Score:  0.82
SVC - Accuracy:  0.85 , Cross_Val_Score:  0.83
RandomForestClassifier - Accuracy:  0.86 , Cross_Val_Score:  0.82


In [75]:
# Overview of Model Performance:

# Logistic Regression, SVC, Gradient Boosting, and Decision Tree all achieved around 85% accuracy.
# Random Forest performed slightly better with 86% accuracy.
# Cross-validation scores show Logistic Regression is the most consistent, while Decision Tree had the lowest.
# Random Forest has the highest accuracy, but Logistic Regression offers a good balance of accuracy and consistency.


In [64]:
# Tunes the model by testing different settings to find the best ones.
# Returns the model with the best settings and score.
def tune_model(model, param_grid):
    tuner = RandomizedSearchCV(model, param_grid, cv=5, n_iter =20, verbose = True, random_state = 42)
    tuner.fit(X, y)
    print(f"Best Score for {model.__class__.__name__}: {tuner.best_score_:.2f}")
    print(f"Best Parameter for {model.__class__.__name__}: {tuner.best_params_}")
    return tuner.best_estimator_

In [63]:
# Defines the parameter grids for tuning different models:
# - For Logistic Regression, it tests different regularization values and solver methods.
# - For SVM, it tests different values for the regularization parameter and kernel type.
# - For Random Forest, it explores various tree configurations, like the number of trees and depth.

log_reg_grid = {'C': np.logspace(-4, 4, 20), "solver": ["liblinear"]}
svc_grid = {'C': [0.25,0.50,0.75,1], "kernel": ['linear']}
rf_grid = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': ['log2', 'sqrt'],
    'max_depth': [None, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20, 50,100],
    'min_samples_leaf': [1, 2, 5, 10]
}

In [65]:
tune_model(LogisticRegression(), log_reg_grid)
#The Logistic Regression model was tuned using RandomizedSearchCV, and the best parameters found were C = 1.62 and solver = 'liblinear'. This configuration achieved an accuracy of 84% based on cross-validation.

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for LogisticRegression: 0.84
Best Parameter for LogisticRegression: {'solver': 'liblinear', 'C': 1.623776739188721}


In [66]:
best_svc_reg = tune_model(svm.SVC(), svc_grid)
#The SVC model was tuned with C = 0.25 and kernel = 'linear', achieving 84% accuracy.



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Score for SVC: 0.84
Best Parameter for SVC: {'kernel': 'linear', 'C': 0.25}


In [67]:
best_rf = tune_model(RandomForestClassifier(), rf_grid)
# The Random Forest model achieved 84% accuracy with optimized parameters: n_estimators = 930, min_samples_split = 50, min_samples_leaf = 10, and max_features = 'sqrt'.

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for RandomForestClassifier: 0.84
Best Parameter for RandomForestClassifier: {'n_estimators': 930, 'min_samples_split': 50, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 20}


In [68]:
#The final model is the optimized Random Forest (best_rf)
final_model = best_rf

In [70]:
#The final model has been saved as loan_status_predictor.pkl using joblib
joblib.dump(final_model, 'loan_status_predictor.pkl')


['loan_status_predictor.pkl']

In [71]:
# Prediction System

sample_data = pd.DataFrame({
    'Gender': [1],
    'Married': [1],
    'Dependents': [2],
    'Education': [0],
    'Self_Employed': [0],
    'ApplicantIncome': [1000],
    'CoapplicantIncome': [0.0],
    'LoanAmount': [150],
    'Loan_Amount_Term': [180],
    'Credit_History': [0],
    'Property_Area': [1]
})

sample_data[num_cols] = scaler.transform(sample_data[num_cols])
loaded_model = joblib.load('loan_status_predictor.pkl')
prediction = loaded_model.predict(sample_data)

result = "Loan Approved" if prediction[0] == 1 else "Loan Not Approved"
print(f"\nPrediction Result: {result}")


Prediction Result: Loan Not Approved


In [76]:
#The prediction system has successfully processed the sample data, and based on the model, the loan status is predicted as "Loan Not Approved."

In [72]:
joblib.dump(scaler, 'vector.pkl')

['vector.pkl']

In [77]:
#Scaler saved as vector.pkl for future use.