# Import All Required Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# Load Loan Dataset 

In [2]:
data = pd.read_csv('loan_data.csv')

# Exploring Data

## 1. Display Top 5 Rows of The Dataset

In [3]:
data.head()

## 2. Display Last 5 Rows of The Dataset

In [4]:
data.tail()

## 3. Find Shape of Our Dataset (Number of Rows And Number of Columns)

In [5]:
data.shape

In [6]:
print("Number of Rows",data.shape[0])
print("Number of Columns",data.shape[1])

## 4. Get Information About Our Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [7]:
data.info()

## 5. Check Null Values In The Dataset

In [8]:
data.isnull().sum()

# Sum of all the Null Values in the dataset

In [9]:
data.isnull().sum().sum()

## 6. Calculate the percentage of missing values in each column of the DataFrame

In [10]:
data.isnull().sum()*100 / len(data)

# Data Pre-processing

#### Drop the 'Loan_ID' column as it is not needed for the analysis or model training


In [11]:
# Remove the identifier column. errors='ignore' makes this cell safe to re-run
# after 'Loan_ID' has already been dropped from `data` (a plain drop would
# raise KeyError on the second execution).
data = data.drop(columns='Loan_ID', errors='ignore')

#### Display the first five rows of the updated DataFrame to verify the change


In [12]:
data.head(5)

# 1. Handle Missing Values

#### Specify columns that must not contain missing values for further analysis


In [13]:
# Fields used as the `subset` for the row-wise dropna further down.
columns = [
    'Gender',
    'Dependents',
    'LoanAmount',
    'Loan_Amount_Term',
]

#### Heatmap to check the relation between features

In [14]:
# Correlation is only defined for numeric columns. At this point `data` still
# contains string-valued categoricals (they are mapped to integers later in the
# notebook), so restrict the frame to numeric dtypes before calling corr();
# calling corr() on the mixed frame raises on recent pandas versions.
plt.figure(figsize=(12, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), cbar=False, cmap='coolwarm',
            linewidths=0.5, linecolor='white', annot=True)
plt.title('Heatmap ', fontsize=18)
plt.show()

#### Drop rows where any of the specified columns have missing values


In [15]:
data = data.dropna(subset=columns)

#### Calculate the percentage of missing values in each column after dropping rows with missing values in specified columns


In [16]:
data.isnull().sum()*100 / len(data)

#### Find the most common value (mode) in the 'Self_Employed' column


In [17]:
data['Self_Employed'].mode()[0]

#### Fill missing values in the 'Self_Employed' column with the most common value


In [18]:
# Impute missing 'Self_Employed' entries with the first (most frequent) mode of the column.
data['Self_Employed'] = data['Self_Employed'].fillna(
    value=data['Self_Employed'].mode().iloc[0]
)

#### Calculate the percentage of missing values in each column after filling 'Self_Employed' missing values


In [19]:
data.isnull().sum()*100 / len(data)

#### Display unique values in the 'Gender' column to check data consistency


In [20]:
data['Gender'].unique()

#### Display unique values in the 'Self_Employed' column to check data consistency after filling missing values


In [21]:
data['Self_Employed'].unique()

#### Find the most common value (mode) in the 'Credit_History' column


In [22]:
data['Credit_History'].mode()[0]

#### Fill missing values in the 'Credit_History' column with the most common value


In [23]:
# Impute missing 'Credit_History' entries with the first (most frequent) mode of the column.
data['Credit_History'] = data['Credit_History'].fillna(
    value=data['Credit_History'].mode().iloc[0]
)

#### Calculate the percentage of missing values in each column after filling 'Credit_History' missing values


In [24]:
data.isnull().sum()*100 / len(data)

# 7. Handling Categorical Columns

#### Display a random sample of 5 rows from the DataFrame to get an idea of the data distribution and categories


In [25]:
data.sample(5)


#### Count Plot for Categorical Features Before Encoding

In [26]:
# One count plot per categorical feature, laid out row by row on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
for ax, feature in zip(
    axes.flat,
    ('Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'),
):
    sns.countplot(ax=ax, x=feature, data=data)
plt.tight_layout()
plt.show()

#### Replace the value '3+' in the 'Dependents' column with '4' to standardize the categories for easier processing


In [27]:
# Collapse the open-ended '3+' bucket to '4', then cast to int so the column is
# numeric like the other encoded features instead of remaining a column of
# strings. Rows with a missing 'Dependents' value were already dropped above,
# so the cast cannot hit NaN. Safe to re-run: on an already-integer column the
# replace is a no-op.
data['Dependents'] = data['Dependents'].replace(to_replace="3+", value='4').astype('int')

#### Verify the unique values in the 'Dependents' column after replacement


In [28]:
data['Dependents'].unique()

#### Verify the unique values in the 'Loan_Status' column


In [29]:
data['Loan_Status'].unique()

#### Convert the categorical columns ('Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status') to numerical values

In [30]:
# Integer codes for every categorical column, applied in the order listed.
category_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    # Note the ordering: Semiurban is coded 2 and Urban is coded 1.
    'Property_Area': {'Rural': 0, 'Semiurban': 2, 'Urban': 1},
    'Loan_Status': {'Y': 1, 'N': 0},
}
for column_name, code_map in category_codes.items():
    # map() yields NaN for any value missing from code_map, and the int cast
    # then fails, so an unexpected category surfaces here rather than later.
    data[column_name] = data[column_name].map(code_map).astype('int')

#### Display the first few rows of the updated DataFrame to verify the changes


In [31]:
data.head()

# 8. Store Feature Matrix In X And Response (Target) In Vector y


#### 'Loan_Status' is the target variable we want to predict, so it's separated from the feature set


In [32]:
X = data.drop('Loan_Status',axis=1)

#### Store the target vector in y by selecting the 'Loan_Status' column from the DataFrame. 'Loan_Status' contains the labels indicating whether a loan was approved (1) or not (0)



In [33]:
y = data['Loan_Status']

#### Display the target vector y to verify it contains the correct values


In [34]:
y

# 9. Feature Scaling

#### Display the first few rows of the DataFrame to understand the initial state of the data


In [35]:
data.head()

#### List of columns to be standardized. These columns contain numerical features with different scales that need to be normalized

In [36]:
# Continuous features that get passed through the StandardScaler.
cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

#### Fit the StandardScaler to the specified columns and transform them. This standardizes the columns to have a mean of 0 and a standard deviation of 1

In [37]:
# Learn per-column mean/std on the selected columns of X, then overwrite those
# columns with their standardized values.
# NOTE(review): the scaler is fitted on the full X before any train/test split,
# so statistics from rows later used for testing are part of the fit.
st = StandardScaler()
st.fit(X[cols])
X[cols] = st.transform(X[cols])

#### Display the feature matrix X after scaling to verify the transformation


In [38]:
X

#### Distribution of Numerical Features Before and After Scaling

In [39]:
# Top row: raw values taken from `data`; bottom row: the same feature as it
# appears in the scaled feature matrix `X`. One grid column per feature.
fig, axes = plt.subplots(2, 4, figsize=(70, 40))
for col_idx, feature in enumerate(
    ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term')
):
    before_ax = axes[0, col_idx]
    after_ax = axes[1, col_idx]
    sns.histplot(ax=before_ax, x=feature, data=data, kde=True)
    sns.histplot(ax=after_ax, x=X[feature], kde=True)
    before_ax.set_title(f'{feature} Before Scaling')
    after_ax.set_title(f'{feature} After Scaling')
plt.tight_layout()
plt.show()

# 10. Splitting The Dataset Into The Training Set And Test Set & Applying K-Fold Cross Validation

#### This function takes a model and the feature matrix (X) and target vector (y) as inputs


In [40]:
# Maps each evaluated estimator object to its mean cross-validation score (in percent).
model_df={}
def model_val(model,X,y,test_size=0.20,random_state=42,cv=5):
    """Evaluate ``model`` on a hold-out split and with cross-validation.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is also handed to
        ``cross_val_score``. It is fitted in place on the training split.
    X : feature matrix
    y : target vector
    test_size : float, default 0.20
        Fraction of rows held out for the accuracy check.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    cv : int, default 5
        ``cv`` argument forwarded to ``cross_val_score``.

    Returns
    -------
    None
        Results are printed, and the mean cross-validation score (as a
        percentage rounded to 2 decimals) is stored in the module-level
        ``model_df`` dict under the ``model`` object itself as key.
    """
    # Hold-out evaluation: fit on the training part, score on the held-out part.
    X_train,X_test,y_train,y_test=train_test_split(X,y,
                                                   test_size=test_size,
                                                   random_state=random_state)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(f"{model} accuracy is {accuracy_score(y_test,y_pred)}")
    # Cross-validation runs on the full X, y (not only the training split);
    # the mean is computed once and reused for printing and storage.
    mean_score = np.mean(cross_val_score(model,X,y,cv=cv))
    print(f"{model} Avg cross val score is {mean_score}")
    model_df[model]=round(mean_score*100,2)


# 11. Applying different Machine Learning models

## 1. Logistic Regression

In [41]:
model = LogisticRegression()
model_val(model,X,y)

In [42]:
model_df

# 2. SVC

In [43]:
model = svm.SVC()
model_val(model,X,y)

# 3. Decision Tree Classifier

In [44]:
# Seed the tree so the scores printed and recorded by model_val are
# reproducible from one run of the notebook to the next.
model = DecisionTreeClassifier(random_state=42)
model_val(model,X,y)

# 4. Random Forest Classifier

In [45]:
# Seed the forest so the scores printed and recorded by model_val are
# reproducible from one run of the notebook to the next.
model = RandomForestClassifier(random_state=42)
model_val(model,X,y)

# 5. Gradient Boosting Classifier

In [46]:
# Seed the booster so the scores printed and recorded by model_val are
# reproducible from one run of the notebook to the next.
model = GradientBoostingClassifier(random_state=42)
model_val(model,X,y)

# 13. Hyperparameter Tuning


# 1. Logistic Regression (hyperparameter tuning)


In [47]:
# Search space for logistic regression: 20 log-spaced values of C between
# 1e-4 and 1e4, all with the liblinear solver.
log_reg_grid = dict(
    C=np.logspace(-4, 4, num=20),
    solver=['liblinear'],
)

In [48]:
rs_log_reg=RandomizedSearchCV(LogisticRegression(),
                   param_distributions=log_reg_grid,
                  n_iter=20,cv=5,verbose=True)

In [49]:
rs_log_reg.fit(X,y)

In [50]:
rs_log_reg.best_score_


In [51]:
rs_log_reg.best_params_

# 2. SVC (hyperparameter tuning)

In [52]:
# Search space for the SVC: four values of C, linear kernel only.
svc_grid = dict(
    C=[0.25, 0.50, 0.75, 1],
    kernel=["linear"],
)

In [53]:
# svc_grid holds only len(C) * len(kernel) candidate combinations, so asking
# for 20 iterations exceeds the search space. Size n_iter from the grid itself
# so every combination is tried exactly once.
rs_svc=RandomizedSearchCV(svm.SVC(),
                  param_distributions=svc_grid,
                   cv=5,
                   n_iter=len(svc_grid['C'])*len(svc_grid['kernel']),
                  verbose=True)

In [54]:
rs_svc.fit(X,y)

In [55]:
rs_svc.best_score_

In [56]:
rs_svc.best_params_

# 3. Random Forest Classifier (hyperparameter tuning)

In [57]:
RandomForestClassifier()


In [58]:
# Search space for the random forest; n_estimators spans 10, 20, ..., 990.
rf_grid = dict(
    n_estimators=np.arange(10, 1000, 10),
    max_features=['sqrt'],
    max_depth=[None, 3, 5, 10, 20, 30],
    min_samples_split=[2, 5, 20, 50, 100],
    min_samples_leaf=[1, 2, 5, 10],
)

In [59]:
# Both the sampled parameter candidates and the forest itself are random.
# Seed each so best_score_ / best_params_ are the same on every run.
rs_rf=RandomizedSearchCV(RandomForestClassifier(random_state=42),
                  param_distributions=rf_grid,
                   cv=5,
                   n_iter=20,
                   random_state=42,
                  verbose=True)

In [60]:
rs_rf.fit(X,y)


In [61]:
rs_rf.best_score_


In [62]:
rs_rf.best_params_


LogisticRegression score
* Before Hyperparameter Tuning: 80.48
* After Hyperparameter Tuning: 80.48
    
------------------------------------------------------
SVC score
* Before Hyperparameter Tuning: 79.38
* After Hyperparameter Tuning: 80.66
    
--------------------------------------------------------
RandomForestClassifier score
* Before Hyperparameter Tuning: 77.76
* After Hyperparameter Tuning: 80.66

# 14. Save The Model

#### Saving the best Random Forest Classifier model


In [63]:
# Rebuild features and target straight from `data`. This replaces the earlier
# X, so the StandardScaler transformation applied to that X is not present here.
y = data['Loan_Status']
X = data.drop(columns='Loan_Status')

In [64]:
# Final forest with fixed hyperparameters. random_state is pinned so the model
# that gets persisted is identical every time the notebook is re-run.
rf = RandomForestClassifier(n_estimators=270,
 min_samples_split=5,
 min_samples_leaf=5,
 max_features='sqrt',
 max_depth=5,
 random_state=42)

In [65]:
rf.fit(X,y)


In [75]:
joblib.dump(rf,'loan_status_predict.pkl')


In [67]:
# Load the file written by joblib.dump above. The path must include the
# '.pkl' extension used when saving, otherwise the file is not found.
model = joblib.load('loan_status_predict.pkl')


#### Creating a sample DataFrame to test the saved model


In [68]:
# Single hand-written applicant (one row, index 0) used to smoke-test the
# reloaded model; the columns follow the order given here.
df = pd.DataFrame(
    dict(
        Gender=1,
        Married=1,
        Dependents=2,
        Education=0,
        Self_Employed=0,
        ApplicantIncome=2889,
        CoapplicantIncome=0.0,
        LoanAmount=45,
        Loan_Amount_Term=180,
        Credit_History=0,
        Property_Area=1,
    ),
    index=[0],
)

In [69]:
df


In [70]:
result = model.predict(df)


In [71]:
# `result` is the array returned by model.predict for the one-row sample.
# Read that single prediction explicitly rather than relying on the truth
# value of an array comparison, which is only defined for one element.
if result[0]==1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

# 15. Model Evaluation on test data

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `model` was fitted on all of X, which includes the rows in X_test, so scoring
# it directly would measure fit on already-seen data. Build an unfitted
# estimator of the same type with the same hyperparameters and train it on the
# training split only, so the metrics computed from y_pred reflect unseen rows.
eval_model = type(model)(**model.get_params())
eval_model.fit(X_train, y_train)

y_pred = eval_model.predict(X_test)
print(y_pred)

#### Evaluate the model's performance


In [73]:

# Score y_pred against y_test: compute every metric first, then report them
# in the order accuracy, precision, recall, F1, confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

In [74]:
# Draw the confusion matrix as an annotated heatmap with integer cell labels.
plt.figure(figsize=(16, 10))
cm_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [77]:

import sklearn
import numpy
print("scikit-learn version:", sklearn.__version__)
print("NumPy version:", numpy.__version__)

scikit-learn version: 1.2.2
NumPy version: 1.26.4
