In [1]:
import pandas as pd                         #importing pandas
import numpy as np                          #importing numpy
import seaborn as sns                       #data plotting 
import matplotlib.pyplot                    #importing for plotting 
from sklearn.preprocessing import LabelEncoder      #importing to convert object type data to numerical
from sklearn.model_selection    import train_test_split         #for data splitting 
from sklearn import svm                     #importing support vector machine 
from sklearn.metrics import accuracy_score  #to check model accuracy score 

In [2]:
df=pd.read_csv("C:/Users/Lenovo/Downloads/loan_pred_data.csv")

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
#statistical measures
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [7]:
#printing all columns
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [8]:
#checking missing valus in each column
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [14]:
#checking data information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 43.2+ KB


In [15]:
#replacing missing values for object types data we are replacing by mode, else replacing by mean

for col in df.columns:
    if df[col].dtypes == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)
        
#checking data types
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [17]:
df['Loan_Status']

0      Y
1      N
2      Y
3      Y
4      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 614, dtype: object

In [18]:
#Label encoding with replace method 

df.replace({"Loan_Status" : {'N' :0, 'Y': 1}}, inplace=True)

df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [19]:
#Dependent Columns values

df['Dependents'].value_counts()

0     360
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [20]:
#Replacing 3+ value with 4 (as our model will give error )

df.replace({'Dependents': {'3+' : 4}}, inplace=True)

df['Dependents'].value_counts()

0    360
1    102
2    101
4     51
Name: Dependents, dtype: int64

In [30]:
#changing Dependent column data type to int
df['Dependents'] = df['Dependents'].astype('int')

In [32]:
#using label econder to convert columns to numerical 



# Initialize a label encoder
label_encoder = LabelEncoder()


categorical_columns = ['Loan_ID', 'Gender', 'Married', 'Education',  'Self_Employed', 'Property_Area']

for col in categorical_columns:
    df[col] =  label_encoder.fit_transform(df[col])


#checking datatypes
df.dtypes

Loan_ID                int32
Gender                 int32
Married                int32
Dependents             int32
Education              int32
Self_Employed          int32
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int32
Loan_Status            int64
dtype: object

In [34]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


# Modelling and Features Engineering

In [36]:
#Creating two variable X = Features columns and 

X = df.drop(columns=['Loan_ID', 'Loan_Status'], axis=1)           #droppping Loan_ID is its not useful, and Loan_Status

Y = df['Loan_Status']                         #Target

print(X)

     Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0         1        0           0          0              0             5849   
1         1        1           1          0              0             4583   
2         1        1           0          0              1             3000   
3         1        1           0          1              0             2583   
4         1        0           0          0              0             6000   
..      ...      ...         ...        ...            ...              ...   
609       0        0           0          0              0             2900   
610       1        1           4          0              0             4106   
611       1        1           1          0              0             8072   
612       1        1           2          0              0             7583   
613       0        0           0          0              1             4583   

     CoapplicantIncome  LoanAmount  Loan_Amount_Ter

In [37]:
print(Y)

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64


In [38]:
#Splitting data to start the model
#Creating four variable to split the data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, stratify=Y)           #testsize = 10, stratified labe to me almost equally   

In [39]:
#printing the shape 

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(552, 11) (62, 11) (552,) (62,)


# Training Modelling



Support Vector Machine Model

In [40]:
classifier = svm.SVC(kernel='linear')

In [41]:
#Trainging the support vector machine model

classifier.fit(X_train, Y_train)

SVC(kernel='linear')

# Model Evaluation

In [42]:
#accuracy on training data

pred_on_training_data = classifier.predict(X_train)

accuracy_on_training_data = accuracy_score(Y_train, pred_on_training_data)

accuracy_on_training_data

0.7753623188405797

In [None]:
print("Accuray on Training Data {:.2%}".format(accuracy_on_training_data ))

In [43]:
#Accuracy on Test Data

pred_on_test_data = classifier.predict(X_test)

accuracy_on_test_data = accuracy_score(Y_test, pred_on_test_data)


print("Accuray on Test/Unseen Data {:.2}".format(accuracy_on_test_data ))

Accuray on Test/Unseen Data 0.77


In [44]:
X_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
419,1,1,0,0,0,3246,1417.0,138.0,360.0,1.0,1
405,0,0,0,0,0,4547,0.0,115.0,360.0,1.0,1
240,1,1,2,0,0,5819,5000.0,120.0,360.0,1.0,0
337,1,1,2,0,1,2500,4600.0,176.0,360.0,1.0,0
329,0,0,0,0,0,2500,0.0,67.0,360.0,1.0,2


In [45]:
print("Accuray on Test/Unseen Data {:.2%}".format(accuracy_on_test_data ))

Accuray on Test/Unseen Data 77.42%
