## Importing the libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## Loading the dataset and preprocessing

In [3]:
# Loading the dataset
df = pd.read_csv("archive/dataset.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# dropping the missing value rows
df1 = df.dropna()
df1.isna().sum() # checking missing values again

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Taking statistical measures

In [7]:
df1.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,480.0,480.0,480.0,480.0,480.0
mean,5364.23125,1581.093583,144.735417,342.05,0.854167
std,5668.251251,2617.692267,80.508164,65.212401,0.353307
min,150.0,0.0,9.0,36.0,0.0
25%,2898.75,0.0,100.0,360.0,1.0
50%,3859.0,1084.5,128.0,360.0,1.0
75%,5852.5,2253.25,170.0,360.0,1.0
max,81000.0,33837.0,600.0,480.0,1.0


### Encodings

loan_status column has "Y" and "N" catogary values. those can be encoded as 1 and 0

In [8]:
df1.replace({'Loan_Status':{'Y':1, 'N':0}}, inplace=True)
df1.head()

  df1.replace({'Loan_Status':{'Y':1, 'N':0}}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.replace({'Loan_Status':{'Y':1, 'N':0}}, inplace=True)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,1


lets look at the catogaries at "Dependent" column

In [9]:
df1['Dependents'].value_counts()

Dependents
0     274
2      85
1      80
3+     41
Name: count, dtype: int64

In [10]:
df1 = df1.replace(to_replace='3+', value=4)
df1['Dependents'].value_counts()

Dependents
0    274
2     85
1     80
4     41
Name: count, dtype: int64

In [11]:
df1.nunique()

Loan_ID              480
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      405
CoapplicantIncome    232
LoanAmount           186
Loan_Amount_Term       9
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [12]:
df1.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status            int64
dtype: object

"Gender", "Married", "Dependants", "Education", "Self_Employed" and "Property_Area" columns should be converted into numerical type since if not, the model will not run correctly. we can add label encoding to "Property_Area" column since there is an intrinsic order in the categories.

In [13]:
# making a copy of dataset
df2 = df1.copy()

le = LabelEncoder()
df2["Property_Area"] = le.fit_transform(df2["Property_Area"])
df2.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,2,1


other catogarical columns should be hot encoded

In [14]:
df2_encoded = pd.get_dummies(df2, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed'])
df2_encoded.head()

Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_4,Dependents_0,Dependents_1,Dependents_2,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes
1,LP001003,4583,1508.0,128.0,360.0,1.0,0,0,False,True,False,True,False,False,True,False,True,False,True,False
2,LP001005,3000,0.0,66.0,360.0,1.0,2,1,False,True,False,True,False,True,False,False,True,False,False,True
3,LP001006,2583,2358.0,120.0,360.0,1.0,2,1,False,True,False,True,False,True,False,False,False,True,True,False
4,LP001008,6000,0.0,141.0,360.0,1.0,2,1,False,True,True,False,False,True,False,False,True,False,True,False
5,LP001011,5417,4196.0,267.0,360.0,1.0,2,1,False,True,False,True,False,False,False,True,True,False,False,True


"Loan_ID" is a catogarical column with 480 different catogaries. since this column encoding not suitable, we will remove that column.

In [15]:
dataframe = df2_encoded.drop(['Loan_ID'], axis=1)
dataframe.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_4,Dependents_0,Dependents_1,Dependents_2,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes
1,4583,1508.0,128.0,360.0,1.0,0,0,False,True,False,True,False,False,True,False,True,False,True,False
2,3000,0.0,66.0,360.0,1.0,2,1,False,True,False,True,False,True,False,False,True,False,False,True
3,2583,2358.0,120.0,360.0,1.0,2,1,False,True,False,True,False,True,False,False,False,True,True,False
4,6000,0.0,141.0,360.0,1.0,2,1,False,True,True,False,False,True,False,False,True,False,True,False
5,5417,4196.0,267.0,360.0,1.0,2,1,False,True,False,True,False,False,False,True,True,False,False,True


## Training and Testing

our target variable is 'Loan_Status'. so we should drop it and define other as features.

In [16]:
X = dataframe.drop(['Loan_Status'], axis=1)
y = dataframe['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=12)

#create a classifier
cls = svm.SVC(kernel="linear")
#train the model
cls.fit(X_train,y_train)

In [18]:
#predict the response
pred = cls.predict(X_test)
#accuracy
print("test acuracy:", accuracy_score(y_test,y_pred=pred))

test acuracy: 0.875


In [19]:
pred_train = cls.predict(X_train)
#accuracy
print("train acuracy:", accuracy_score(y_train,y_pred=pred_train))

train acuracy: 0.7685185185185185


Differance between train accuracy and test accuracy is a samaller value. Thus, we can say the model performs well on the unseen data.

### Creating a predictive system

In [24]:
input_data = [2600,1911.0,116.0,360.0,0.0,1,False,True,False,True,False,True,False,False,False,True,True,False]

# getting the input list as numpy array
numpy_array = np.asarray(input_data)

# reshaping the input data
data_reshaped = numpy_array.reshape(1, -1)

# predicting
predict = cls.predict(data_reshaped)

if (predict[0] == 1):
    print("Eligible for the loan")
else:
    print("Not eligible")

Not eligible




In [21]:
csv_file_path = 'output.csv'
dataframe.to_csv(csv_file_path, index=False)