# AV Loan Prediction

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
os.chdir('D:/Datasets/AV_Loan Prediction')

In [4]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
data = data.iloc[:, 1:]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [8]:
data.fillna(0, inplace=True)

In [9]:
encoders = {}
for col, _type in zip(data.columns, data.dtypes):
    if _type == 'object':
        encoders[col] = LabelEncoder().fit(data[col].astype(str))
        data[col] = encoders[col].transform(data[col].astype(str))

In [10]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,2,1,0,0,1,5849,0.0,0.0,360.0,1.0,2,1
1,2,2,1,0,1,4583,1508.0,128.0,360.0,1.0,0,0
2,2,2,0,0,2,3000,0.0,66.0,360.0,1.0,2,1
3,2,2,0,1,1,2583,2358.0,120.0,360.0,1.0,2,1
4,2,1,0,0,1,6000,0.0,141.0,360.0,1.0,2,1


In [11]:
encoders

{'Gender': LabelEncoder(),
 'Married': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'Education': LabelEncoder(),
 'Self_Employed': LabelEncoder(),
 'Property_Area': LabelEncoder(),
 'Loan_Status': LabelEncoder()}

In [12]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [13]:
scale = StandardScaler()
X = scale.fit_transform(X)
X

array([[ 0.48294277, -1.31528634, -0.73780632, ...,  0.31416232,
         0.54095432,  1.22329839],
       [ 0.48294277,  0.72923471,  0.25346957, ...,  0.31416232,
         0.54095432, -1.31851281],
       [ 0.48294277,  0.72923471, -0.73780632, ...,  0.31416232,
         0.54095432,  1.22329839],
       ...,
       [ 0.48294277,  0.72923471,  0.25346957, ...,  0.31416232,
         0.54095432,  1.22329839],
       [ 0.48294277,  0.72923471,  1.24474546, ...,  0.31416232,
         0.54095432,  1.22329839],
       [-1.66580261, -1.31528634, -0.73780632, ...,  0.31416232,
        -1.84858491, -0.04760721]])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [15]:
test.fillna(0, inplace=True)
test = test.iloc[:, 1:]

In [16]:
for col, _type in zip(test.columns, test.dtypes):
    if _type == 'object':
        test[col] = encoders[col].transform(test[col].astype(str))
test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,2,2,0,0,1,5720,0,110.0,360.0,1.0,2
1,2,2,1,0,1,3076,1500,126.0,360.0,1.0,2
2,2,2,2,0,1,5000,1800,208.0,360.0,1.0,2
3,2,2,2,0,1,2340,2546,100.0,360.0,0.0,2
4,2,1,0,1,1,3276,0,78.0,360.0,1.0,2


In [35]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [36]:
model.score(X_test, y_test)

0.7467532467532467

In [37]:
pca = PCA(0.80)
pca.fit(X)
model2 = LogisticRegression()
model2.fit(pca.transform(X_train), y_train)
model2.score(pca.transform(X_test), y_test)

0.7467532467532467

In [38]:
pca.transform(X_train).shape

(460, 8)

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [41]:
pca = PCA(0.95)
pca.fit(X)
model = DecisionTreeClassifier(max_depth=3)
model.fit(pca.transform(X_train), y_train)
model.score(pca.transform(X_test), y_test)

0.7012987012987013

In [45]:
pca = PCA(0.99)
pca.fit(X)
model = RandomForestClassifier(n_estimators=200, max_depth=4)
model.fit(pca.transform(X_train), y_train)
model.score(pca.transform(X_test), y_test)

0.7142857142857143

In [47]:
pca = PCA(0.95)
pca.fit(X)
model = RandomForestClassifier(n_estimators=200, max_depth=3)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7337662337662337

In [48]:
features = SelectKBest(score_func=chi2, k=11)
fit = features.fit(X, y)

ValueError: Input X must be non-negative.