In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import category_encoders as ce
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import accuracy_score , confusion_matrix

In [2]:
df = pd.read_csv("Loan_Dataset.xlsx - Sheet1.csv")

In [3]:
df.head()

Unnamed: 0,Gender,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,Male,0.0,Graduate,No,5849,,1.0,Urban,Y
1,Male,1.0,Graduate,No,4583,128.0,1.0,Rural,N
2,Male,0.0,Graduate,Yes,3000,66.0,1.0,Urban,Y
3,Male,0.0,Not Graduate,No,2583,120.0,1.0,Urban,Y
4,Male,0.0,Graduate,No,6000,141.0,1.0,Urban,Y


In [4]:
df.isnull().mean()*100>0

Gender              True
Dependents          True
Education          False
Self_Employed       True
ApplicantIncome    False
LoanAmount          True
Credit_History      True
Property_Area      False
Loan_Status        False
dtype: bool

In [5]:
X = df.iloc[:, :-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [6]:
trf1 = ColumnTransformer([
    ('impute_gender',SimpleImputer(strategy='most_frequent'),[0]),
    ('impute_Dependents',SimpleImputer(strategy='most_frequent'),[1]),
    ('impute_Self_Employed',SimpleImputer(strategy='most_frequent'),[3]),
    ('impute_Credit_History',SimpleImputer(strategy='most_frequent'),[6]),
    ('impute_LoanAmount',SimpleImputer(strategy='mean'),[5])
],remainder='passthrough')

In [7]:
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_gender',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[0,2,5]),
    ('ordinal_property',OrdinalEncoder(categories=[['Rural','Urban','Semiurban']]),[7])
],remainder='passthrough')

In [8]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(9)),
    ('standard',StandardScaler(),slice(8,10))
],remainder='passthrough')

In [9]:
clf = LogisticRegression()

# PIPELINE

In [10]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('clf',clf)
])

In [11]:
pipe.fit(X_train,y_train)

In [12]:
# Predict
y_pred = pipe.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8048780487804879

# CROSS VALIDATION

In [44]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.8086167800453514

# Pikle File

In [45]:
# export 
import pickle
pickle.dump(pipe,open('loan_file.pkl','wb'))

In [48]:
X

Unnamed: 0,Gender,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area
0,Male,0.0,Graduate,No,5849,,1.0,Urban
1,Male,1.0,Graduate,No,4583,128.0,1.0,Rural
2,Male,0.0,Graduate,Yes,3000,66.0,1.0,Urban
3,Male,0.0,Not Graduate,No,2583,120.0,1.0,Urban
4,Male,0.0,Graduate,No,6000,141.0,1.0,Urban
...,...,...,...,...,...,...,...,...
609,Female,0.0,Graduate,No,2900,71.0,1.0,Rural
610,Male,3.0,Graduate,No,4106,40.0,1.0,Rural
611,Male,1.0,Graduate,No,8072,253.0,1.0,Urban
612,Male,2.0,Graduate,No,7583,187.0,1.0,Urban
