# Basic ML Model Deployment

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

## Fetch Data

In [2]:
# Alternative remote source, kept for reference:
#data=pd.read_csv('https://raw.githubusercontent.com/tkseneee/Dataset/master/Loan_data_ver2.csv')
# Load the local CSV with explicit dtypes: the two categorical columns are read as
# strings and the three numeric columns as pandas' nullable 'Int64'.
data = pd.read_csv(
    'loan_dataset.csv',
    dtype=dict(
        Married=str,
        Education=str,
        ApplicantIncome='Int64',
        LoanAmount='Int64',
        Credit_History='Int64',
    ),
)


## Explore Data

In [3]:
# (rows, columns) of the loaded frame
data.shape

(614, 6)

In [4]:
# Column names available in the loaded frame
data.columns

Index(['Married', 'Education', 'ApplicantIncome', 'LoanAmount',
       'Credit_History', 'Loan_Status'],
      dtype='object')

In [5]:
# Check that the dtypes requested in read_csv were applied
data.dtypes

Married             object
Education           object
ApplicantIncome      Int64
LoanAmount           Int64
Credit_History       Int64
Loan_Status        float64
dtype: object

In [6]:
# Peek at the first two rows
data.head(2)

Unnamed: 0,Married,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
0,No,Graduate,5849,,1,0.1
1,Yes,Graduate,4583,128.0,1,0.32


In [7]:
# Count the missing entries in each column
data.isna().sum()

Married             3
Education           0
ApplicantIncome     0
LoanAmount         22
Credit_History     50
Loan_Status         0
dtype: int64

Three features (Married, LoanAmount, and Credit_History) have missing values.

In [8]:
# Frequency of each 'Married' category.
# value_counts() leaves out missing entries by default, so the counts cover only the non-null rows.
data['Married'].value_counts()


Married
Yes    398
No     213
Name: count, dtype: int64

In [9]:
# Frequency of each 'Education' category
data['Education'].value_counts()

Education
Graduate        449
Not Graduate    127
HSC              38
Name: count, dtype: int64

In [10]:
# Separate the target column from the feature columns
y = data['Loan_Status']
X = data.drop(columns='Loan_Status')

In [11]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=48,
)

In [12]:
# Names of the numeric feature columns
feat_num = X.select_dtypes(include=np.number).columns.tolist()
feat_num

['ApplicantIncome', 'LoanAmount', 'Credit_History']

In [13]:
# Names of the non-numeric (categorical) feature columns
feat_cat = X.select_dtypes(exclude=np.number).columns.tolist()
feat_cat

['Married', 'Education']

## Defining Data processing & Modeling  Pipeline

In [14]:
# Numeric branch: fill missing values with the column mean, then standardise
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scale', StandardScaler()),
    ]
)

num_pipe

In [15]:
# Categorical branch: replace missing entries with an explicit 'Missing' category,
# then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category it
# did not see during fit instead of raising. Without it, a rare level that ends up
# only in the validation split (or arrives when the saved pipeline is used for
# inference) would make predict() fail.
feat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

feat_pipe

In [16]:
# Combine both preprocessing branches: each column list goes through its own sub-pipeline.
# remainder='passthrough' forwards any column not named in either list unchanged.
data_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe, feat_num),
        ('categorical', feat_pipe, feat_cat),
    ],
    remainder='passthrough',
)

data_pipeline

In [17]:
# Chain preprocessing and the estimator so that fit/predict accept the raw feature frame.
# random_state pins the forest's randomness (same seed value as the train/validation
# split) so that re-running the notebook reproduces the same fitted model and predictions.
full_pipe = Pipeline([
    ('pre_process', data_pipeline),
    ('model', RandomForestRegressor(random_state=48)),
])

In [18]:
# Training: fits the preprocessing steps and the model on the training split only
full_pipe.fit(X_train,y_train)

In [19]:
# Prediction on the held-out split; the raw X_test is passed because the pipeline
# applies its own preprocessing before the model
full_pipe.predict(X_test)

array([0.1048, 0.1924, 0.4334, 0.4839, 0.1844, 0.0932, 0.9601, 0.2536,
       0.1885, 0.2821, 0.2043, 0.9771, 0.2887, 0.1213, 0.0516, 0.1339,
       0.1178, 0.0403, 0.0572, 0.1701, 0.3159, 0.073 , 0.3255, 0.5032,
       0.5187, 0.1441, 0.2715, 0.2135, 0.2643, 0.0115, 0.0977, 0.2203,
       0.4042, 0.9502, 0.1232, 0.2737, 0.2216, 0.601 , 0.9503, 0.2144,
       0.315 , 0.1424, 0.2753, 0.3378, 0.3319, 0.4027, 0.1329, 0.1528,
       0.261 , 0.2103, 0.1151, 0.3537, 0.0685, 0.1043, 0.9641, 0.0493,
       0.4514, 0.2854, 0.2517, 0.2031, 0.3339, 0.7568, 0.2789, 0.1516,
       0.4638, 0.2468, 0.0809, 0.2714, 0.1621, 0.2415, 0.3867, 0.0891,
       0.2614, 0.3686, 0.0803, 0.8655, 0.0868, 0.366 , 0.6487, 0.0608,
       0.2693, 0.1006, 0.3251, 0.7602, 0.4464, 0.1346, 0.0834, 0.1215,
       0.0549, 0.5115, 0.0599, 0.1808, 0.2413, 0.5095, 0.4874, 0.1649,
       0.9723, 0.1101, 0.0789, 0.3121, 0.4872, 0.0839, 0.0055, 0.0338,
       0.1022, 0.1225, 0.3375, 0.2194, 0.0219, 0.1641, 0.1567, 0.1077,
      

In [20]:
## Optionally, the numeric and categorical feature-name lists can also be stored as pickle files
# pickle.dump(feat_num,open('feat_numv1','wb'))
# pickle.dump(feat_cat,open('feat_catv1','wb'))

 

## Store the fitted pipeline as a pickle file (via joblib)

In [21]:
# Persist the whole fitted pipeline (preprocessing + model) to disk.
# joblib.dump returns the list of file names it wrote, which is what the cell displays.
# NOTE: joblib files are pickle-based; only load them from sources you trust.
#pickle.dump(full_pipe,open('full_pipeline.pkl','wb'))
joblib.dump(full_pipe, 'full_pipeline.pkl')

['full_pipeline.pkl']