## Automated Feature Engineering using Pipelines and Column Transformers BY HARSH SHUKLA

In [2]:
## Importing Important Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings as w
w.filterwarnings(action = 'ignore')

In [3]:
## Reading the DataFrame

df = pd.read_csv(r"C:\Users\hs081\Downloads\loan_approval_dataset.csv")

In [4]:
## Displaying top 5 records from the dataset

df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [6]:
## Data Reduction (Dropping Columns that doesn't paly any role in prediction)

df.drop(['loan_id'] , axis = 1 , inplace = True)

In [81]:
## Displaying the Names of Columns

df.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [17]:
## Column Renaming (Required)

df.columns = df.columns.str.strip()

## Applying Automated FE in the DataFrame

In [18]:
## Dividing our dataset into Dependent and Independent Features

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [19]:
## Applying Label Encoder on Output Feature(Loan_Status)

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(y)

## Here 1 denotes -> Rejection
## Here 0 Denotes -> Approval

In [20]:
## Train Test Spliting our data (it is recommended that do train_test_split first then after apply Feature Engineering)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X , y , test_size = 0.3 , random_state = 0)

In [21]:
X_train.shape , X_test.shape

((2988, 11), (1281, 11))

In [23]:
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [25]:
## Libraries Requried for Automated Feature Engineering

from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [26]:
## 1st Categorical Pipeline that handles Nominal categorical variable (handling Null values , and applying OHE on Nominal Varaibles)

categorical_pipeline = Pipeline([
    ('cat_imputer' , SimpleImputer(strategy='most_frequent', add_indicator=False)),
    ('cat_encoder' , OneHotEncoder(sparse = False , dtype = np.int32 , handle_unknown='ignore'))  
])

## 2nd Numerical Pipeline that handles numerical variable (handling Null values , and Standardize the featuers)

Numerical_pipeline = Pipeline([
    ('num_imputer' , SimpleImputer(strategy='median')),
    ('num_scaling' , StandardScaler())
])

In [27]:
## We use ColumnTransformers to Join all the Pipelines Sequently 

preprocessor = ColumnTransformer([
    ('categorical_pipeline' , categorical_pipeline,['education' , 'self_employed']),
    ('Numerical_pipeline' , Numerical_pipeline , ['no_of_dependents','income_annum','loan_amount','loan_term' ,'cibil_score','residential_assets_value', 'commercial_assets_value', 'luxury_assets_value' , 'bank_asset_value']),
])

In [28]:
## To See the Pipeline (how it works Visually)

from sklearn import set_config

set_config(display = 'diagram')

preprocessor

In [29]:
## Transform the Data (train and test) using preprocessor columntransformer 

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [30]:
## ALl FE Completed 

X_train_transformed.shape , X_test_transformed.shape

((2988, 13), (1281, 13))

In [31]:
X_train_transformed

array([[ 0.        ,  1.        ,  0.        , ..., -0.92170604,
        -1.21764827, -1.05013531],
       [ 0.        ,  1.        ,  1.        , ...,  0.23806953,
        -0.7435998 , -0.62128939],
       [ 0.        ,  1.        ,  0.        , ...,  0.92029045,
         1.16361846,  1.40041278],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  1.19317882,
         1.11952093,  0.72651206],
       [ 0.        ,  1.        ,  1.        , ..., -0.58059558,
         0.64547246,  0.48145725],
       [ 0.        ,  1.        ,  0.        , ...,  0.57917999,
         0.09425331, -0.56002569]])

## Model Traning Steps 

(1). Importing All Important Algorithms so , that we can get the best accuracy.

(2). Giving transformed data to all the Algorithms to get the best accuracy.

(3). I apply a for loop for doing step 2nd


In [32]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [33]:
## AlL Important Algorithms used for Model Traning 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xg

In [34]:
## Evaluation Metrices 

from sklearn.metrics import accuracy_score , precision_score , recall_score , classification_report , confusion_matrix

In [35]:
## Function to Evaluate the model with respect to each Algorithm

def evaluate_model(y_test , y_pred):
    cm = confusion_matrix(y_pred , y_test)
    score = accuracy_score(y_pred,y_test)
    cr = classification_report(y_pred, y_test)
    
    return cm , score, cr

In [36]:
## Evaulating Each Model with respect to Transformed Data 

models={
    'LogisticRegression Algorithm':LogisticRegression(),
    'RandomForest Algorithm': RandomForestClassifier(),
    'GradientBoost Algorithm' : GradientBoostingClassifier(),
    'DecisionTree Alogrithm' : DecisionTreeClassifier(),
    'SVC Algorithm' : SVC(),
    'xgboost Algorithm' : xg.XGBClassifier()
}

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train_transformed,y_train)

    #Make Predictions
    y_pred=model.predict(X_test_transformed)

    cm, score, cr = evaluate_model(y_test,y_pred)
    print('='*77)
    print('\n')

    print(list(models.keys())[i])
    print()

    print('Model Testing Performance')
    print()
    print(cm)
    print(score * 100)
    print(cr)

    
    print('='*77)
    print('\n')




LogisticRegression Algorithm

Model Testing Performance

[[730  62]
 [ 45 444]]
91.64715066354411
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       792
           1       0.88      0.91      0.89       489

    accuracy                           0.92      1281
   macro avg       0.91      0.91      0.91      1281
weighted avg       0.92      0.92      0.92      1281





RandomForest Algorithm

Model Testing Performance

[[768  19]
 [  7 487]]
97.97033567525371
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       787
           1       0.96      0.99      0.97       494

    accuracy                           0.98      1281
   macro avg       0.98      0.98      0.98      1281
weighted avg       0.98      0.98      0.98      1281





GradientBoost Algorithm

Model Testing Performance

[[769  19]
 [  6 487]]
98.04839968774395
              precision    recall  f1-score   support

## Observations 

we train our model with many models but the best model that give accuracy of 98.28 is XGBoostClassifier

## Joining Transformation and Prediction Pipeline using Pipeline 

In [38]:
## This Final_pipe pipeline first do All Feature engineering then after all transformation it will do model traning uing XGBoostClassifier algo
xgbc = xg.XGBClassifier()

final_pipe = Pipeline([
    ("Tranformation" , preprocessor),
    ("Prediction" , xgbc)
])

In [39]:
final_pipe.fit(X_train , y_train)

In [40]:
## this is the predicted Data 

y_pred = final_pipe.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 0, 1])

## Lets Do Hyper Parameter Tunning  Using Grid Search CV

In [53]:
# A parameter grid for XGBoost

params = [ 
    {
    'Prediction__max_depth': [3, 5, 7 , 8 , 10],
    'Prediction__learning_rate': [0.1, 0.01, 0.001, 0.05 , 0.2],
    'Prediction__subsample': [0.5, 0.7, 1 , 2]
    }


]

In [57]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(final_pipe , params , cv = 10 , scoring = 'accuracy')
grid.fit(X_train , y_train)

In [58]:
grid.best_params_

{'Prediction__learning_rate': 0.05,
 'Prediction__max_depth': 10,
 'Prediction__subsample': 1}

In [59]:
grid.best_score_

0.98527642477161

##### by Applying GridSearch CV our Accuracy Increases so we train our model by applying the best parameters given by gridsearchCV

In [60]:
## Again Traning of Model

## This Final_pipe pipeline first do All Feature engineering then after all transformation it will do model traning uing XGBoostClassifieralgo
xgbc = xg.XGBClassifier(learning_rate = 0.05 , max_depth = 10, subsample = 1)
final_pipe = Pipeline([
    ("Tranformation" , preprocessor),
    ("Prediction" , xgbc)
])

In [61]:
## Again Traning our model

final_pipe.fit(X_train , y_train)

In [62]:
## this is the predicted Data 

y_pred = final_pipe.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 1])

### Saving Our Model

In [64]:
import pickle

pickle.dump(final_pipe , open('pipe.pkl' , 'wb'))

In [65]:
pipe = pickle.load(open('pipe.pkl' , 'rb'))

In [66]:
pipe

In [77]:
df.sample()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
2303,2,Graduate,Yes,8400000,32600000,10,384,13900000,1600000,21600000,7000000,Rejected


## Testing Our Model

In [79]:
input_test = np.array([2, 'Graduate','Yes' , 8400000 , 32600000 , 10 , 384 , 13900000 , 1600000 , 21600000 , 7000000], dtype=object).reshape(1, 11)

# Convert the NumPy array to a DataFrame with appropriate column names
columns = ['no_of_dependents' , 'education' , 'self_employed' , 'income_annum' , 'loan_amount' , 'loan_term' , 'cibil_score' , 'residential_assets_value' , 'commercial_assets_value' , 'luxury_assets_value' , 'bank_asset_value']
input_test_df = pd.DataFrame(input_test, columns=columns)

# Use the predict method on the DataFrame
pipe.predict(input_test_df)

## 1 means -> Rejected 
## 0 means -> Accepted

array([1])