<a href="https://colab.research.google.com/github/KTSNVaishnavi/Machine-Learning/blob/main/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [69]:
#importing necessary packages
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [51]:
#Reading the data
data=pd.read_csv('/content/gdrive/MyDrive/MachineLearning/SVM/UnivBank.csv', na_values=['?','#'])

In [52]:
#Seeing the data
data.head(10)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0.0,0,1.0,0.0,0,0
1,2,45,19,34,90089,3,1.5,1,0.0,0,1.0,0.0,0,0
2,3,39,15,11,94720,1,1.0,1,0.0,0,0.0,0.0,0,0
3,4,35,9,100,94112,1,2.7,2,0.0,0,0.0,,0,0
4,5,35,8,45,91330,4,1.0,2,0.0,0,0.0,0.0,0,1
5,6,37,13,29,92121,4,0.4,2,155.0,0,0.0,0.0,1,0
6,7,53,27,72,91711,2,1.5,2,0.0,0,0.0,0.0,1,0
7,8,50,24,22,93943,1,0.3,3,0.0,0,0.0,0.0,0,1
8,9,35,10,81,90089,3,0.6,2,104.0,0,0.0,0.0,1,0
9,10,34,9,180,93023,1,8.9,3,0.0,1,0.0,0.0,0,0


In [53]:
data.tail()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
4995,4996,29,3,40,92697,1,1.9,3,0.0,0,0.0,0.0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85.0,0,0.0,0.0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0.0,0,0.0,0.0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0.0,0,0.0,0.0,1,0
4999,5000,28,4,83,92612,3,0.8,1,0.0,0,0.0,0.0,1,1


In [54]:
data.shape

(5000, 14)

In [55]:
data.describe(include='all')

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,4998.0,5000.0,4998.0,4999.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.521409,0.096,0.104442,0.060412,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.727873,0.294621,0.305863,0.238273,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [56]:
data.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage              float64
Personal Loan           int64
Securities Account    float64
CD Account            float64
Online                  int64
CreditCard              int64
dtype: object

In [57]:
data.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [58]:
#Train_Test split
X_train, X_test,y_train,y_test=train_test_split(data.loc[:,data.columns!='Personal Loan'], data['Personal Loan'], test_size=0.3, random_state=100)

In [59]:
#Data preprocessing
#Step1. Dropping the unnecessary columns
dropcols=['ID','ZIP Code']
X_train=X_train.drop(dropcols,axis=1)
X_test=X_test.drop(dropcols,axis=1)


In [60]:
#Step2. Separating the numerical and categorical columns
cat=['Family','Education','Securities Account','CD Account','Online', 'CreditCard' ]
num=list(X_train.columns.difference(cat))

In [61]:
X_train[cat]=X_train[cat].astype('category')
X_test[cat]=X_test[cat].astype('category')

In [62]:
si_num=SimpleImputer(strategy='mean')
si_cat=SimpleImputer(strategy='most_frequent')
std=StandardScaler()
ohe=OneHotEncoder(handle_unknown='ignore')

In [63]:
num_trans=Pipeline(steps=[('num_imputer',si_num),('num_stand',std)])
cat_trans=Pipeline(steps=[('cat_imputer',si_cat),('cat_encode',ohe)])


In [64]:
prepr=ColumnTransformer(transformers=[('num_trans',num_trans,num),('cat_trans',cat_trans,cat)])

In [65]:
classifier=DecisionTreeClassifier(class_weight='balanced')
clf=Pipeline(steps=[('Preprocess',prepr),('Classifier',classifier)])

In [66]:
clf.fit(X_train,y_train)

Pipeline(steps=[('Preprocess',
                 ColumnTransformer(transformers=[('num_trans',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer()),
                                                                  ('num_stand',
                                                                   StandardScaler())]),
                                                  ['Age', 'CCAvg', 'Experience',
                                                   'Income', 'Mortgage']),
                                                 ('cat_trans',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('cat_encode',
                                                                   OneHotEncoder

In [68]:
train_pred=clf.predict(X_train)
test_pred=clf.predict(X_test)

In [70]:
print(confusion_matrix(train_pred,y_train))
print(confusion_matrix(test_pred,y_test))

[[3178    0]
 [   0  322]]
[[1333   17]
 [   9  141]]


In [71]:
print(classification_report(train_pred,y_train))
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3178
           1       1.00      1.00      1.00       322

    accuracy                           1.00      3500
   macro avg       1.00      1.00      1.00      3500
weighted avg       1.00      1.00      1.00      3500

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1350
           1       0.89      0.94      0.92       150

    accuracy                           0.98      1500
   macro avg       0.94      0.96      0.95      1500
weighted avg       0.98      0.98      0.98      1500

