# 4. Preprocessing - Term Deposit Subscription

## Contents
- 4.1 [Introduction](#4.1.Introduction)
- 4.2 [Test train split](#4.2.Testtrainsplit)
- 4.3 [Building a pipeline](#4.3.buildapipeline)
- 4.4 [Saving features](#4.4savingvariables)


### Introduction <a id="4.1.Introduction"></a>

In this phase, I'll first build a pipeline that removes the extreme outliers from some of the variables, encodes the categorical variables as numeric (ordinal or binary) columns, and scales the numerical variables. Then I'll address the class imbalance by oversampling and perform the train/test split. Finally, I'll save the train and test sets into separate CSV files.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Read the cleaned bank-marketing data from the working directory and
# display it as the cell output.
df = pd.read_csv(filepath_or_buffer='./working_data/bank_data_cleaned.csv')
df

Unnamed: 0,age,job,marital,education,credit_default,balance,housing,loan,contact_type,day,month,duration,campaign_contacts,days_passed,previous_contacts,previous_outcome,subscription
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,11,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,11,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,11,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,11,508,4,-1,0,unknown,no


### Building a pipeline <a id="4.3.buildapipeline"></a>

In [2]:
class RemoveOutliers(BaseEstimator, TransformerMixin):
    """Drop rows whose values lie too far from the mean of selected columns.

    Parameters
    ----------
    factor : float, default=4
        Number of standard deviations around the column mean within which a
        value must lie (strictly) for its row to be kept.
    features : sequence of str or None, default=None
        Columns to screen. ``None`` selects ``DEFAULT_FEATURES``.

    Notes
    -----
    Columns are screened one after another, and each column's mean and
    standard deviation are computed on the rows that survived the columns
    screened before it. ``transform`` therefore returns fewer rows than it
    receives whenever outliers are present.
    """

    # Columns screened when no explicit ``features`` are given.
    DEFAULT_FEATURES = ('campaign_contacts', 'duration', 'balance', 'days_passed')

    def __init__(self, factor=4, features=None):
        # Parameters are stored untouched so BaseEstimator.get_params and
        # clone() keep working.
        self.factor = factor
        self.features = features

    def fit(self, X, y=None):
        """Do nothing and return ``self``; limits are derived in ``transform``."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` that pass the outlier screen.

        The input frame is not modified; boolean indexing yields a new frame.
        """
        features = self.DEFAULT_FEATURES if self.features is None else self.features
        for feature in features:
            center = X[feature].mean()
            spread = X[feature].std() * self.factor
            # Strict inequalities: values exactly on a limit are dropped.
            within_limits = (X[feature] < center + spread) & (X[feature] > center - spread)
            X = X[within_limits]
        return X
    

    
class OrdinalCategorical(BaseEstimator, TransformerMixin):
    """Encode the ``education`` column as integer codes.

    ``primary``, ``secondary`` and ``tertiary`` map to 1, 2 and 3, and
    ``unknown`` maps to -1. Values outside the mapping are left unchanged.
    """

    # Mapping from education label to its integer code.
    EDUCATION_CODES = {'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': -1}

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the mapping is fixed."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``education`` replaced by its codes."""
        # Work on a copy: the input may be a filtered slice of another frame,
        # and writing into it would mutate the caller's data (and trigger
        # pandas' SettingWithCopyWarning).
        X = X.copy()
        encoded = X['education'].replace(self.EDUCATION_CODES)
        # Explicitly infer the dtype so the column becomes integer even when
        # `replace` leaves it as object (the implicit downcast is deprecated
        # in recent pandas releases).
        X['education'] = encoded.infer_objects()
        return X
    
    
    
class DummyAndRename(BaseEstimator, TransformerMixin):
    """One-hot encode the object columns and tidy the binary column names.

    Numeric columns are passed through unchanged. Object columns are expanded
    with ``pd.get_dummies(drop_first=True)``, the resulting indicators are
    cast to ``int``, and the ``*_yes`` indicators of the yes/no variables are
    renamed back to the bare variable name.
    """

    # Indicator columns produced for yes/no variables -> final column name.
    COLUMN_MAPPING = {
        'credit_default_yes': 'credit_default',
        'housing_yes': 'housing',
        'loan_yes': 'loan',
        'subscription_yes': 'subscription',
    }

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return the numeric columns of ``X`` joined with the dummy columns."""
        # 'number' keeps every numeric dtype; the narrower 'int' selector
        # would silently drop float columns from the output.
        numeric_part = X.select_dtypes(include='number')
        categorical_part = X.select_dtypes(include='object')

        dummies = pd.get_dummies(categorical_part, drop_first=True)
        dummies = dummies.rename(columns=self.COLUMN_MAPPING).astype(int)

        # Both parts share the index of X, so the join is row-aligned.
        return numeric_part.join(dummies)
                       
                       
                       
class Scaling(BaseEstimator, TransformerMixin):
    """Standardise the continuous columns with ``StandardScaler``.

    The scaler is learnt in ``fit`` and reused by ``transform``, so the
    statistics of one data set (e.g. training data) can be applied to
    another. ``fit_transform`` on a single frame gives the same result as
    fitting and transforming in one step.
    """

    # Columns that are standardised; all other columns pass through as-is.
    FEATURES_TO_SCALE = ['age', 'balance', 'duration', 'campaign_contacts', 'days_passed']

    def fit(self, X, y=None):
        """Learn the scaling statistics from ``X`` and return ``self``."""
        self.scaler_ = StandardScaler().fit(X[self.FEATURES_TO_SCALE])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns standardised."""
        scaler = getattr(self, 'scaler_', None)
        if scaler is None:
            # Backward compatibility: an instance that was never fitted
            # scales with the statistics of the frame it is given.
            scaler = StandardScaler().fit(X[self.FEATURES_TO_SCALE])

        # Copy so the caller's frame is not modified in place.
        X = X.copy()
        X[self.FEATURES_TO_SCALE] = scaler.transform(X[self.FEATURES_TO_SCALE])
        return X


In [3]:
# Instantiate one object per preprocessing step.
outliers, ordinal, dummy, scale = (
    RemoveOutliers(),
    OrdinalCategorical(),
    DummyAndRename(),
    Scaling(),
)

# Chain the steps in execution order: outlier removal, ordinal encoding of
# education, dummy encoding of the remaining categoricals, then scaling.
pipeline = Pipeline(steps=[
    ('outliers', outliers),
    ('ordinal', ordinal),
    ('dummy', dummy),
    ('scale', scale),
])

In [4]:
# Run every pipeline step on the loaded frame and keep the transformed result.
preprocessed_data = pipeline.fit_transform(df)

Unnamed: 0,age,education,balance,day,month,duration,campaign_contacts,days_passed,previous_contacts,job_blue-collar,...,marital_single,credit_default,housing,loan,contact_type_telephone,contact_type_unknown,previous_outcome_other,previous_outcome_success,previous_outcome_unknown,subscription
0,1.611385,3,0.506548,5,5,0.068743,-0.711987,-0.418362,0,0,...,0,0,1,0,0,1,0,0,1,0
1,0.291370,2,-0.582956,5,5,-0.445279,-0.711987,-0.418362,0,0,...,1,0,1,0,0,1,0,0,1,0
2,-0.745785,2,-0.596871,5,5,-0.795748,-0.711987,-0.418362,0,0,...,0,0,1,1,0,1,0,0,1,0
3,0.574230,-1,0.178254,5,5,-0.720981,-0.711987,-0.418362,0,1,...,0,0,1,0,0,1,0,0,1,0
4,-0.745785,-1,-0.597386,5,5,-0.225652,-0.711987,-0.418362,0,0,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.951377,3,-0.172717,17,11,3.414555,0.215377,-0.418362,0,0,...,0,0,0,0,0,0,0,0,1,1
45207,2.837113,1,0.293183,17,11,0.979963,-0.248305,-0.418362,0,0,...,0,0,0,0,0,0,0,0,1,1
45208,2.931400,2,2.347471,17,11,4.115494,1.142740,1.529158,3,0,...,0,0,0,0,0,0,0,1,0,1
45209,1.517098,2,-0.253631,17,11,1.222955,0.679058,-0.418362,0,1,...,0,0,0,0,1,0,0,0,1,0


In [6]:
# Display the (rows, columns) shape of the preprocessed frame.
preprocessed_data.shape

(43667, 31)

In [7]:
# Addressing the imbalance: separate the target from the predictors first.
y_imb = preprocessed_data['subscription']
X_imb = preprocessed_data.drop(columns='subscription')

In [8]:
# Oversample the minority class by randomly repeating its rows.
# The seed is fixed so the resampled data is reproducible, in line with the
# seeded train/test split used in this notebook.
# NOTE(review): the resampling runs before the train/test split, so copies of
# the same minority row can end up in both sets; consider resampling only the
# training portion after splitting.
resampler = RandomOverSampler(random_state=42)
X, y = resampler.fit_resample(X_imb, y_imb)

### Test train split <a id="4.2.Testtrainsplit"></a>

In [9]:
# Hold out a quarter of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Checking the shapes of the split data, starting with the training features.
X_train.shape

(58165, 30)

In [12]:
# Shape of the test features.
X_test.shape

(19389, 30)

In [13]:
# Shape of the training target.
y_train.shape

(58165,)

In [14]:
# Shape of the test target.
y_test.shape

(19389,)

### Saving variables <a id="4.4savingvariables"></a>

In [15]:

# Write the feature matrices to CSV without the index column.
for feature_frame, csv_path in (
    (X_train, 'X_train_subscription.csv'),
    (X_test, 'X_test_subscription.csv'),
):
    feature_frame.to_csv(csv_path, index=False)

# Wrap each target in a one-column frame named 'subscription' before writing.
for target_values, csv_path in (
    (y_train, 'y_train_subscription.csv'),
    (y_test, 'y_test_subscription.csv'),
):
    pd.DataFrame(target_values, columns=['subscription']).to_csv(csv_path, index=False)
