# 4. Preprocessing - Term Deposit Subscription

## Contents
- 4.1 [Introduction](#4.1.Introduction)
- 4.2 [Test train split ](#4.2.Testtrainsplit)
- 4.3 [Building a pipeline](#4.3.buildapipeline)
- 4.4 [Saving features](#4.4savingvariables)


### Introduction <a id="4.1.Introduction"></a>

In this phase, first I'll perform the train test split. Then,  I'll create a pipeline to remove the extreme outliers from some of the variables, create a binary values for all the categorical variables and scale the numerical variables. Finally I'll save the train and test set into csv files sepatately.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

# Load the cleaned bank dataset from the working_data directory.
# This is the full dataset; it is split into train and test sets further down.
DATA_PATH = './working_data/bank_data_cleaned.csv'
df = pd.read_csv(DATA_PATH)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,job,marital,education,credit_default,balance,housing,loan,contact_type,day,month,duration,campaign_contacts,days_passed,previous_contacts,previous_outcome,subscription
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,11,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,11,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,11,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,11,508,4,-1,0,unknown,no


### Test train split <a id="4.2.Testtrainsplit"></a>

In [3]:
# Separate the target column ('subscription') from the feature columns.
TARGET = 'subscription'
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [4]:

# Hold out 25% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


### Building a pipeline <a id="4.3.buildapipeline"></a>

In [6]:
class RemoveOutliers(BaseEstimator, TransformerMixin):
    """Drop rows whose values lie outside ``mean +/- factor * std``.

    The limits are learned in ``fit`` and reused in ``transform``, so data
    transformed later (e.g. a test split) is filtered with the limits of the
    data the transformer was fitted on instead of its own statistics.

    Features are processed sequentially: the limits of each feature are
    computed on the rows that survived the filters of the preceding features.

    NOTE: this transformer removes rows. The returned frame keeps the original
    index labels, so any separately held target must be realigned by the
    caller (e.g. ``y.loc[X_out.index]``).

    Parameters
    ----------
    factor : float, default=4
        Number of standard deviations around the mean that is kept.
    features : sequence of str or None, default=None
        Columns to filter on. ``None`` selects ``DEFAULT_FEATURES``.
    """

    DEFAULT_FEATURES = ('campaign_contacts', 'duration', 'balance', 'days_passed')

    def __init__(self, factor=4, features=None):
        self.factor = factor
        self.features = features

    def _sequential_limits(self, X):
        """Return ``{feature: (lower, upper)}`` computed by sequential filtering of ``X``."""
        features = self.DEFAULT_FEATURES if self.features is None else self.features
        limits = {}
        remaining = X
        for feature in features:
            upper_lim = remaining[feature].mean() + remaining[feature].std() * self.factor
            lower_lim = remaining[feature].mean() - remaining[feature].std() * self.factor
            limits[feature] = (lower_lim, upper_lim)
            # Later features see only the rows that passed the earlier filters.
            remaining = remaining[(remaining[feature] < upper_lim) & (remaining[feature] > lower_lim)]
        return limits

    def fit(self, X, y=None):
        """Learn the per-feature limits from ``X`` and return ``self``."""
        self.limits_ = self._sequential_limits(X)
        return self

    def transform(self, X):
        """Return the rows of ``X`` that lie strictly inside the limits.

        When ``fit`` has not been called, the limits are derived from ``X``
        itself, which matches the previous stateless behaviour.
        """
        limits = getattr(self, 'limits_', None)
        if limits is None:
            limits = self._sequential_limits(X)
        for feature, (lower_lim, upper_lim) in limits.items():
            X = X[(X[feature] < upper_lim) & (X[feature] > lower_lim)]
        return X
    

    
class OrdinalCategorical(BaseEstimator, TransformerMixin):
    """Encode the ``education`` column as integers.

    The mapping is ``primary -> 1``, ``secondary -> 2``, ``tertiary -> 3`` and
    ``unknown -> -1``. Values that are not in the mapping are left unchanged.
    """

    EDUCATION_LEVELS = {'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': -1}

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``education`` integer-encoded."""
        # Work on a copy so the caller's frame is not modified and so that
        # assigning into a filtered slice does not raise SettingWithCopyWarning.
        X = X.copy()
        levels = self.EDUCATION_LEVELS
        # An element-wise lookup yields an integer column directly when every
        # value is mapped, without relying on ``replace`` downcasting an object
        # column (deprecated in recent pandas versions).
        X['education'] = X['education'].map(lambda level: levels.get(level, level))
        return X
    
    
    
class DummyAndRename(BaseEstimator, TransformerMixin):
    """One-hot encode the object columns and join them to the integer columns.

    ``fit`` records which columns are integer-typed, which are object-typed,
    and the sorted category levels of each object column. ``transform`` then
    encodes against that learned schema, so every dataset passed through a
    fitted instance yields the same columns in the same order, and
    ``drop_first`` always drops the same reference level. A value not seen
    during ``fit`` is encoded as all zeros for that variable.

    The dummy columns of yes/no variables are renamed back to the plain
    variable name (e.g. ``housing_yes`` -> ``housing``).
    """

    COLUMN_MAPPING = {'credit_default_yes': 'credit_default',
                      'housing_yes': 'housing',
                      'loan_yes': 'loan',
                      'subscription_yes': 'subscription'}

    def fit(self, X, y=None):
        """Learn the numeric columns and the category levels from ``X``."""
        self.numeric_columns_ = X.select_dtypes(include='int').columns.tolist()
        X_cat = X.select_dtypes(include='object')
        # ``pd.Categorical`` yields the sorted unique levels, the same order
        # ``get_dummies`` uses when it encodes a plain object column.
        self.categories_ = {name: pd.Categorical(X_cat[name]).categories
                            for name in X_cat.columns}
        return self

    def transform(self, X):
        """Return the integer columns of ``X`` joined with the dummy columns.

        When ``fit`` has not been called, the schema is inferred from ``X``
        itself, which matches the previous stateless behaviour.
        """
        categories = getattr(self, 'categories_', None)
        if categories is None:
            X_num = X.select_dtypes(include='int')
            X_cat = X.select_dtypes(include='object')
        else:
            X_num = X[self.numeric_columns_]
            # Pin every variable to the levels seen during fit so the dummy
            # columns are identical across datasets.
            X_cat = pd.DataFrame(
                {name: pd.Categorical(X[name], categories=levels)
                 for name, levels in categories.items()},
                index=X.index,
            )

        X_with_dummies = (
            pd.get_dummies(X_cat, drop_first=True)
            .rename(columns=self.COLUMN_MAPPING)
            .astype(int)
        )
        return X_num.join(X_with_dummies)
                       
                       
                       
class Scaling(BaseEstimator, TransformerMixin):
    """Standardise the continuous features with a ``StandardScaler``.

    The scaler is fitted in ``fit`` and only applied in ``transform``, so data
    transformed later (e.g. a test split) is scaled with the mean and standard
    deviation of the data the transformer was fitted on.
    """

    FEATURES_TO_SCALE = ['age', 'balance', 'duration', 'campaign_contacts', 'days_passed']

    def fit(self, X, y=None):
        """Fit the scaler on the scaled feature columns of ``X``."""
        self.scaler_ = StandardScaler().fit(X[self.FEATURES_TO_SCALE])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected features standardised.

        When ``fit`` has not been called, a scaler is fitted on ``X`` itself,
        which matches the previous stateless behaviour.
        """
        features_to_scale = self.FEATURES_TO_SCALE
        # Copy so the caller's frame is not modified.
        X = X.copy()
        scaler = getattr(self, 'scaler_', None)
        if scaler is None:
            scaled = StandardScaler().fit_transform(X[features_to_scale])
        else:
            scaled = scaler.transform(X[features_to_scale])
        X[features_to_scale] = scaled
        return X


In [7]:
# One transformer instance per preprocessing stage.
outliers = RemoveOutliers()
ordinal = OrdinalCategorical()
dummy = DummyAndRename()
scale = Scaling()

# Stages run in this order: outlier removal, ordinal encoding of education,
# dummy encoding of the remaining categoricals, then scaling.
pipeline = Pipeline(steps=[
    ('outliers', outliers),
    ('ordinal', ordinal),
    ('dummy', dummy),
    ('scale', scale),
])

In [11]:
# Fit the pipeline on the training split only, then apply it to the test split
# without refitting, so that anything learned during fit comes from the
# training data alone.
preprocessed_X_train = pipeline.fit_transform(X_train)
preprocessed_X_test = pipeline.transform(X_test)

In [9]:
# Display the preprocessed training features to inspect the pipeline output.
preprocessed_X_train

Unnamed: 0,age,education,balance,day,month,duration,campaign_contacts,days_passed,previous_contacts,job_blue-collar,...,marital_married,marital_single,credit_default,housing,loan,contact_type_telephone,contact_type_unknown,previous_outcome_other,previous_outcome_success,previous_outcome_unknown
41626,1.045165,2,-0.537081,25,9,-0.659685,-0.711283,-0.417933,0,0,...,1,0,0,0,0,0,0,0,0,1
31347,-1.216102,2,-0.360785,16,3,-0.763284,4.853443,-0.417933,0,0,...,0,1,0,0,0,0,0,0,0,1
22563,-0.462346,3,-0.517435,22,8,-0.579631,-0.711283,-0.417933,0,0,...,0,1,0,0,0,0,0,0,0,1
37243,1.327823,3,-0.431097,13,5,-0.212324,-0.711283,-0.417933,0,0,...,1,0,0,1,1,0,0,0,0,1
32259,-0.273908,2,-0.130722,16,4,0.545834,-0.711283,3.162924,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,-0.839224,2,-0.591366,27,5,-0.739739,-0.247556,-0.417933,0,0,...,0,1,0,1,0,0,1,0,0,1
44732,-1.687199,3,-0.335453,8,9,-0.165234,-0.711283,0.561536,1,0,...,0,1,0,0,0,0,0,0,0,0
38158,-0.650785,3,0.082797,15,5,-0.028671,-0.711283,-0.417933,0,0,...,0,0,0,1,0,0,0,0,0,1
860,-0.745005,2,-0.512782,7,5,-0.631430,-0.711283,-0.417933,0,0,...,1,0,0,0,0,0,1,0,0,1


In [10]:
# (rows, columns) of the preprocessed training features.
preprocessed_X_train.shape

(32742, 30)

In [13]:
# (rows, columns) of the preprocessed test features.
preprocessed_X_test.shape

(10903, 30)

### Saving variables<a id="4.4savingvariables"></a>

In [19]:

# The outlier step of the pipeline drops rows, so the targets must be restricted
# to the rows that survived. The preprocessed frames keep the original index
# labels, which makes the alignment a simple label lookup. Without this the
# saved y files contain more rows than the X files and, because everything is
# written with index=False, can no longer be matched up.
y_train_aligned = y_train.loc[preprocessed_X_train.index]
y_test_aligned = y_test.loc[preprocessed_X_test.index]

assert len(y_train_aligned) == len(preprocessed_X_train)
assert len(y_test_aligned) == len(preprocessed_X_test)

preprocessed_X_train.to_csv('X_train_preprocessed_subscription.csv', index=False)
preprocessed_X_test.to_csv('X_test_preprocessed_subscription.csv', index=False)

y_train_aligned.to_frame(name='subscription').to_csv('y_train_subscription.csv', index=False)
y_test_aligned.to_frame(name='subscription').to_csv('y_test_subscription.csv', index=False)
