# 4. Preprocessing - Term Deposit Subscription

## Contents
- 4.1 [Introduction](#4.1.Introduction)
- 4.2 [Removing outliers](#4.2.Removingoutliers)
- 4.3 [Converting categorical variables](#4.3.Convertingcategoricalvariables)
- 4.4 [Test train split](#4.4.Testtrainsplit)
- 4.5 [Scaling numerical variables](#4.5.Scalingnumericalvariables)
- 4.6 [Saving variables](#4.6savingvariables)


### Introduction <a id="4.1.Introduction"></a>

In this phase I'll start by removing the extreme outliers from some of the variables, create binary values for the categorical variables, perform the train/test split, and scale the continuous variables. Finally, I'll save the train and test sets into separate CSV files.

In [1]:
#importing relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the cleaned dataset from the working_data folder.
# The bare `data` on the last line renders a preview of the frame as the cell output.
data = pd.read_csv("working_data/bank_data_cleaned.csv")
data

Unnamed: 0,age,job,marital,education,credit_default,balance,housing,loan,contact_type,day,month,duration,campaign_contacts,days_passed,previous_contacts,previous_outcome,subscription
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,11,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,11,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,11,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,11,508,4,-1,0,unknown,no


In [3]:
# Drop 'previous_contacts': it was flagged earlier as having 81.7% missing values
# and as being highly correlated with 'days_passed'.
# `drop` returns a new frame, so `data` itself is left untouched.
df_w_outliers = data.drop(columns='previous_contacts')

In [4]:
# (rows, columns) after dropping 'previous_contacts'
df_w_outliers.shape

(45211, 16)

### Removing outliers <a id="4.2.Removingoutliers"></a>

In [5]:
def remove_outliers(feature, df, factor=4):
    """Keep the rows whose `feature` value lies strictly inside mean ± factor * std.

    Parameters
    ----------
    feature : str
        Name of the numeric column to screen.
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    factor : float, default 4
        Half-width of the accepted band, expressed in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` with `lower_lim < df[feature] < upper_lim`.
        Kept rows retain their original index labels. Rows where the
        feature is NaN fail both comparisons and are therefore dropped.
    """
    values = df[feature]

    # Compute the statistics once and reuse them for both limits
    center = values.mean()
    spread = values.std() * factor

    upper_lim = center + spread
    lower_lim = center - spread

    return df[(values < upper_lim) & (values > lower_lim)]

# Columns screened for extreme values, in the order they are processed.
# NOTE(review): 'days_passed' shows -1 for many rows in the data preview, which looks
# like a sentinel; confirm that mean/std limits are meaningful for that column.
features = ['campaign_contacts', 'duration', 'balance', 'days_passed']

# Start from the frame that still contains the outliers
current_df = df_w_outliers

# Filter sequentially: each pass receives the frame already trimmed by the previous
# passes, so the mean/std limits are recomputed on the reduced data every time.
for feature in features:
    current_df = remove_outliers(feature, current_df)

In [6]:
# Preview of filtering on a single feature; the result is only displayed, not stored.
remove_outliers('campaign_contacts', df_w_outliers)

Unnamed: 0,age,job,marital,education,credit_default,balance,housing,loan,contact_type,day,month,duration,campaign_contacts,days_passed,previous_outcome,subscription
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,11,977,3,-1,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,11,456,2,-1,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,11,1127,5,184,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,11,508,4,-1,unknown,no


In [7]:
# (rows, columns) after all four features have been filtered in sequence
current_df.shape

(43667, 16)

### Converting categorical variables <a id="4.3.Convertingcategoricalvariables"></a>

In [8]:
# Split the frame by dtype: integer columns are used as they are,
# text (object) columns are encoded in the following cells.
df_num = current_df.select_dtypes(include='int')

# Explicit copy: the 'education' column of df_cat is overwritten further down.
df_cat = current_df.select_dtypes(include='object').copy()

# A column of any other dtype (e.g. float) matches neither selection and would
# silently vanish from the final feature set, so fail loudly instead.
assert df_num.shape[1] + df_cat.shape[1] == current_df.shape[1], \
    "some columns are neither int nor object and would be dropped"

In [9]:
# Inspect the distinct education labels before encoding them
df_cat['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [10]:
# Ordinal encoding for education; 'unknown' is given the sentinel value -1.
education_order = {'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': -1}

# Map the labels and cast explicitly instead of relying on `replace` to downcast
# an object column to integers (that implicit downcast is deprecated in pandas).
# Values without a mapping are passed through unchanged, which keeps the cell
# re-runnable; an unexpected text label then makes the int cast fail loudly.
df_cat['education'] = (
    df_cat['education']
    .map(lambda label: education_order.get(label, label))
    .astype('int64')
)
df_cat['education'].unique()

array([ 3,  2, -1,  1], dtype=int64)

In [11]:
# One-hot encode the remaining text columns; drop_first removes one level per
# variable. Numeric columns such as the encoded 'education' pass through unchanged.
# dtype=int pins the dummies to 0/1 integers: recent pandas versions return
# booleans by default, which would be written to the CSV files as True/False.
df_with_dummies = pd.get_dummies(df_cat, drop_first=True, dtype=int)
df_with_dummies

Unnamed: 0,education,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,marital_single,credit_default_yes,housing_yes,loan_yes,contact_type_telephone,contact_type_unknown,previous_outcome_other,previous_outcome_success,previous_outcome_unknown,subscription_yes
0,3,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
1,2,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,1,0,0,1,0
2,2,0,1,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,1,0
3,-1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,-1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,3,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
45207,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
45208,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
45209,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [12]:
# The yes/no variables produce a single '<name>_yes' dummy each;
# give those columns their original names back.
column_mapping = {'credit_default_yes': 'credit_default',
                  'housing_yes': 'housing',
                  'loan_yes': 'loan',
                  'subscription_yes': 'subscription'}

# Reassign rather than mutate in place (inplace=True is discouraged in pandas)
df_with_dummies = df_with_dummies.rename(columns=column_mapping)

In [13]:
# (rows, columns) of the encoded categorical frame
df_with_dummies.shape

(43667, 23)

### Test train split <a id="4.4.Testtrainsplit"></a>

In [14]:
# Recombine the integer columns with the encoded categorical columns.
# `join` aligns the two frames on their row index.
df = df_num.join(df_with_dummies)
df.shape

(43667, 30)

In [15]:
# Target vector: the binary 'subscription' column
y = df['subscription']

# Feature matrix: every other column (`drop` returns a new frame, `df` is unchanged)
X = df.drop(columns='subscription')

In [16]:

# Hold out 25% of the rows as the test set; random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# subscription classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


### Scaling numerical variables <a id="4.5.Scalingnumericalvariables"></a>

In [17]:
# Standardise the continuous features. They are selected by name rather than by
# position, so the selection cannot silently shift if the column order changes.
features_to_scale = ['age', 'balance', 'duration', 'campaign_contacts', 'days_passed']

scaler = StandardScaler()

# Work on explicit copies of the split frames so the assignments below
# never write into a view of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then apply the same transformation to the
# test split, so no test-set statistics leak into the scaler.
# Assigning whole columns by name replaces them, which lets the integer
# columns become float without an incompatible-dtype warning.
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])


In [18]:
# Shapes of the feature and target splits (train, test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((32750, 29), (10917, 29), (32750,), (10917,))

### Saving variables <a id="4.6savingvariables"></a>

In [19]:

# Write the feature matrices to CSV without the row index
X_train.to_csv('X_train_subscription.csv', index=False)
X_test.to_csv('X_test_subscription.csv', index=False)

# Write the targets as single-column frames. `to_frame` names the column
# explicitly; pd.DataFrame(series, columns=[...]) only carries the values over
# when the Series name happens to match the requested column.
y_train.to_frame(name='subscription').to_csv('y_train_subscription.csv', index=False)
y_test.to_frame(name='subscription').to_csv('y_test_subscription.csv', index=False)
