## Create the Data_Prep transformer that prepares raw data for training

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder

import os

In [2]:
# Load the raw BankChurners dataset from the `data` folder under the directory
# named by the PWD environment variable. PWD is not defined in every
# environment (e.g. Windows shells), so fall back to the current working
# directory instead of failing with a KeyError.
churners = pd.read_csv(os.path.join(os.environ.get('PWD', os.getcwd()), 'data/BankChurners.csv'))

In [3]:
class Data_Prep(BaseEstimator,TransformerMixin):
    """Turn raw BankChurners rows into a purely numeric feature matrix.

    The output columns are, in order: the numeric columns listed in
    ``num_features``, one engineered ``Income_Credit_ratio`` column, and the
    one-hot encoding of the columns listed in ``cat_features``.  After
    ``fit`` the matching column labels are available as ``feature_names``.
    """

    # Numeric stand-in for every Income_Category label (roughly the bracket
    # midpoint in $K; 'Unknown' maps to 0), used for the income/credit ratio.
    _INCOME_LEVELS = {'$120K +': 140, '$40K - $60K': 50, '$60K - $80K': 70,
                      '$80K - $120K': 100, 'Less than $40K': 20,
                      'Unknown': 0}

    def __init__(self):
        # Columns that get one-hot encoded.
        self.cat_features = ['Gender', 'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Total_Relationship_Count',
       'Months_Inactive_12_mon', 'Contacts_Count_12_mon']
        # Columns that are passed through unchanged.
        self.num_features = ['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

    @staticmethod
    def _make_encoder():
        """Build a dense-output one-hot encoder on old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
            return OneHotEncoder(drop='if_binary', sparse_output=False)
        except TypeError:
            # Older releases only accept the original `sparse` keyword.
            return OneHotEncoder(drop='if_binary', sparse=False)

    def fit(self, X, y=None):
        """Learn the one-hot encoding of the categorical columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``cat_features``.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        Data_Prep
            The fitted transformer itself.
        """
        # Treating categorical variables
        self.OneHotEncoder = self._make_encoder()
        self.OneHotEncoder.fit(X[self.cat_features])

        # Column labels, in the same order in which transform() stacks them.
        self.feature_names = (
            self.num_features
            + ['Income_Credit_ratio']
            + self.OneHotEncoder.get_feature_names_out().tolist()
        )

        return self

    def transform(self, X, y=None):
        """Build the numeric feature matrix for ``X``.

        Only the columns named in ``num_features`` and ``cat_features`` are
        read, so extra columns (identifiers such as CLIENTNUM, the target,
        ...) may be present or absent without affecting the result.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw rows containing the numeric and categorical feature columns.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            2-D array whose columns are described by ``feature_names``.

        Raises
        ------
        ValueError
            If ``Income_Category`` holds a missing value or a label that has
            no entry in the income mapping.
        """
        # Feature engineering
        income = X['Income_Category'].map(self._INCOME_LEVELS)
        unmapped = income.isna()
        if unmapped.any():
            # Fail with an explicit message instead of the opaque integer
            # casting error a NaN would trigger below.
            unknown = sorted(set(X.loc[unmapped, 'Income_Category'].astype(str)))
            raise ValueError(f'Unmapped Income_Category value(s): {unknown}')
        income_credit_ratio = income.astype('int') / X['Credit_Limit']

        # Treating categorical variables
        encoded = self.OneHotEncoder.transform(X[self.cat_features])

        numeric = X[self.num_features]

        # The ratio is 1-D; give it a column axis so it can be stacked.
        return np.concatenate(
            (numeric, income_credit_ratio.to_numpy()[:, np.newaxis], encoded),
            axis=1,
        )

In [4]:
# Fit the transformer on the feature columns (target column removed) and
# display the prepared matrix as a DataFrame labelled with the feature names.
pre_process = Data_Prep()
features = churners.drop(columns='Attrition_Flag')
pre_process.fit(features)
pd.DataFrame(pre_process.transform(features), columns=pre_process.feature_names)

Unnamed: 0,Customer_Age,Months_on_book,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,...,Months_Inactive_12_mon_4,Months_Inactive_12_mon_5,Months_Inactive_12_mon_6,Contacts_Count_12_mon_0,Contacts_Count_12_mon_1,Contacts_Count_12_mon_2,Contacts_Count_12_mon_3,Contacts_Count_12_mon_4,Contacts_Count_12_mon_5,Contacts_Count_12_mon_6
0,45.0,39.0,12691.0,777.0,11914.0,1.335,1144.0,42.0,1.625,0.061,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,49.0,44.0,8256.0,864.0,7392.0,1.541,1291.0,33.0,3.714,0.105,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,51.0,36.0,3418.0,0.0,3418.0,2.594,1887.0,20.0,2.333,0.000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.0,34.0,3313.0,2517.0,796.0,1.405,1171.0,20.0,2.333,0.760,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,40.0,21.0,4716.0,0.0,4716.0,2.175,816.0,28.0,2.500,0.000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,50.0,40.0,4003.0,1851.0,2152.0,0.703,15476.0,117.0,0.857,0.462,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10123,41.0,25.0,4277.0,2186.0,2091.0,0.804,8764.0,69.0,0.683,0.511,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10124,44.0,36.0,5409.0,0.0,5409.0,0.819,10291.0,60.0,0.818,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10125,30.0,36.0,5281.0,0.0,5281.0,0.535,8395.0,62.0,0.722,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [5]:
# Source text of the stand-alone Data_Prep module that a later cell writes to
# disk. It mirrors the Data_Prep class defined in this notebook.
Data_Prep_code = """
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class Data_Prep(BaseEstimator,TransformerMixin):
    '''Turn raw BankChurners rows into a purely numeric feature matrix.

    Output columns, in order: the numeric columns in ``num_features``, one
    engineered ``Income_Credit_ratio`` column, and the one-hot encoding of the
    columns in ``cat_features``.  After ``fit`` the matching column labels are
    available as ``feature_names``.
    '''

    # Numeric stand-in for every Income_Category label (roughly the bracket
    # midpoint in $K; 'Unknown' maps to 0), used for the income/credit ratio.
    _INCOME_LEVELS = {'$120K +': 140, '$40K - $60K': 50, '$60K - $80K': 70,
                      '$80K - $120K': 100, 'Less than $40K': 20,
                      'Unknown': 0}

    def __init__(self):
        self.cat_features = ['Gender', 'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Total_Relationship_Count',
       'Months_Inactive_12_mon', 'Contacts_Count_12_mon']
        self.num_features = ['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

    @staticmethod
    def _make_encoder():
        '''Build a dense-output one-hot encoder on old and new scikit-learn.'''
        try:
            # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
            return OneHotEncoder(drop='if_binary', sparse_output=False)
        except TypeError:
            # Older releases only accept the original `sparse` keyword.
            return OneHotEncoder(drop='if_binary', sparse=False)

    def fit(self, X, y=None):
        '''Learn the one-hot encoding of the categorical columns of X.'''
        self.OneHotEncoder = self._make_encoder()
        self.OneHotEncoder.fit(X[self.cat_features])

        # Column labels, in the same order in which transform() stacks them.
        self.feature_names = (
            self.num_features
            + ['Income_Credit_ratio']
            + self.OneHotEncoder.get_feature_names_out().tolist()
        )

        return self

    def transform(self, X, y=None):
        '''Return the numeric feature matrix for X as a 2-D numpy array.

        Only the columns named in ``num_features`` and ``cat_features`` are
        read, so extra columns (identifiers such as CLIENTNUM, the target,
        ...) may be present or absent.  Raises ValueError when
        Income_Category holds a missing or unmapped value.
        '''
        income = X['Income_Category'].map(self._INCOME_LEVELS)
        unmapped = income.isna()
        if unmapped.any():
            # Fail with an explicit message instead of the opaque integer
            # casting error a NaN would trigger below.
            unknown = sorted(set(X.loc[unmapped, 'Income_Category'].astype(str)))
            raise ValueError(f'Unmapped Income_Category value(s): {unknown}')
        income_credit_ratio = income.astype('int') / X['Credit_Limit']

        encoded = self.OneHotEncoder.transform(X[self.cat_features])

        numeric = X[self.num_features]

        # The ratio is 1-D; give it a column axis so it can be stacked.
        return np.concatenate(
            (numeric, income_credit_ratio.to_numpy()[:, np.newaxis], encoded),
            axis=1,
        )
"""

In [6]:
# Write the transformer source to scripts/Data_Prep.py under the directory
# named by PWD (falling back to the current working directory when PWD is
# not set). The target folder is created if needed, and the context manager
# closes the file even if the write fails.
scripts_dir = os.path.join(os.environ.get('PWD', os.getcwd()), 'scripts')
os.makedirs(scripts_dir, exist_ok=True)
with open(os.path.join(scripts_dir, 'Data_Prep.py'), "w") as f:
    f.write(Data_Prep_code)

In [7]:
import sys

# Make the scripts folder importable and load Data_Prep from the file on disk.
# PWD may be unset (e.g. on Windows), so fall back to the current working
# directory; the membership check avoids stacking duplicate sys.path entries
# when this cell is re-run.
scripts_path = os.path.join(os.environ.get('PWD', os.getcwd()), 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
from Data_Prep import Data_Prep