In [1]:
import pandas as pd
from pickle import dump

In [2]:
# Read the cleaned CSV into train_df and preview its first rows.
cleaned_csv_path = "../artifacts/cleaned data/cleaned.csv"
train_df = pd.read_csv(cleaned_csv_path)
train_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,0,unknown,no


In [3]:
# Split the column labels by dtype: string-like (object) columns are treated as
# categorical features, integer columns as numerical features.
categorical_columns = train_df.select_dtypes(include='object').columns
numerical_columns = train_df.select_dtypes(include='int').columns

In [4]:
# Numeric feature scaling.
#   age, balance, duration -> min-max scaling (age has a bounded range; balance and
#                             duration would otherwise dominate the other features).
#                             Each fitted scaler is pickled so the identical scaling
#                             can be re-applied to other data later.
#   campaign, previous     -> log1p transform (skewed count data).
# NOTE: this cell overwrites train_df columns in place, so it is not idempotent;
# re-running it without reloading train_df applies the transforms a second time.

from sklearn.preprocessing import MinMaxScaler
import numpy as np

min_max_scalers = {
    "age": MinMaxScaler(),
    "balance": MinMaxScaler(),
    "duration": MinMaxScaler(),
}

for feature, min_max_scaler in min_max_scalers.items():
    # Fit on the training column and replace it with its scaled values.
    train_df[feature] = min_max_scaler.fit_transform(train_df[[feature]])
    # Context manager guarantees the pickle file is flushed and closed right away
    # instead of leaking an open handle per scaler.
    with open(f"../artifacts/min-max-scalers/{feature}.pkl", "wb") as scaler_file:
        dump(min_max_scaler, scaler_file)


train_df["campaign"] = np.log1p(train_df["campaign"])
train_df["previous"] = np.log1p(train_df["previous"])


In [5]:
# Preview the first rows of train_df after the numeric scaling / log1p step.
train_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,poutcome,y
0,0.769231,management,married,tertiary,no,0.760385,yes,no,unknown,5,may,0.411672,0.693147,0.0,unknown,no
1,0.5,technician,single,secondary,no,0.361216,yes,no,unknown,5,may,0.23817,0.693147,0.0,unknown,no
2,0.288462,entrepreneur,married,secondary,no,0.356118,yes,yes,unknown,5,may,0.119874,0.693147,0.0,unknown,no
3,0.557692,blue-collar,married,unknown,no,0.640106,yes,no,unknown,5,may,0.14511,0.693147,0.0,unknown,no
4,0.288462,unknown,single,unknown,no,0.355929,no,no,unknown,5,may,0.312303,0.693147,0.0,unknown,no


In [6]:
# For every categorical column, print its number of distinct categories and the
# distinct values themselves, followed by a separator line.
separator = "=" * 100
for column in categorical_columns:
    category_count = train_df[column].nunique()
    distinct_values = train_df[column].unique()
    print(f"feature = {column}, total categories = {category_count}")
    print(f"{distinct_values}")
    print(separator)

feature = job, total categories = 12
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
feature = marital, total categories = 3
['married' 'single' 'divorced']
feature = education, total categories = 4
['tertiary' 'secondary' 'unknown' 'primary']
feature = default, total categories = 2
['no' 'yes']
feature = housing, total categories = 2
['yes' 'no']
feature = loan, total categories = 2
['no' 'yes']
feature = contact, total categories = 3
['unknown' 'cellular' 'telephone']
feature = month, total categories = 12
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']
feature = poutcome, total categories = 3
['unknown' 'failure' 'success']
feature = y, total categories = 2
['no' 'yes']


In [7]:
# Categorical encoding for train_df.
#   default, housing, loan, y -> binary 0/1
#   education, poutcome       -> hand-assigned integer codes ('unknown' -> -1)
#   month, day                -> one cyclic component each (sin for month, cos for day)
#   job                       -> CatBoost target encoding against y
#   marital, contact          -> one-hot with the first category dropped
# Every fitted encoder is pickled under ../artifacts/.
# NOTE: this cell rewrites train_df in place, so it is not idempotent; reload
# train_df before re-running it (a second .map() pass would turn the columns to NaN).

from sklearn.preprocessing import OneHotEncoder
from category_encoders.cat_boost import CatBoostEncoder

one_hot_encoders = {
    "marital": OneHotEncoder(drop='first', sparse_output=False),
    "contact": OneHotEncoder(drop='first', sparse_output=False),
}

# Binary yes/no columns (including the target y) share one mapping.
yes_no_codes = {'no': 0, 'yes': 1}
for binary_feature in ("default", "housing", "loan", "y"):
    train_df[binary_feature] = train_df[binary_feature].map(yes_no_codes)


train_df["education"] = train_df["education"].map({'unknown': -1, 'primary': 1, 'secondary': 2, 'tertiary': 3})
train_df["poutcome"] = train_df["poutcome"].map({"unknown": -1, "failure": 0, "success": 1})

month_order = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
               'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# NOTE(review): only one cyclic component is kept per feature, so distinct values
# collide (e.g. 'feb' and 'jun' both map to sin = 0.5). Adding the complementary
# cos/sin column would disambiguate them but changes the output schema.
train_df['month'] = train_df['month'].map(month_order)
train_df['month'] = np.sin(2 * np.pi * (train_df['month'] - 1) / 12)
train_df['day'] = np.cos(2 * np.pi * (train_df['day'] - 1) / 31)


# NOTE(review): the training rows are transformed without passing y; confirm
# against the category_encoders documentation whether training data should go
# through fit_transform(X, y) / transform(X, y) instead to limit target leakage.
catboost_encoder = CatBoostEncoder()
catboost_encoder.fit(train_df["job"], train_df["y"])
train_df["job"] = catboost_encoder.transform(train_df["job"])
# Context manager closes the pickle file instead of leaking the handle.
with open('../artifacts/cat-boost-encoders/job.pkl', 'wb') as encoder_file:
    dump(catboost_encoder, encoder_file)

for feature, one_hot_encoder in one_hot_encoders.items():
    encoded_data = one_hot_encoder.fit_transform(train_df[[feature]])
    encoded_df = pd.DataFrame(
        encoded_data,
        # Derive the column names from the encoded width rather than assuming
        # exactly two output columns; yields "<feature>_0", "<feature>_1", ...
        columns=[f"{feature}_{position}" for position in range(encoded_data.shape[1])],
        # Share train_df's index so the axis=1 concat aligns row-for-row.
        index=train_df.index,
    )
    # New indicator columns go first; the original categorical column is removed.
    train_df = pd.concat([encoded_df, train_df.drop(columns=[feature])], axis=1)
    with open(f'../artifacts/one-hot-encoders/{feature}.pkl', 'wb') as encoder_file:
        dump(one_hot_encoder, encoder_file)

In [16]:
# Preview the first rows of train_df after the categorical encoding step.
train_df.head()

Unnamed: 0,contact_0,contact_1,marital_0,marital_1,age,job,education,default,balance,housing,loan,day,month,duration,campaign,previous,poutcome,y
0,0.0,1.0,1.0,0.0,0.769231,0.105979,3,0,0.760385,1,0,0.688967,0.866025,0.411672,0.693147,0.0,-1,0
1,0.0,1.0,0.0,1.0,0.5,0.080274,2,0,0.361216,1,0,0.688967,0.866025,0.23817,0.693147,0.0,-1,0
2,0.0,1.0,1.0,0.0,0.288462,0.047316,2,0,0.356118,1,1,0.688967,0.866025,0.119874,0.693147,0.0,-1,0
3,0.0,1.0,1.0,0.0,0.557692,0.040846,-1,0,0.640106,1,0,0.688967,0.866025,0.14511,0.693147,0.0,-1,0
4,0.0,1.0,0.0,1.0,0.288462,0.100917,-1,0,0.355929,0,0,0.688967,0.866025,0.312303,0.693147,0.0,-1,0


In [112]:
# Write the transformed frame to CSV without the index column.
transformed_csv_path = "../artifacts/transformed data/transformed.csv"
train_df.to_csv(path_or_buf=transformed_csv_path, index=False)