<h1 style="color: green;">Summary</h1>
<p>
In this section the base transformers are fitted and saved for deployment.<br>
The following tasks are performed:
<ul>
<li>Capping outliers for Expenditure_per_wk (Total_Price is not capped because it did not make it into the final model)</li>
<li>Binning high cardinality numeric features</li>
<li>Categorical encoding rare labels</li>
<li>Categorical encoding monotonic (WOE)</li>
<li>Saving the encoders for deployment pipeline transformations</li>

</ul>
All of this is done on the raw data extracted in <b>Data Extraction</b>,<br>
i.e. the full sample dataset before the train/test split.<br>
</p>


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# for feature engineering
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.discretisation import ArbitraryDiscretiser
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import ArbitraryOutlierCapper

# for Weight of evidence
from feature_engine.encoding import WoEEncoder
from category_encoders.woe import WOEEncoder


# joblib for storing models
import joblib

from sklearn.model_selection import train_test_split

import warnings
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Silence all warnings for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [2]:
# Columns of the raw extract used as model inputs, in the order they are
# selected from the raw frame.
features = [
    'Total_Nbr_of_Items',
    'Venue',
    'Date_diff',
    'Nbr_items_per_wk',
    'Expenditure_per_wk',
    'Total_Exp_wk_perc',
    'Drinks',
    'Vegetables',
    'Cosmetics_and_selfcare',
    'House_and_kitchen',
    'Bread_wk',
    'Cooked_meats_wk',
    'Raw_meats_wk',
    'Snacks_wk',
    'Snacks_exp_receipt',
    'Snacks_exp_wk',
    'Drinks_wk',
    'Drinks_exp_wk',
    'Vegetables_exp_wk',
    'Fruit_wk',
    'Cooking_base_wk',
    'Dairy_produce_wk',
    'Seasoning_wk',
    'Breakfast_wk',
    'Education_wk',
    'Cosmetics_and_selfcare_wk',
    'Cosmetics_and_selfcare_wk_exp_perc',
    'House_and_kitchen_wk',
]

In [3]:
# Load the raw extract; the path is relative to the notebook's working directory.
raw0 = pd.read_csv(filepath_or_buffer="../Data/homeshopping.csv")


# Capping Expenditure_per_wk

In [4]:
# Working feature frame: only the model input columns, in `features` order.
raw_cap = raw0.loc[:, features]

# Target column, kept aside for the supervised (WOE) encoder fitted later.
Trips_response_lt_5 = raw0["Trips_response_lt_5"]

In [5]:
# Cap Expenditure_per_wk at an upper bound of 200; no lower bound is applied.
# Total_Price is not capped since it did not make it into the final model.
expenditure_upper_caps = {'Expenditure_per_wk': 200}

capper = ArbitraryOutlierCapper(
    min_capping_dict=None,
    max_capping_dict=expenditure_upper_caps,
)
capper.fit(raw_cap)

# Persist the fitted capper for the deployment pipeline, then apply it to
# the working frame.
joblib.dump(capper, "../Data transformers/capper_expenditure12062023")
raw_cap = capper.transform(raw_cap)

# Binning Expenditure_per_wk

In [6]:
# Expenditure_per_wk gets its own equal-width discretiser (6 bins), fitted
# on the capped data and saved separately from the other binning steps.
list1 = ['Expenditure_per_wk']

disc1 = EqualWidthDiscretiser(variables=list1, bins=6)
disc1.fit(raw_cap)

# Persist the fitted discretiser, then replace the column with its bins.
joblib.dump(disc1, "../Data transformers/Equal_width_bin_expenditure12062023")
raw_cap = disc1.transform(raw_cap)

# Bin high cardinality features

In [7]:
# Numeric features with many distinct values; each is binned into 6
# equal-width bins.
high_card_num = [
    'Total_Nbr_of_Items',
    'Nbr_items_per_wk',
    'Expenditure_per_wk',
    'Total_Exp_wk_perc',
    'Drinks',
    'Vegetables',
    'Cosmetics_and_selfcare',
    'Bread_wk',
    'Cooked_meats_wk',
    'Raw_meats_wk',
    'Snacks_wk',
    'Snacks_exp_receipt',
    'Snacks_exp_wk',
    'Drinks_wk',
    'Drinks_exp_wk',
    'Vegetables_exp_wk',
    'Fruit_wk',
    'Cooking_base_wk',
    'Dairy_produce_wk',
    'Breakfast_wk',
    'Education_wk',
    'Cosmetics_and_selfcare_wk',
    'Cosmetics_and_selfcare_wk_exp_perc',
    'House_and_kitchen_wk',
]

# Note: this list also contains the list1 feature (Expenditure_per_wk),
# so that column passes through this discretiser as well.
disc = EqualWidthDiscretiser(variables=high_card_num, bins=6)

# The discretiser is fitted on the whole working frame; only the columns
# named in `variables` are binned.
disc.fit(raw_cap)

# Persist the fitted discretiser (file name kept exactly as written),
# then apply it to the working frame.
joblib.dump(disc, "../Data transformers/Hig_cardinality_12062023")
raw_cap = disc.transform(raw_cap)

# Rare value encode for high cardinality features

In [8]:
# Cast the binned high-cardinality columns to object dtype in preparation
# for the rare value fit and transform; all other columns keep their dtype.
raw_cap = raw_cap.astype({column: object for column in high_card_num})

In [9]:
# Rare value encoder for the binned high-cardinality numeric features.
# high_card_num is the variable list without Date_diff.
rare_encoder_high_card = RareLabelEncoder(
    variables=high_card_num,  # variables to re-group
    tol=0.047,                # minimal percentage to be considered non-rare
    n_categories=3,           # minimal number of categories a variable needs before rare ones are re-grouped
)
rare_encoder_high_card.fit(raw_cap)

# Persist the fitted encoder, then apply it to the working frame.
joblib.dump(rare_encoder_high_card, "../Data transformers/Rare_enc_High_cardinality_12062023")
raw_cap = rare_encoder_high_card.transform(raw_cap)

# Binning Date_diff

In [10]:
# Bin Date_diff with fixed edges at 0, 1, 2, 3 and 4 and open-ended outer bins.
Arbitrary_disc = ArbitraryDiscretiser(
    binning_dict={
        # The outer edges must be float infinities. The string literals
        # '-inf' / 'inf' turn the edge list into a mixed str/int sequence,
        # which is not a valid numeric, monotonically increasing set of bins.
        "Date_diff": [-np.inf, 0, 1, 2, 3, 4, np.inf],
    },
)

Arbitrary_disc.fit(raw_cap)

# Persist the fitted discretiser for the deployment pipeline, then apply it.
joblib.dump(Arbitrary_disc, "../Data transformers/Date_diff_transformer_12062023")

raw_cap = Arbitrary_disc.transform(raw_cap)

# convert Date_diff to object so it is handled as a categorical label downstream
raw_cap['Date_diff'] = raw_cap['Date_diff'].astype(object)

# Rare value encoding for low cardinality features

In [11]:
# Numeric features with few distinct values; these are not binned.
low_card_num = ['House_and_kitchen', 'Seasoning_wk']

# Cast them to object dtype in preparation for rare label encoding.
raw_cap = raw_cap.astype({column: object for column in low_card_num})

In [12]:
# Rare value encoder for the low-cardinality numeric features.
rare_encoder_low_card = RareLabelEncoder(
    variables=low_card_num,  # variables to re-group
    tol=0.047,               # minimal percentage to be considered non-rare
    n_categories=3,          # minimal number of categories a variable needs before rare ones are re-grouped
)
rare_encoder_low_card.fit(raw_cap)

# Persist the fitted encoder, then apply it to the working frame.
joblib.dump(rare_encoder_low_card, "../Data transformers/low_cardinality_rare_transformer_12062023")
raw_cap = rare_encoder_low_card.transform(raw_cap)

# Categorical feature encoding

In [13]:
# Categorical features of the working frame.
Cat_feat = ['Venue']

# Rare value encoder for the categorical features. Note the tolerance here
# is 0.05, not the 0.047 used by the numeric rare encoders.
rare_encoder_cat = RareLabelEncoder(
    variables=Cat_feat,  # variables to re-group
    tol=0.05,            # minimal percentage to be considered non-rare
    n_categories=3,      # minimal number of categories a variable needs before rare ones are re-grouped
)
rare_encoder_cat.fit(raw_cap)

# Persist the fitted encoder, then apply it to the working frame.
joblib.dump(rare_encoder_cat, "../Data transformers/Categorical_rare_transformer_12062023")
raw_cap = rare_encoder_cat.transform(raw_cap)

# WOE encoding
This is applied to all of the features together.

In [14]:
# Weight-of-evidence encoding with the category_encoders implementation
# (WOEEncoder), fitted on the whole working frame against the target.
# No column list is passed, so the encoder's default column selection applies.
woe_enc = WOEEncoder()
woe_enc.fit(X=raw_cap, y=Trips_response_lt_5)

# Persist the fitted encoder, then replace the working frame with its
# encoded version.
joblib.dump(woe_enc, "../Data transformers/WOE_transformer_12062023")
raw_cap = woe_enc.transform(raw_cap)


In [15]:
# Preview the first five rows of the fully encoded frame.
raw_cap.head(n=5)

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
0,-0.30415,-0.228713,-0.595665,1.431712,1.24397,0.293031,0.026222,-0.200055,0.003698,0.008584,0.185634,0.116289,0.387766,0.448801,-0.023692,0.006782,0.659531,0.049556,-0.113909,0.15382,0.100355,0.129093,0.081388,-0.199046,0.107108,0.344791,-0.0479,0.092007
1,-0.30415,-0.228713,-0.595665,1.431712,1.24397,0.293031,0.026222,-0.200055,0.003698,0.008584,0.185634,0.116289,0.387766,0.448801,-0.023692,0.006782,0.659531,0.049556,-0.113909,0.15382,0.100355,0.129093,0.081388,-0.199046,0.107108,0.344791,-0.0479,0.092007
2,-0.30415,-0.228713,-0.595665,1.431712,1.24397,1.917409,0.026222,-0.200055,0.003698,0.008584,0.185634,0.116289,0.387766,0.448801,-0.023692,0.006782,0.659531,0.049556,-0.113909,0.15382,0.100355,0.129093,0.081388,-0.199046,0.107108,0.344791,-0.0479,0.092007
3,-0.30415,-0.228713,1.618723,1.431712,1.24397,1.229109,0.026222,-0.200055,0.003698,0.008584,0.185634,0.116289,0.387766,0.448801,-0.023692,0.006782,0.659531,0.049556,-0.113909,0.15382,0.100355,0.129093,0.081388,-0.199046,0.107108,0.344791,-0.0479,0.092007
4,-0.30415,-0.228713,-0.619559,1.431712,1.24397,0.293031,0.026222,-0.200055,0.003698,0.008584,0.185634,0.116289,0.387766,0.448801,-0.023692,0.006782,0.659531,0.049556,-0.113909,0.15382,0.100355,0.129093,0.081388,-0.199046,0.107108,0.344791,-0.0479,0.092007
