<h1 style="color: green;">Summary</h1>
<p>
In this section the base transformers are fitted and saved for deployment.<br>
The following tasks are performed:
<ul>
<li>Capping outliers for Expenditure_per_wk (Total_Price is not capped, since it did not make it into the final model)</li>
<li>Binning high cardinality numeric features</li>
<li>Categorical encoding rare labels</li>
<li>Categorical encoding monotonic (WOE)</li>
<li>Saving the encoders for deployment pipeline transformations</li>

</ul>
All of this is done with the raw data extracted in <b>Data Extraction</b>,<br>
i.e. the full sample dataset before the train/test split.<br>
</p>


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# for feature engineering
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.discretisation import ArbitraryDiscretiser
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import ArbitraryOutlierCapper

# for Weight of evidence
from feature_engine.encoding import WoEEncoder
from category_encoders.woe import WOEEncoder


# joblib for storing models
import joblib

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Columns kept from the raw extract, in the order they are selected.
# NOTE(review): 'Trips_response_lt_5' (last entry) is dropped again before the
# transformers are fitted; presumably it is the response column - confirm.
features = [
    "Total_Nbr_of_Items",
    "Venue",
    "Date_diff",
    "Nbr_items_per_wk",
    "Expenditure_per_wk",
    "Total_Exp_wk_perc",
    "Drinks",
    "Vegetables",
    "Cosmetics_and_selfcare",
    "House_and_kitchen",
    "Bread_wk",
    "Cooked_meats_wk",
    "Raw_meats_wk",
    "Snacks_wk",
    "Snacks_exp_receipt",
    "Snacks_exp_wk",
    "Drinks_wk",
    "Drinks_exp_wk",
    "Vegetables_exp_wk",
    "Fruit_wk",
    "Cooking_base_wk",
    "Dairy_produce_wk",
    "Seasoning_wk",
    "Breakfast_wk",
    "Education_wk",
    "Cosmetics_and_selfcare_wk",
    "Cosmetics_and_selfcare_wk_exp_perc",
    "House_and_kitchen_wk",
    "Trips_response_lt_5",
]

In [15]:
# Load the raw extract into a DataFrame (path is relative to the notebook's working directory).
raw0 = pd.read_csv("../Data/homeshopping.csv")


In [16]:
# Keep only the columns named in `features`, in that order.
raw0 = raw0.loc[:, features]

# Capping Expenditure_per_wk

In [17]:
# The transformers below are fitted on a copy without 'Trips_response_lt_5'.
raw_cap = raw0.drop(columns=['Trips_response_lt_5'])
raw_cap.head()

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,...,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
0,1,eBay,0.0,3,17.16,0.17366,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0.0,0
1,1,eBay,0.0,3,17.16,0.191725,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0.0,0
2,1,eBay,0.0,3,17.16,0.634615,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0.0,0
3,1,Abbey Sports,5.0,4,28.69,0.348554,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0.0,0
4,1,eBay,1.0,4,28.69,0.205995,0,0,0,0,...,0.0,0,0,0,0,0,0,0,0.0,0


In [28]:
# Cap Expenditure_per_wk at an upper limit of 200; no lower cap is set.
# Total_Price is not capped because it did not make it into the final model.
capper = ArbitraryOutlierCapper(
    max_capping_dict={"Expenditure_per_wk": 200},
    min_capping_dict=None,
)
capper.fit(raw_cap)

# Save the fitted capper for the deployment pipeline, then apply it to the working frame.
joblib.dump(capper, "../Data transformers/capper_expenditure12062023")
raw_cap = capper.transform(raw_cap)

['../Data transformers/capper_expenditure12062023']

# Binning Expenditure_per_wk

In [29]:
# Equal-width discretisation (6 bins) of Expenditure_per_wk on its own.
list1 = ["Expenditure_per_wk"]

disc1 = EqualWidthDiscretiser(variables=list1, bins=6)
disc1.fit(raw_cap)

# Save the fitted discretiser for the deployment pipeline.
joblib.dump(disc1, "../Data transformers/Equal_width_bin_expenditure12062023")

# From here on raw_cap holds the discretised Expenditure_per_wk.
raw_cap = disc1.transform(raw_cap)

['../Data transformers/Equal_width_bin_expenditure12062023']

# Bin high cardinality features

In [38]:
# Numeric columns treated as high cardinality and binned together.
high_card_num = [
    "Total_Nbr_of_Items",
    "Nbr_items_per_wk",
    "Expenditure_per_wk",
    "Total_Exp_wk_perc",
    "Drinks",
    "Vegetables",
    "Cosmetics_and_selfcare",
    "Bread_wk",
    "Cooked_meats_wk",
    "Raw_meats_wk",
    "Snacks_wk",
    "Snacks_exp_receipt",
    "Snacks_exp_wk",
    "Drinks_wk",
    "Drinks_exp_wk",
    "Vegetables_exp_wk",
    "Fruit_wk",
    "Cooking_base_wk",
    "Dairy_produce_wk",
    "Breakfast_wk",
    "Education_wk",
    "Cosmetics_and_selfcare_wk",
    "Cosmetics_and_selfcare_wk_exp_perc",
    "House_and_kitchen_wk",
]

# Equal-width discretisation (6 bins) of every column listed above.
# Note: the list includes the list1 feature (Expenditure_per_wk), which raw_cap
# already holds in the form produced by disc1.
disc = EqualWidthDiscretiser(variables=high_card_num, bins=6)
disc.fit(raw_cap[high_card_num])

# Save the fitted discretiser for the deployment pipeline.
# NOTE(review): the file name is spelled "Hig_cardinality"; left unchanged because
# whatever loads this artifact presumably uses the same name - confirm before renaming.
joblib.dump(disc, "../Data transformers/Hig_cardinality_12062023")

# Separate frame that contains only the discretised high cardinality columns.
raw_cap_high_card = disc.transform(raw_cap[high_card_num])

# Rare value encode for high cardinality features

In [39]:
# RareLabelEncoder is applied next, so cast each discretised column to object dtype first.
for column in high_card_num:
    raw_cap_high_card[column] = raw_cap_high_card[column].astype(object)

In [40]:
# Rare label encoder for the discretised high cardinality numeric features.
rare_encoder_high_card = RareLabelEncoder(
    variables=high_card_num,  # variables to re-group
    tol=0.047,  # minimal frequency for a label to be considered non-rare
    n_categories=3,  # minimal number of categories a variable needs before rare labels are re-grouped
)
rare_encoder_high_card.fit(raw_cap_high_card)

# Save the fitted encoder for the deployment pipeline.
joblib.dump(rare_encoder_high_card, "../Data transformers/Rare_enc_High_cardinality_12062023")

raw_cap_high_card = rare_encoder_high_card.transform(raw_cap_high_card)

# Binning Date_diff

In [31]:
# Discretise Date_diff on fixed, hand-picked bin edges: -inf, 0, 1, 2, 3, 4, +inf.
Arbitrary_disc = ArbitraryDiscretiser(
    binning_dict={
        # Use numeric infinities, not the strings '-inf' / 'inf'. A list that mixes
        # strings with numbers is not a valid numeric bin specification: depending on
        # the pandas/numpy version it either raises or is compared as text, which
        # would place e.g. 10.0 between 1 and 2.
        "Date_diff": [-np.inf, 0, 1, 2, 3, 4, np.inf],
    },
)

Arbitrary_disc.fit(raw_cap)

# Save the fitted discretiser for the deployment pipeline.
joblib.dump(Arbitrary_disc, "../Data transformers/Date_diff_transformer_12062023")

raw_cap = Arbitrary_disc.transform(raw_cap)

# convert Date_diff to object
raw_cap['Date_diff'] = pd.Series(raw_cap['Date_diff'], dtype=object)

['../Data transformers/Date_diff_transformer_12062023']

# Rare label encoding for low cardinality features

In [33]:
# Numeric columns with few distinct values; they skip discretisation.
low_card_num = ["House_and_kitchen", "Seasoning_wk"]

# Cast to object dtype in preparation for rare label encoding.
for column in low_card_num:
    raw_cap[column] = raw_cap[column].astype(object)

In [34]:
# Rare label encoder for the low cardinality numeric features.
# It is only fitted and saved here; raw_cap is not transformed in this cell.
rare_encoder_low_card = RareLabelEncoder(
    variables=low_card_num,  # variables to re-group
    tol=0.047,  # minimal frequency for a label to be considered non-rare
    n_categories=3,  # minimal number of categories a variable needs before rare labels are re-grouped
)
rare_encoder_low_card.fit(raw_cap)

joblib.dump(rare_encoder_low_card, "../Data transformers/low_cardinality_rare_transformer_12062023")

['../Data transformers/low_cardinality_rare_transformer_12062023']

# Categorical feature encoding

In [36]:
# Categorical feature(s) whose rare labels are re-grouped.
Cat_feat = ["Venue"]

# Rare label encoder for the categorical features.
# It is only fitted and saved here; raw_cap is not transformed in this cell.
rare_encoder_cat = RareLabelEncoder(
    variables=Cat_feat,  # variables to re-group
    tol=0.05,  # minimal frequency for a label to be considered non-rare
    n_categories=3,  # minimal number of categories a variable needs before rare labels are re-grouped
)
rare_encoder_cat.fit(raw_cap)

joblib.dump(rare_encoder_cat, "../Data transformers/Categorical_rare_transformer_12062023")

['../Data transformers/Categorical_rare_transformer_12062023']

# WOE encoding 