## 1 Preliminaries

## 1.1 Imports and Configurations

In [1]:
import warnings
import pandas as pd

from pathlib import Path
from pandas.api.types import CategoricalDtype

warnings.filterwarnings('ignore')

### Constants

In [2]:
# 1 Preliminaries

MARKETING_PATH =  Path('../data/raw/marketing_campaign.xls')

FEATURES_TO_DEL = ['ID', 'Z_CostContact', 'Z_Revenue']
FEATURES_NUM = ['Year_Birth', 'Income', 'Recency', 'MntWines', 'MntFruits',  'MntMeatProducts', 
                'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 
                'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
FEATURES_CAT = ['Marital_Status', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 
                'AcceptedCmp5', 'Complain', 'Response']
FEATURES_DATE = ['Dt_Customer']

FEATURES_ORD_LEVELS = {
    'Education': ['Basic', 'Graduation', 'Master', '2n Cycle', 'PhD'],
    'Kidhome': [0, 1, 2],
    'Teenhome': [0, 1, 2],
}

## 1.2 Data Preprocessing

### Read

In [3]:
def read_data(path):
    return pd.read_csv(path, sep='\t')

In [4]:
df = read_data(MARKETING_PATH)
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


### Clean

In [5]:
def clean(df, features_to_del):
    df = df.drop(columns=features_to_del)
    return df

In [6]:
df_cleaned = clean(df, FEATURES_TO_DEL) 
df_cleaned.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,...,10,4,7,0,0,0,0,0,0,1
1,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,...,1,2,5,0,0,0,0,0,0,0
2,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,...,2,10,4,0,0,0,0,0,0,0
3,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,...,0,4,6,0,0,0,0,0,0,0
4,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,...,3,6,5,0,0,0,0,0,0,0


### Encode

In [7]:
def encode(df, features_cat, features_date, features_ord_levels):
    def encode_cat(df, features_cat):
        for feature in features_cat:
            df[feature] = df[feature].astype('category')
        return df

    def encode_date(df, features_date):
        for feature in features_date:
            df[feature] = pd.to_datetime(df[feature])
        return df

    def encode_ord(df, features_ord_levels):
        for feature, levels in features_ord_levels.items():
            df[feature] = df[feature].astype(CategoricalDtype(levels, ordered=True))
        return df
    
    df = encode_cat(df, features_cat)
    df = encode_date(df, features_date)
    df = encode_ord(df, features_ord_levels)
    return df

In [8]:
df_encoded = encode(df_cleaned, FEATURES_CAT,  FEATURES_DATE, FEATURES_ORD_LEVELS)
df_encoded.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,1957,Graduation,Single,58138.0,0,0,2012-04-09,58,635,88,...,10,4,7,0,0,0,0,0,0,1
1,1954,Graduation,Single,46344.0,1,1,2014-08-03,38,11,1,...,1,2,5,0,0,0,0,0,0,0
2,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,...,2,10,4,0,0,0,0,0,0,0
3,1984,Graduation,Together,26646.0,1,0,2014-10-02,26,11,4,...,0,4,6,0,0,0,0,0,0,0
4,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,...,3,6,5,0,0,0,0,0,0,0


In [9]:
df_encoded.dtypes

Year_Birth                      int64
Education                    category
Marital_Status               category
Income                        float64
Kidhome                      category
Teenhome                     category
Dt_Customer            datetime64[ns]
Recency                         int64
MntWines                        int64
MntFruits                       int64
MntMeatProducts                 int64
MntFishProducts                 int64
MntSweetProducts                int64
MntGoldProds                    int64
NumDealsPurchases               int64
NumWebPurchases                 int64
NumCatalogPurchases             int64
NumStorePurchases               int64
NumWebVisitsMonth               int64
AcceptedCmp3                 category
AcceptedCmp4                 category
AcceptedCmp5                 category
AcceptedCmp1                 category
AcceptedCmp2                 category
Complain                     category
Response                     category
dtype: objec

### Missing Values

In [10]:
def impute(df):
    return df.dropna()

In [11]:
df_imputed = impute(df_encoded)
df_imputed.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,1957,Graduation,Single,58138.0,0,0,2012-04-09,58,635,88,...,10,4,7,0,0,0,0,0,0,1
1,1954,Graduation,Single,46344.0,1,1,2014-08-03,38,11,1,...,1,2,5,0,0,0,0,0,0,0
2,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,...,2,10,4,0,0,0,0,0,0,0
3,1984,Graduation,Together,26646.0,1,0,2014-10-02,26,11,4,...,0,4,6,0,0,0,0,0,0,0
4,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,...,3,6,5,0,0,0,0,0,0,0


In [12]:
df_imputed.isna().sum().sum()

0

### Summary

In [13]:
def load_data(path, features_to_del, features_cat, features_date, features_features_ord_levels):
    df = read_data(path)
    df = clean(df, features_to_del)
    df = encode(df, features_cat, features_date, features_features_ord_levels)
    df = impute(df)
    return df

In [14]:
df = load_data(MARKETING_PATH, FEATURES_TO_DEL, FEATURES_CAT, FEATURES_DATE, FEATURES_ORD_LEVELS)
df.head()

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,1957,Graduation,Single,58138.0,0,0,2012-04-09,58,635,88,...,10,4,7,0,0,0,0,0,0,1
1,1954,Graduation,Single,46344.0,1,1,2014-08-03,38,11,1,...,1,2,5,0,0,0,0,0,0,0
2,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,...,2,10,4,0,0,0,0,0,0,0
3,1984,Graduation,Together,26646.0,1,0,2014-10-02,26,11,4,...,0,4,6,0,0,0,0,0,0,0
4,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,...,3,6,5,0,0,0,0,0,0,0


## 2 Feature Utility Scores

## 2.1 MI Scores