In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
from feature_engine import imputation as mdi

In [72]:
import altair as alt
import os
# Load the raw marketing-campaign data. The file is ';'-delimited and the path
# is relative to the notebook's working directory.
data = pd.read_csv('data/marketing_campaign.csv', sep=';')
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


In [232]:
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Hold out 10% of the rows for testing. 'Response' is the target; it is removed
# from the features together with 'ID'. random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['ID', 'Response']),
    data['Response'],
    test_size=0.1,
    random_state=0,
)

In [233]:
# Count distinct values per feature in the training set. A column with a single
# distinct value is constant and is a candidate for removal.
X_train.nunique()

Year_Birth               59
Education                 5
Marital_Status            8
Income                 1800
Kidhome                   3
Teenhome                  3
Dt_Customer             657
Recency                 100
MntWines                746
MntFruits               155
MntMeatProducts         540
MntFishProducts         176
MntSweetProducts        173
MntGoldProds            207
NumDealsPurchases        15
NumWebPurchases          15
NumCatalogPurchases      14
NumStorePurchases        14
NumWebVisitsMonth        15
AcceptedCmp3              2
AcceptedCmp4              2
AcceptedCmp5              2
AcceptedCmp1              2
AcceptedCmp2              2
Complain                  2
Z_CostContact             1
Z_Revenue                 1
dtype: int64

In [234]:
# Z_CostContact and Z_Revenue each hold a single distinct value in the training
# set (see the nunique() output above), so they carry no information.
# Reassigning instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without a KeyError for already-dropped columns.
constant_vars = ['Z_CostContact', 'Z_Revenue']
X_train = X_train.drop(columns=constant_vars, errors='ignore')
X_test = X_test.drop(columns=constant_vars, errors='ignore')

In [235]:
# Notes / to-do for this exploration:
#   - cardinality of all variables
#   - dependent variable
#   - other variables
#   - missing data
#   - make lists of variable types

# Columns that get special handling later instead of being used as-is.
year_vars = ['Year_Birth']
dt_vars = ['Dt_Customer']  # used to derive tenure with the business, in months

# Share of missing values per column. Only the final expression of a cell is
# rendered, so the next two expressions show nothing unless run on their own.
X_train.isna().mean()
# Only Income has missing values, and only for a small percentage of rows.
X_train[X_train['Income'].isna()]
# So Income is the only variable that needs imputing.

# Now I want to model this as a survival model, using Recency.
# rename_axis(...) + reset_index(name=...) yields the columns
# ['Recency', 'Cnt'] on every pandas version. A bare reset_index() names the
# value column 'index' on older pandas but 'Recency' on pandas >= 2.0, so a
# rename-based approach silently produces the wrong columns there.
recency_info = (
    X_train['Recency']
    .value_counts()
    .rename_axis('Recency')
    .reset_index(name='Cnt')
)
# value_counts() sorts by descending count, so tail(10) holds the 10 least
# frequent Recency values.
alt.Chart(recency_info.tail(10)).mark_bar().encode(
    y='Recency:O',
    x='Cnt:Q'
)

In [236]:
# Bucket every training column by how it will be treated downstream:
#   ohe_vars    - exactly two distinct values (any dtype)
#   categorical - object dtype with more than two distinct values
#   discrete    - numeric with 3 to 19 distinct values, i.e. a countable number
#                 of outcomes (e.g. Poisson/binomial-like counts)
#   numerical   - the remaining numeric columns
# Year/date columns never enter discrete, categorical or numerical, and
# 'Recency' is kept out of both discrete and numerical.
special_vars = year_vars + dt_vars
# 'ID' was dropped at the split and 'SalePrice' is not a column of this data
# set; the exclusion list is nevertheless kept exactly as it was.
not_numerical = ['ID', 'SalePrice', 'Recency']

discrete, categorical, numerical, ohe_vars = [], [], [], []
for col in X_train.columns:
    # len(.unique()) counts NaN as a distinct value, unlike .nunique().
    n_distinct = len(X_train[col].unique())

    if n_distinct == 2:
        ohe_vars.append(col)
    if n_distinct <= 2 or col in special_vars:
        continue

    if X_train[col].dtype == 'O':
        categorical.append(col)
    elif n_distinct < 20 and col != 'Recency':
        discrete.append(col)
    elif col not in not_numerical:
        numerical.append(col)

print('There are {} continuous variables'.format(len(numerical)))
print('There are {} discrete variables'.format(len(discrete)))
print('There are {} categorical variables'.format(len(categorical)))
print('There are {} ohe_vars variables'.format(len(ohe_vars)))

There are 7 continuous variables
There are 7 discrete variables
There are 2 categorical variables
There are 6 ohe_vars variables


In [237]:
# Columns classified as continuous.
numerical

['Income',
 'MntWines',
 'MntFruits',
 'MntMeatProducts',
 'MntFishProducts',
 'MntSweetProducts',
 'MntGoldProds']

In [238]:
# Numeric columns classified as discrete (3 to 19 distinct values).
discrete

['Kidhome',
 'Teenhome',
 'NumDealsPurchases',
 'NumWebPurchases',
 'NumCatalogPurchases',
 'NumStorePurchases',
 'NumWebVisitsMonth']

In [239]:
# Object-dtype columns with more than two distinct values.
categorical

['Education', 'Marital_Status']

In [240]:
# Columns with exactly two distinct values; these get one-hot encoded.
ohe_vars

['AcceptedCmp3',
 'AcceptedCmp4',
 'AcceptedCmp5',
 'AcceptedCmp1',
 'AcceptedCmp2',
 'Complain']

In [241]:
# Distribution of the Complain flag in the training set.
X_train['Complain'].value_counts()

0    1999
1      17
Name: Complain, dtype: int64

In [242]:
# Earliest and latest Dt_Customer values in the training set. They are the
# reference points used when deriving cust_age and cust_tenure.
dt_customer_train = pd.to_datetime(X_train['Dt_Customer'])
min_dt_train = dt_customer_train.min()
max_dt_train = dt_customer_train.max()


def add_age__add_tenure(df, min_dt=None, max_dt=None):
    """Return a copy of ``df`` with ``cust_age`` and ``cust_tenure`` added.

    ``cust_age`` is the year of ``max_dt`` minus ``Year_Birth``.
    ``cust_tenure`` is the number of calendar months between ``min_dt`` and
    each row's ``Dt_Customer`` (year difference * 12 + month difference; the
    day of the month is ignored).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Year_Birth`` and ``Dt_Customer``.
    min_dt, max_dt : datetime-like with ``.year`` / ``.month``, optional
        Reference dates. When omitted they fall back to the module-level
        ``min_dt_train`` / ``max_dt_train`` computed from the training set, so
        train and test frames are measured against the same reference.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``Dt_Customer`` is converted to datetime and the two
        derived columns are appended. The input frame is left untouched.
    """
    if min_dt is None:
        min_dt = min_dt_train
    if max_dt is None:
        max_dt = max_dt_train

    # Work on a copy so the caller's frame is not modified as a side effect.
    out = df.copy()
    out['Dt_Customer'] = pd.to_datetime(out['Dt_Customer'])
    out['cust_age'] = max_dt.year - out['Year_Birth']
    out['cust_tenure'] = (
        (out['Dt_Customer'].dt.year - min_dt.year) * 12
        + (out['Dt_Customer'].dt.month - min_dt.month)
    )

    return out

X_train = add_age__add_tenure(X_train)
X_test = add_age__add_tenure(X_test)

# The raw birth-year and date columns are now represented by cust_age and
# cust_tenure, so remove them from both frames.
raw_date_vars = year_vars + dt_vars
X_train = X_train.drop(columns=raw_date_vars)
X_test = X_test.drop(columns=raw_date_vars)

# Derived variables that the pipeline will discretise.
discretise_vars = ['cust_age', 'cust_tenure']

In [243]:
# Cast the discrete numeric variables to object dtype in both frames
# (presumably so the categorical encoders in the pipeline accept them).
for frame in (X_train, X_test):
    frame[discrete] = frame[discrete].astype(object)

In [244]:
# Cast the two-valued variables to object dtype in both frames
# (presumably so the one-hot encoder in the pipeline accepts them).
for frame in (X_train, X_test):
    frame[ohe_vars] = frame[ohe_vars].astype(object)

In [245]:
# Share of missing values, restricted to the columns that have any.
missing_share = X_train.isnull().mean()
missing_share[missing_share > 0]

Income    0.009425
dtype: float64

In [246]:
# Number of distinct levels in each categorical variable (training set).
X_train[categorical].nunique()

Education         5
Marital_Status    8
dtype: int64

In [247]:
# Number of distinct levels in each discrete variable (training set).
X_train[discrete].nunique()

Kidhome                 3
Teenhome                3
NumDealsPurchases      15
NumWebPurchases        15
NumCatalogPurchases    14
NumStorePurchases      14
NumWebVisitsMonth      15
dtype: int64

In [248]:
# one hot encode things or keep things all ordinal to make my life easy for now!

In [249]:
# sklearns pipeline
from sklearn.pipeline import Pipeline

# for feature engineering
from sklearn.preprocessing import StandardScaler
from feature_engine import imputation as mdi
from feature_engine import discretisation as dsc
from feature_engine import encoding as ce
from feature_engine.discretisation import EqualFrequencyDiscretiser

In [250]:
# Column dtypes going into the pipeline, after the object casts above.
X_train.dtypes

Education               object
Marital_Status          object
Income                 float64
Kidhome                 object
Teenhome                object
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases       object
NumWebPurchases         object
NumCatalogPurchases     object
NumStorePurchases       object
NumWebVisitsMonth       object
AcceptedCmp3            object
AcceptedCmp4            object
AcceptedCmp5            object
AcceptedCmp1            object
AcceptedCmp2            object
Complain                object
cust_age                 int64
cust_tenure              int64
dtype: object

In [251]:
# Variable groups shared by the encoding steps.
rare_label_vars = categorical + discrete
ordinal_vars = categorical + discrete + discretise_vars

data_prep_pipe = Pipeline(steps=[
    # Missing data: flag the rows where Income is missing (adds Income_na) ...
    ('missing_ind', mdi.AddMissingIndicator(variables=['Income'])),
    # ... then fill the gaps in Income with the mean.
    ('imputer_num', mdi.MeanMedianImputer(imputation_method='mean', variables=['Income'])),

    # One-hot encode the two-valued variables; a column is produced for each
    # of the two levels (e.g. Complain_0 and Complain_1).
    ('ohe', ce.OneHotEncoder(top_categories=2, variables=ohe_vars, drop_last=False)),

    # Rare-label encoding of the categorical and discrete variables
    # (tol=0.01 is the frequency threshold passed to the encoder).
    ('rare_label_enc', ce.RareLabelEncoder(tol=0.01, n_categories=2, variables=rare_label_vars)),

    # Discretise age and tenure into q=10 equal-frequency bins. return_object=True
    # is set so the following encoder can handle the binned variables
    # (see the feature_engine docs for the exact dtype semantics).
    ('disc_age_tenure', dsc.EqualFrequencyDiscretiser(q=10, variables=discretise_vars, return_object=True)),

    # Simple ordinal encoding of everything categorical-like. The 'ordered'
    # method uses the target, so the pipeline must be fitted with y.
    ('ordinal_enc', ce.OrdinalEncoder(encoding_method='ordered', variables=ordinal_vars)),
])

In [252]:
# Fit the pipeline on the training data and check the dtypes of the discretised
# variables after the final encoding step. NOTE: this call is what fits
# data_prep_pipe; the transform() in the following cell relies on it.
data_prep_pipe.fit_transform(X_train, y_train)[discretise_vars].dtypes

cust_age       int64
cust_tenure    int64
dtype: object

In [253]:
# Apply the already-fitted pipeline to the training features.
X_train_2 = data_prep_pipe.transform(X_train)

In [254]:
# Distinct values per encoded variable after the pipeline has been applied.
X_train_2[discrete+categorical+discretise_vars].nunique()

Kidhome                 3
Teenhome                3
NumDealsPurchases       9
NumWebPurchases        13
NumCatalogPurchases    12
NumStorePurchases      13
NumWebVisitsMonth      10
Education               5
Marital_Status          6
cust_age               10
cust_tenure            10
dtype: int64

In [255]:
# Distinct values for every column of the transformed training set.
X_train_2.nunique()

Education                 5
Marital_Status            6
Income                 1801
Kidhome                   3
Teenhome                  3
Recency                 100
MntWines                746
MntFruits               155
MntMeatProducts         540
MntFishProducts         176
MntSweetProducts        173
MntGoldProds            207
NumDealsPurchases         9
NumWebPurchases          13
NumCatalogPurchases      12
NumStorePurchases        13
NumWebVisitsMonth        10
cust_age                 10
cust_tenure              10
Income_na                 2
AcceptedCmp3_0            2
AcceptedCmp3_1            2
AcceptedCmp4_0            2
AcceptedCmp4_1            2
AcceptedCmp5_0            2
AcceptedCmp5_1            2
AcceptedCmp1_0            2
AcceptedCmp1_1            2
AcceptedCmp2_0            2
AcceptedCmp2_1            2
Complain_0                2
Complain_1                2
dtype: int64

In [256]:
# ok now we're ready to rock and roll
# so now I want to do the following -> what's my time var -> it was Recency, so I should not be touching
# it, as it represents survival time!

In [257]:
# Preview the first rows of the transformed training set.
X_train_2.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,cust_age,cust_tenure,Income_na,AcceptedCmp3_0,AcceptedCmp3_1,AcceptedCmp4_0,AcceptedCmp4_1,AcceptedCmp5_0,AcceptedCmp5_1,AcceptedCmp1_0,AcceptedCmp1_1,AcceptedCmp2_0,AcceptedCmp2_1,Complain_0,Complain_1
831,2,3,48789.0,2,2,94,351,16,156,7,5,145,3,4,4,10,4,9,9,0,1,0,1,0,1,0,1,0,1,0,1,0
18,3,1,76995.0,2,0,91,1012,80,498,0,16,176,0,12,4,8,1,6,7,0,1,0,1,0,1,0,0,1,1,0,1,0
200,2,2,69142.0,2,0,50,448,4,34,6,4,39,2,7,2,10,1,4,0,0,1,0,0,1,1,0,1,0,1,0,1,0
964,2,1,50183.0,1,0,47,97,12,84,13,10,15,5,6,2,12,1,8,0,0,1,0,1,0,1,0,1,0,1,0,1,0
517,2,3,26095.0,1,2,77,11,7,9,3,1,11,3,2,0,1,3,9,5,0,1,0,1,0,1,0,1,0,1,0,1,0
