In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
from feature_engine import imputation as mdi

In [3]:
import altair as alt
import os
os.getcwd()

data = pd.read_csv('data/marketing_campaign.csv', sep=';')
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [4]:
pd.set_option('display.max_columns', None)



X_train, X_test, y_train, y_test = train_test_split(data.drop(
    ['ID', 'Response'], axis=1),
    data['Response'],
    test_size=0.1,
    random_state=0)

In [5]:
# check cardinality and remove vars with single value
X_train.nunique()

Year_Birth               59
Education                 5
Marital_Status            8
Income                 1800
Kidhome                   3
Teenhome                  3
Dt_Customer             657
Recency                 100
MntWines                746
MntFruits               155
MntMeatProducts         540
MntFishProducts         176
MntSweetProducts        173
MntGoldProds            207
NumDealsPurchases        15
NumWebPurchases          15
NumCatalogPurchases      14
NumStorePurchases        14
NumWebVisitsMonth        15
AcceptedCmp3              2
AcceptedCmp4              2
AcceptedCmp5              2
AcceptedCmp1              2
AcceptedCmp2              2
Complain                  2
Z_CostContact             1
Z_Revenue                 1
dtype: int64

In [6]:
X_train.drop(columns = ['Z_CostContact', 'Z_Revenue'], inplace=True)
X_test.drop(columns = ['Z_CostContact', 'Z_Revenue'], inplace=True)

In [7]:
# cardinality of all variables
# dep var
# other vars
# missing data
# make lists of variable types
year_vars = ['Year_Birth']
dt_vars = ['Dt_Customer']
# tenure with business in months

X_train.isna().mean()
# so none of the data is missing but small % for incode
X_train[X_train['Income'].isna()==True]
# so only need to impute income

# now I want to model as survival model using recency
recency_info = X_train['Recency'].value_counts().reset_index().rename(columns={'Recency':'Cnt', 'index':'Recency'})
alt.Chart(recency_info.tail(10)).mark_bar().encode(
    y='Recency:O',
    x='Cnt:Q'
)

In [8]:
# numeric variables we wish to treat as discrete -> discrete means a numeric var with countable nnumber of events e.g. poisson/binomial etc
discrete = [
    var for var in X_train.columns if X_train[var].dtype != 'O'
        and len(X_train[var].unique()) < 20 
        and var not in year_vars + dt_vars
        and len(X_train[var].unique())>2
        and var != 'Recency'

]

categorical = [
    var for var in X_train.columns if X_train[var].dtype == 'O' 
    and var not in year_vars + dt_vars
    and len(X_train[var].unique())>2
]


numerical = [
    var for var in X_train.columns if X_train[var].dtype != 'O'
    if      var not in discrete 
        and var not in ['ID', 'SalePrice', 'Recency']
        and var not in year_vars + dt_vars
        and len(X_train[var].unique())>2

    
]


ohe_vars = [var for var in  X_train.columns if len(X_train[var].unique()) == 2]



print('There are {} continuous variables'.format(len(numerical)))
print('There are {} discrete variables'.format(len(discrete)))
print('There are {} categorical variables'.format(len(categorical)))
print('There are {} ohe_vars variables'.format(len(ohe_vars)))

There are 7 continuous variables
There are 7 discrete variables
There are 2 categorical variables
There are 6 ohe_vars variables


In [9]:
numerical

['Income',
 'MntWines',
 'MntFruits',
 'MntMeatProducts',
 'MntFishProducts',
 'MntSweetProducts',
 'MntGoldProds']

In [10]:
discrete

['Kidhome',
 'Teenhome',
 'NumDealsPurchases',
 'NumWebPurchases',
 'NumCatalogPurchases',
 'NumStorePurchases',
 'NumWebVisitsMonth']

In [11]:
categorical

['Education', 'Marital_Status']

In [12]:
ohe_vars

['AcceptedCmp3',
 'AcceptedCmp4',
 'AcceptedCmp5',
 'AcceptedCmp1',
 'AcceptedCmp2',
 'Complain']

In [13]:
X_train['Complain'].value_counts()

0    1999
1      17
Name: Complain, dtype: int64

In [None]:
# visualise distributions of continous variables -> check for outliers and decide if you want to discretise into groups 

In [14]:
min_dt_train = pd.to_datetime(X_train['Dt_Customer']).min()
max_dt_train = pd.to_datetime(X_train['Dt_Customer']).max()


def add_age__add_tenure(df):
    # capture difference between year variable and
    # year the house was sold
    # want to then put age into brackets as we can see outliers
    df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])
    df['cust_age'] = max_dt_train.year - df['Year_Birth']
    df['cust_tenure'] = (df['Dt_Customer'].dt.year - min_dt_train.year) * 12
    df['cust_tenure'] = df['cust_tenure'] + (df['Dt_Customer'].dt.month-min_dt_train.month)

    return df

X_train = add_age__add_tenure(X_train)
X_test = add_age__add_tenure(X_test)

# drop YrSold
X_train.drop(year_vars + dt_vars, axis=1, inplace=True)
X_test.drop(year_vars + dt_vars, axis=1, inplace=True)


# discretise vars
discretise_vars = ['cust_age', 'cust_tenure']

In [15]:
X_train[discrete] = X_train[discrete].astype('O')
X_test[discrete] = X_test[discrete].astype('O')

In [16]:
X_train[ohe_vars] = X_train[ohe_vars].astype('O')
X_test[ohe_vars] = X_test[ohe_vars].astype('O')

In [17]:
# get missing column info
X_train[X_train.columns[(X_train.isnull().mean()>0)==True]].isnull().mean()

Income    0.009425
dtype: float64

In [18]:
X_train[categorical].nunique()

Education         5
Marital_Status    8
dtype: int64

In [19]:
X_train[discrete].nunique()

Kidhome                 3
Teenhome                3
NumDealsPurchases      15
NumWebPurchases        15
NumCatalogPurchases    14
NumStorePurchases      14
NumWebVisitsMonth      15
dtype: int64

In [20]:
# one hot encode things or keep things all ordinal to make my life easy for now!

In [280]:
# sklearns pipeline
from sklearn.pipeline import Pipeline

# for feature engineering
from sklearn.preprocessing import StandardScaler
from feature_engine import imputation as mdi
from feature_engine import discretisation as dsc
from feature_engine import encoding as ce
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine import selection as sel

In [22]:
X_train.dtypes

Education               object
Marital_Status          object
Income                 float64
Kidhome                 object
Teenhome                object
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases       object
NumWebPurchases         object
NumCatalogPurchases     object
NumStorePurchases       object
NumWebVisitsMonth       object
AcceptedCmp3            object
AcceptedCmp4            object
AcceptedCmp5            object
AcceptedCmp1            object
AcceptedCmp2            object
Complain                object
cust_age                 int64
cust_tenure              int64
dtype: object

In [29]:
data_prep_pipe = Pipeline([

    # missing data imputation - section 4
    ('missing_ind',
     mdi.AddMissingIndicator(
         variables=['Income'])),

    ('imputer_num',
     mdi.MeanMedianImputer(
         imputation_method='mean',
         variables=['Income'])),
    
    # ohe - no need as only two levels
    # ('ohe',
    # ce.OneHotEncoder(top_categories=2, 
    #                  variables=ohe_vars, 
    #                  drop_last=False)
    #),
    
    # rare label encoding
    ('rare_label_enc',
     ce.RareLabelEncoder(tol=0.01, n_categories=2, variables=categorical + discrete)),
    
    

    # discretise age and tenure
    ('disc_age_tenure',
    dsc.EqualFrequencyDiscretiser(q=10, 
                                  variables=discretise_vars,
                                  return_object=True)),
    # simple ordinal encoder
    ('ordinal_enc',
     ce.OrdinalEncoder(encoding_method='ordered',
                       variables=categorical + discrete + discretise_vars))

])




52389.726089    19
7500.000000     12
35860.000000     4
46098.000000     3
37760.000000     3
                ..
72217.000000     1
36947.000000     1
83033.000000     1
29999.000000     1
55250.000000     1
Name: Income, Length: 1801, dtype: int64

In [30]:
data_prep_pipe.fit_transform(X_train, y_train)[discretise_vars].dtypes

cust_age       int64
cust_tenure    int64
dtype: object

In [54]:
X_train = data_prep_pipe.transform(X_train)

In [55]:
X_train[discrete+categorical+discretise_vars].nunique()

Kidhome                 3
Teenhome                3
NumDealsPurchases       9
NumWebPurchases        13
NumCatalogPurchases    12
NumStorePurchases      13
NumWebVisitsMonth      10
Education               5
Marital_Status          6
cust_age               10
cust_tenure            10
dtype: int64

In [56]:
X_train.nunique()

Education                 5
Marital_Status            6
Income                 1801
Kidhome                   3
Teenhome                  3
Recency                 100
MntWines                746
MntFruits               155
MntMeatProducts         540
MntFishProducts         176
MntSweetProducts        173
MntGoldProds            207
NumDealsPurchases         9
NumWebPurchases          13
NumCatalogPurchases      12
NumStorePurchases        13
NumWebVisitsMonth        10
AcceptedCmp3              2
AcceptedCmp4              2
AcceptedCmp5              2
AcceptedCmp1              2
AcceptedCmp2              2
Complain                  2
cust_age                 10
cust_tenure              10
Income_na                 2
dtype: int64

In [34]:
# ok now we're ready to rock and roll
# so now I want to do the following -> what's my time var -> it was recency so hsould not be touching
# it as it represents survival time!

In [57]:
X_train.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,cust_age,cust_tenure,Income_na
831,2,3,48789.0,2,2,94,351,16,156,7,5,145,3,4,4,10,4,0,0,0,0,0,0,9,9,0
18,3,1,76995.0,2,0,91,1012,80,498,0,16,176,0,12,4,8,1,0,0,0,1,0,0,6,7,0
200,2,2,69142.0,2,0,50,448,4,34,6,4,39,2,7,2,10,1,0,1,0,0,0,0,4,0,0
964,2,1,50183.0,1,0,47,97,12,84,13,10,15,5,6,2,12,1,0,0,0,0,0,0,8,0,0
517,2,3,26095.0,1,2,77,11,7,9,3,1,11,3,2,0,1,3,0,0,0,0,0,0,9,5,0


In [64]:
X_train['Complain'].value_counts()

0    1999
1      17
Name: Complain, dtype: int64

In [59]:
# so we're assuming that these variables do not change over the different recency time points which is a strong assumption to make but it's an ok assumption
# so to create an expanded form view of the data
# we will explode a customer up -> then we set their y-lable time varying that says if time = max time for the person set to their y-label otherwise set to zero
# and there will be some customers that are censored at the end i.e. do not have a y-label
# then we wish to get the hazard for each customer in their last time period only -> i.e haz represents
# prob(T=t_latest/ T>=t_latest) where t_latest represents the latest recency for the customer and says what's the prob they will have the event now that they have the recency that they do 

In [60]:
test = pd.DataFrame({'cust':range(4)})
test['to_explode'] = [np.arange(3) for x in range(len(test))]
test.apply(pd.Series.explode)

Unnamed: 0,cust,to_explode
0,0,0
0,0,1
0,0,2
1,1,0
1,1,1
1,1,2
2,2,0
2,2,1
2,2,2
3,3,0


In [61]:
test = pd.DataFrame({'cust':range(4)})
test['nbr_entries'] = [5, 1, 4, 3]
test['to_explode'] = test['nbr_entries'].apply(lambda x: np.arange(x))
test

Unnamed: 0,cust,nbr_entries,to_explode
0,0,5,"[0, 1, 2, 3, 4]"
1,1,1,[0]
2,2,4,"[0, 1, 2, 3]"
3,3,3,"[0, 1, 2]"


In [63]:
rec_min, rec_max = X_train['Recency'].min(), X_train['Recency'].max()
print(rec_min)
print(rec_max)

0
99


In [192]:
X_train[y_train==1]['Recency'].value_counts()

2     9
3     9
1     8
12    8
40    7
     ..
67    1
66    1
33    1
34    1
99    1
Name: Recency, Length: 91, dtype: int64

In [107]:
X_train[y_train==1].query('Recency<=10')

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,cust_age,cust_tenure,Income_na
2202,4,5,48432.0,2,0,3,322,3,50,4,3,42,4,11,2,12,6,0,0,0,0,0,0,5,9,0
1528,4,3,74004.0,2,2,5,784,48,560,42,176,48,3,5,8,4,5,0,0,1,1,0,0,9,3,0
333,3,1,35178.0,1,2,10,23,1,13,2,2,18,3,2,2,11,3,1,0,0,0,0,0,7,6,0
1001,4,3,95169.0,2,2,1,1285,21,449,106,20,20,3,5,3,4,8,0,0,1,1,0,0,9,4,0
33,4,2,46610.0,2,1,8,96,12,96,33,22,43,6,5,2,12,4,0,0,0,0,0,0,4,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,1,96547.0,2,2,4,448,21,125,52,101,62,8,11,8,5,7,1,0,1,1,0,0,8,0,0
1053,1,2,25959.0,1,0,1,4,2,12,7,5,26,0,2,1,11,4,0,0,0,0,0,0,7,6,0
749,1,1,63564.0,2,2,0,769,80,252,15,34,65,3,10,11,10,4,1,0,0,0,0,0,4,3,0
1329,4,0,71604.0,2,2,3,345,53,528,98,75,97,3,7,3,9,2,1,0,0,0,0,0,6,4,0


In [108]:
y_train.loc[2202]

1

In [94]:

def surv_analysis_df(df_x, df_y):
    df = df_x.copy()
    df['TARGET_TEMP'] = df_y
    df=df.reset_index(drop=True)
    df['time'] = df['Recency'].apply(lambda x: np.arange(x+1))
    df = df.apply(pd.Series.explode).reset_index(drop=True)
    # now set the y to zero and set to 1 in last record case for the cust case
    df['TARGET'] = np.where(df['time'] == df['Recency'], df['TARGET_TEMP'], 0)
    df.drop(columns='TARGET_TEMP', inplace=True)

    X, y = df.drop(columns='TARGET'), df['TARGET']
    return X, y

In [109]:
X_train_surv, y_train_surv = surv_analysis_df(X_train.loc[[2202]], y_train.loc[[2202]])

In [110]:
X_train_surv

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,cust_age,cust_tenure,Income_na,time
0,4,5,48432.0,2,0,3,322,3,50,4,3,42,4,11,2,12,6,0,0,0,0,0,0,5,9,0,0
1,4,5,48432.0,2,0,3,322,3,50,4,3,42,4,11,2,12,6,0,0,0,0,0,0,5,9,0,1
2,4,5,48432.0,2,0,3,322,3,50,4,3,42,4,11,2,12,6,0,0,0,0,0,0,5,9,0,2
3,4,5,48432.0,2,0,3,322,3,50,4,3,42,4,11,2,12,6,0,0,0,0,0,0,5,9,0,3


In [111]:
y_train_surv

0    0
1    0
2    0
3    1
Name: TARGET, dtype: int64

In [335]:
X_train_surv, y_train_surv = surv_analysis_df(X_train, y_train)

In [336]:
X_train_surv.shape

(101219, 27)

In [114]:
X_train_surv.shape

(101219, 27)

(11026, 26)

In [387]:
!pip install h2o



In [388]:
import h2o

In [389]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_281"; Java(TM) SE Runtime Environment (build 1.8.0_281-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.281-b09, mixed mode)
  Starting server from /Users/christopherhassan/opt/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/pr/lsrf5gy96675_8sjyvrw7m440000gn/T/tmpriq50cup
  JVM stdout: /var/folders/pr/lsrf5gy96675_8sjyvrw7m440000gn/T/tmpriq50cup/h2o_christopherhassan_started_from_python.out
  JVM stderr: /var/folders/pr/lsrf5gy96675_8sjyvrw7m440000gn/T/tmpriq50cup/h2o_christopherhassan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.5
H2O_cluster_version_age:,9 months and 17 days !!!
H2O_cluster_name:,H2O_from_python_christopherhassan_zpyztt
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.111 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [390]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# move to 

X_mod, X_val, y_mod, y_val = train_test_split(X_train_surv,
                                            y_train_surv,
                                            test_size=0.3,
                                            random_state=0)



[0;31mSignature:[0m
[0mtrain_test_split[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0marrays[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshuffle[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstratify[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Split arrays or matrices into random train and test subsets

Quick utility that wraps input validation and
``next(ShuffleSplit().split(X, y))`` and application to input data
into a single call for splitting (and optionally subsampling) data in a
oneliner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same

In [392]:
X_mod['TARGET'] = y_mod
X_val['TARGET'] = y_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [396]:
X_mod['TARGET'].value_counts()/len(X_mod)

0    0.99729
1    0.00271
Name: TARGET, dtype: float64

In [397]:
X_val['TARGET'].value_counts()/len(X_val)

0    0.996641
1    0.003359
Name: TARGET, dtype: float64

In [409]:
X_mod.to_csv('data/mod.csv')
X_val.to_csv('data/val.csv')

In [410]:
h2o_mod = h2o.import_file(path = 'data/mod.csv', destination_frame = 'h2o_mod')
h2o_val = h2o.import_file(path = 'data/val.csv', destination_frame = 'h2o_val')

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [411]:
# convert dummy var as factor
h2o_mod['TARGET'] = h2o_mod['TARGET'].asfactor()
h2o_val['TARGET'] = h2o_val['TARGET'].asfactor()

In [420]:
x_vars = [x for x in X_mod.columns if x.find('Recency')==-1]
x_vars

['Education',
 'Marital_Status',
 'Income',
 'Kidhome',
 'Teenhome',
 'MntWines',
 'MntFruits',
 'MntMeatProducts',
 'MntFishProducts',
 'MntSweetProducts',
 'MntGoldProds',
 'NumDealsPurchases',
 'NumWebPurchases',
 'NumCatalogPurchases',
 'NumStorePurchases',
 'NumWebVisitsMonth',
 'AcceptedCmp3',
 'AcceptedCmp4',
 'AcceptedCmp5',
 'AcceptedCmp1',
 'AcceptedCmp2',
 'Complain',
 'cust_age',
 'cust_tenure',
 'Income_na',
 'time',
 'pred_prob',
 'TARGET']

In [421]:
glm_model = H2OGeneralizedLinearEstimator(
                                   family = 'Binomial',
                                   model_id = 'glm_model',
                                   alpha = 1, # lasso regression, set alpha = 0 for ridge
                                   #lambda_ = 0,
                                   lambda_search = True,
                                   standardize = True,
                                   intercept = True)

glm_model.train(x = x_vars, 
                y = 'TARGET',
                training_frame = h2o_mod,
                validation_frame = h2o_val)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [433]:
glm_model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_model


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,Lasso (lambda = 2.79E-4 ),"nlambda = 100, lambda.max = 0.00344, lambda.min = 2.79E-4, lambda....",27,2,44,py_1_sid_9db4




ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.002690250771863874
RMSE: 0.05186762739767334
LogLoss: 0.01680560200307223
Null degrees of freedom: 70852
Residual degrees of freedom: 70850
Null deviance: 2653.2522515384558
Residual deviance: 2381.4546374473534
AIC: 2387.4546374473534
AUC: 0.8208029211068812
AUCPR: 0.01362839260065048
Gini: 0.6416058422137625

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.019314496095362243: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,69941.0,720.0,0.0102,(720.0/70661.0)
1,1,172.0,20.0,0.8958,(172.0/192.0)
2,Total,70113.0,740.0,0.0126,(892.0/70853.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.0193145,0.042918,127.0
1,max f2,0.01042345,0.079717,189.0
2,max f0point5,0.0193145,0.031726,127.0
3,max accuracy,0.08192345,0.997262,0.0
4,max precision,0.07186949,0.045455,10.0
5,max recall,0.0002033451,1.0,396.0
6,max specificity,0.08192345,0.999972,0.0
7,max absolute_mcc,0.006966588,0.073804,226.0
8,max min_per_class_accuracy,0.002902656,0.744792,301.0
9,max mean_per_class_accuracy,0.003407326,0.754911,289.0



Gains/Lift Table: Avg response rate:  0.27 %, avg score:  0.27 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010007,0.01948688,7.807321,7.807321,0.021157,0.033336,0.021157,0.033336,0.078125,0.078125,680.732105,680.732105
1,,2,0.020013,0.01401092,8.848297,8.327809,0.023977,0.016476,0.022567,0.024906,0.088542,0.166667,784.829719,732.780912
2,,3,0.030006,0.01167075,4.691009,7.116683,0.012712,0.012768,0.019285,0.020864,0.046875,0.213542,369.1009,611.668284
3,,4,0.040012,0.0100053,7.286833,7.159235,0.019746,0.010731,0.0194,0.01833,0.072917,0.286458,628.683298,615.923538
4,,5,0.050005,0.008822489,4.691009,6.666008,0.012712,0.009404,0.018064,0.016546,0.046875,0.333333,369.1009,566.600809
5,,6,0.100024,0.005697024,3.332064,4.998801,0.009029,0.006966,0.013546,0.011755,0.166667,0.5,233.206358,399.880062
6,,7,0.150001,0.004339467,2.084304,4.027759,0.005648,0.004955,0.010915,0.009489,0.104167,0.604167,108.43041,302.775883
7,,8,0.200006,0.003518383,1.770658,3.463444,0.004798,0.0039,0.009385,0.008092,0.088542,0.692708,77.06584,246.34439
8,,9,0.300016,0.002513966,0.989486,2.638752,0.002681,0.002968,0.007151,0.006384,0.098958,0.791667,-1.051442,163.875233
9,,10,0.399997,0.001889828,0.573022,2.122411,0.001553,0.002183,0.005751,0.005334,0.057292,0.848958,-42.69782,112.241081




ModelMetricsBinomialGLM: glm
** Reported on validation data. **

MSE: 0.003330631824374735
RMSE: 0.05771162642288584
LogLoss: 0.020303564651430447
Null degrees of freedom: 30365
Residual degrees of freedom: 30363
Null deviance: 1370.0604938848187
Residual deviance: 1233.0760884106744
AIC: 1239.0760884106744
AUC: 0.8225386209202159
AUCPR: 0.02435435297866254
Gini: 0.6450772418404318

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.013622141226659824: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,29591.0,673.0,0.0222,(673.0/30264.0)
1,1,83.0,19.0,0.8137,(83.0/102.0)
2,Total,29674.0,692.0,0.0249,(756.0/30366.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.01362214,0.047859,149.0
1,max f2,0.0106861,0.092838,176.0
2,max f0point5,0.08251127,0.04717,0.0
3,max accuracy,0.08251127,0.996674,0.0
4,max precision,0.08251127,1.0,0.0
5,max recall,0.0008369122,1.0,368.0
6,max specificity,0.08251127,1.0,0.0
7,max absolute_mcc,0.08251127,0.09885,0.0
8,max min_per_class_accuracy,0.002905138,0.744746,298.0
9,max mean_per_class_accuracy,0.002531308,0.757296,309.0



Gains/Lift Table: Avg response rate:  0.34 %, avg score:  0.27 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010011,0.02036652,6.85507,6.85507,0.023026,0.033562,0.023026,0.033562,0.068627,0.068627,585.506966,585.506966
1,,2,0.020022,0.01439873,7.834365,7.344717,0.026316,0.017066,0.024671,0.025314,0.078431,0.147059,683.436533,634.471749
2,,3,0.030001,0.01185641,6.877694,7.189385,0.023102,0.013028,0.024149,0.021228,0.068627,0.215686,587.769365,618.938465
3,,4,0.040012,0.01016459,5.875774,6.860712,0.019737,0.010923,0.023045,0.018649,0.058824,0.27451,487.577399,586.071169
4,,5,0.050023,0.008925294,2.937887,6.07563,0.009868,0.009565,0.020408,0.016831,0.029412,0.303922,193.7887,507.563025
5,,6,0.100013,0.005716369,2.353406,4.215131,0.007905,0.00705,0.014159,0.011942,0.117647,0.421569,135.340618,321.513103
6,,7,0.150003,0.004321494,2.549523,3.66005,0.008564,0.004935,0.012294,0.009607,0.127451,0.54902,154.952337,266.005036
7,,8,0.200026,0.003529232,2.351857,3.332894,0.0079,0.003891,0.011195,0.008178,0.117647,0.666667,135.185687,233.28943
8,,9,0.300007,0.002517966,1.470879,2.712359,0.004941,0.002975,0.009111,0.006444,0.147059,0.813725,47.087887,171.235875
9,,10,0.40002,0.00187609,0.78421,2.230282,0.002634,0.002183,0.007492,0.005379,0.078431,0.892157,-21.578958,123.028199




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test
0,,2021-04-05 13:36:54,0.000 sec,1,0.0034,1,0.037447,0.045118
1,,2021-04-05 13:36:54,0.052 sec,3,0.0031,2,0.036715,0.044232
2,,2021-04-05 13:36:54,0.073 sec,5,0.0029,2,0.036127,0.043523
3,,2021-04-05 13:36:54,0.092 sec,7,0.0026,2,0.035655,0.042957
4,,2021-04-05 13:36:54,0.113 sec,9,0.0024,2,0.035275,0.042503
5,,2021-04-05 13:36:54,0.132 sec,11,0.0022,2,0.034968,0.042139
6,,2021-04-05 13:36:54,0.152 sec,13,0.002,2,0.034722,0.041847
7,,2021-04-05 13:36:54,0.171 sec,15,0.0018,2,0.034519,0.041609
8,,2021-04-05 13:36:54,0.189 sec,17,0.0016,2,0.034354,0.041417
9,,2021-04-05 13:36:54,0.207 sec,19,0.0015,2,0.034219,0.041261



See the whole table with table.as_data_frame()




In [430]:
val_df = glm_model.predict(h2o_val)['p1'].as_data_frame()
val_df['TARGET'] = h2o_val['TARGET'].as_data_frame()
val_df.head()

glm prediction progress: |████████████████████████████████████████████████| 100%


Unnamed: 0,p1,TARGET
0,0.001463,0
1,0.000438,0
2,0.001066,0
3,0.00083,0
4,0.000589,0


In [431]:
val_df['TARGET'].value_counts()

0    30264
1      102
Name: TARGET, dtype: int64

In [432]:
roc_auc_score(val_df['TARGET'].astype(int), val_df['p1'])

0.8222503083972156

In [None]:
# now what is auc at the correc time when 

In [218]:
# now fit glm lasso 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [219]:
X_train_surv_testing = StandardScaler().fit_transform(X_train_surv)
mod_testing = LogisticRegression(random_state=0, max_iter=1000, solver='saga', penalty='l2').fit(X_train_surv_testing, y_train_surv)

In [220]:
mod_testing.predict(X_train_surv_testing).sum()

13

In [221]:
sum(y_train_surv)/len(y_train_surv)

0.0029045930111935504

In [222]:
sum(y_train)/len(y_train)

0.14583333333333334

In [384]:
tuned_parameters = [{'C': np.logspace(-4, -0.5, 20)}]
n_folds = 2
# don't care that the solution doesnt converge -> relative want the C that does best and then later will increase max_iter
lasso_hyper_parameter = Pipeline(steps=[
                                 ('drop_feats', sel.DropFeatures(features_to_drop=['Recency'])),
                                 ('scaler', StandardScaler()),
                                 ('lasso', GridSearchCV(LogisticRegression(random_state=0,
                                                                           solver='saga',
                                                                           penalty='l1'),
                                                        param_grid = tuned_parameters,
                                                        cv = n_folds,
                                                        refit=False)) # i.e. on best param refit
                                ])

In [368]:
lasso_hyper_parameter.fit(X_train_surv, y_train_surv)



Pipeline(steps=[('drop_feats', DropFeatures(features_to_drop=['Recency'])),
                ('scaler', StandardScaler()),
                ('lasso',
                 GridSearchCV(cv=2,
                              estimator=LogisticRegression(penalty='l1',
                                                           random_state=0,
                                                           solver='saga'),
                              param_grid=[{'C': array([1.00000000e-04, 2.44843675e-04, 5.99484250e-04, 1.46779927e-03,
       3.59381366e-03, 8.79922544e-03, 2.15443469e-02, 5.27499706e-02,
       1.29154967e-01, 3.16227766e-01])}],
                              refit=False))])

In [369]:
lasso_hyper_parameter.named_steps['lasso'].best_params_['C']

0.0001

In [370]:
# above takes forever to run so allow for non convergence and pick the best toption
lasso_pipeline = Pipeline(steps=[
                                 ('drop_feats', sel.DropFeatures(features_to_drop=['Recency'])),
                                 ('scaler', StandardScaler()),
                                 # drop recency var as do not want it as a predictor
                                 ('lasso', LogisticRegression(random_state=0,
                                                              max_iter=5000,
                                                              C = lasso_hyper_parameter.named_steps['lasso'].best_params_['C'],
                                                              solver='saga',
                                                              penalty='l1'))
                                ])
lasso_pipeline.fit(X_train_surv, y_train_surv)

# the above takes forever! but it completes -> I think sklearns implementation is slower than h2os -> but why! why! it's simply lasso?!

Pipeline(steps=[('drop_feats', DropFeatures(features_to_drop=['Recency'])),
                ('scaler', StandardScaler()),
                ('lasso',
                 LogisticRegression(C=0.0001, max_iter=5000, penalty='l1',
                                    random_state=0, solver='saga'))])

In [371]:
lasso_pipeline.predict_log_proba(X_train_surv)[:,1]

array([-6.5820425, -6.5820425, -6.5820425, ..., -6.5820425, -6.5820425,
       -6.5820425])

In [372]:
X_train_preds = pd.DataFrame({'logit' : lasso_pipeline.predict_log_proba(X_train_surv)[:, 1]})
X_train_preds['TARGET'] = y_train_surv
X_train_preds.head()

Unnamed: 0,logit,TARGET
0,-6.582043,0
1,-6.582043,0
2,-6.582043,0
3,-6.582043,0
4,-6.582043,0


In [373]:
X_train_preds['PROB'] = np.exp(X_train_preds['logit'])
X_train_preds['PROB2'] = lasso_pipeline.predict_proba(X_train_surv)[:, 1]
X_train_preds.head()

Unnamed: 0,logit,TARGET,PROB,PROB2
0,-6.582043,0,0.001385,0.001385
1,-6.582043,0,0.001385,0.001385
2,-6.582043,0,0.001385,0.001385
3,-6.582043,0,0.001385,0.001385
4,-6.582043,0,0.001385,0.001385


In [374]:
X_train_preds['PROB'].min()

0.001385017499278144

In [375]:
X_train_preds['PROB'].max()

0.001385017499278144

In [376]:
X_train_preds['PROB2'].max()

0.0013850174992781443

In [377]:
len(X_train_preds['PROB'].unique())

1

In [378]:
len(X_train_preds['PROB2'].unique())

1

In [379]:
X_train_preds['PROB'] = X_train_preds['PROB'].round(4)
X_train_preds['PROB2'] = X_train_preds['PROB2'].round(4)
X_train_preds['DIFF'] = (X_train_preds['PROB2']-X_train_preds['PROB']).abs()

In [380]:
X_train_preds['DIFF'].value_counts()

0.0    101219
Name: DIFF, dtype: int64

Unnamed: 0,logit,TARGET,PROB,PROB2,DIFF
0,-81.838687,0,0.0,0.0,0.0
1,-80.997965,0,0.0,0.0,0.0
2,-80.157243,0,0.0,0.0,0.0
3,-79.316522,0,0.0,0.0,0.0
4,-78.4758,0,0.0,0.0,0.0


In [381]:
# i.e. the above literally is the log of the probs NOT the logit which is what you normally have in logistic regression!

In [382]:
from sklearn.metrics import roc_auc_score

In [383]:
roc_auc_score(y_train_surv, X_train_preds['PROB'])

0.5

In [351]:
# now the above is of the incorrect data which has the survival data
# we just want the prediction from the last observation that's all
class Save_Pipeline():
    def __init__(self):
        pass

save_pipeline=Save_Pipeline()

save_pipeline.data_prep_pipe = data_prep_pipe
save_pipeline.lasso_pipeline = lasso_pipeline

In [352]:
# I think h2o is faster than sklearn's lasso regression and it makes more sense! i.e. not this odd C parameter, which has a SVM interpretation
X_train_surv['pred_prob'] = lasso_pipeline.predict_log_proba(X_train_surv)[:, 1]

In [353]:
mask = list(X_train_surv.query('time==Recency').index)

In [354]:
10

10

In [355]:
X_train_surv.head()

Unnamed: 0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,cust_age,cust_tenure,Income_na,time,pred_prob
0,2,3,48789.0,2,2,94,351,16,156,7,5,145,3,4,4,10,4,0,0,0,0,0,0,9,9,0,0,-5.57849
1,2,3,48789.0,2,2,94,351,16,156,7,5,145,3,4,4,10,4,0,0,0,0,0,0,9,9,0,1,-5.572851
2,2,3,48789.0,2,2,94,351,16,156,7,5,145,3,4,4,10,4,0,0,0,0,0,0,9,9,0,2,-5.567213
3,2,3,48789.0,2,2,94,351,16,156,7,5,145,3,4,4,10,4,0,0,0,0,0,0,9,9,0,3,-5.561574
4,2,3,48789.0,2,2,94,351,16,156,7,5,145,3,4,4,10,4,0,0,0,0,0,0,9,9,0,4,-5.555935


In [356]:

roc_auc_score(y_train_surv.loc[mask], X_train_surv.loc[mask]['pred_prob'])

0.8514600962336154

In [357]:
# so the true roc is 0.90 for this simple example on train!

In [358]:
# now at test time need to at a new column to the data which was time! or replace Recency with time! also need things to be in the same column order!
# this is where things get complicated! in an unnennessary sense!

In [359]:
set(X_train_surv.columns).difference(set(X_test_surv))

{'pred_prob'}

In [361]:
10

10

In [362]:
X_test.shape

(224, 26)

In [363]:
X_train.shape

(2016, 26)

In [364]:
X_train_surv.shape

(101219, 28)

In [365]:
X_test_surv.shape

(11026, 27)

In [366]:
X_test_surv['pred_prob'] = lasso_pipeline.predict_log_proba(X_test_surv)[:, 1]
mask = list(X_test_surv.query('time==Recency').index)
roc_auc_score(y_test_surv.loc[mask], X_test_surv.loc[mask]['pred_prob'])

0.7843749999999999