In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

In [4]:
clicks = pd.read_parquet('/Users/gurupratap.matharu/Downloads/feature-engineering-data/baseline_data.pqt')
clicks.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,27226,3,1,13,120,2017-11-06 15:13:23,,0,6,15,13,23
1,110007,35,1,13,10,2017-11-06 15:41:07,2017-11-07 08:17:19,1,6,15,41,7
2,1047,6,1,13,157,2017-11-06 15:42:32,,0,6,15,42,32
3,76270,3,1,13,120,2017-11-06 15:56:17,,0,6,15,56,17
4,57862,3,1,13,120,2017-11-06 15:57:01,,0,6,15,57,1


In [6]:
print(f"We have a dataframe with {len(clicks)} records!")

We have a dataframe with 2300561 records!


Utility function to split the dataset

In [7]:
def get_data_splits(dataframe, valid_fraction=0.1):
    """
    Splits a dataframe into train, validation and test sets. First, order by
    the column 'click_time'. Sets the size of the validation and test sets with 
    the valid_fraction keyword argument.
    """
    
    dataframe = dataframe.sort_values('click_time')
    valid_rows = int(len(dataframe) * valid_fraction)
    
    train = dataframe[: -2 * valid_rows]
    valid = dataframe[-2 * valid_rows: -valid_rows]
    test = dataframe[ -valid_rows:]
    
    return train, valid, test

Function to train our LGBM model

In [10]:
def train_model(train, valid, test=None, feature_cols=None):
    """
    Trains a light gbm model and evaluates it score over the valid and test sets.
    """
    
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time', 'is_attributed'])
    
    # create lgbm datasets
    
    dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
    
    param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc', 'seed': 7}
    num_round = 1000
    
    print("Training model...hold on")
    
    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid], 
                    early_stopping_rounds=20, verbose_eval=False)
    
    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
    
    print(f"Validation AUC score: {valid_score}")
          
    if test is not None:
        test_pred = bst.predict(test[feature_cols])
        test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
        print(f"Test AUC score: {test_score}")
        return bst, valid_score, test_score
    else:
        return bst, valid_score

Let's check our baseline model now

In [13]:
print("Baseline model")
train, valid, test = get_data_splits(clicks)
_ = train_model(train=train, valid=valid)

Baseline model
Training model...hold on
Validation AUC score: 0.9622743228943659


Let's try to add `count encodings` 

1. encode each categorical feature using the `count` of each value in the dataset
2. let's use `CountEncoder` from the `category_encoders` library
3. let's learn the encodings only from the train set to avoid data leakage
4. then we'll apply the encodings and add new features with a `_count` suffix to our original dataset

In [14]:
import category_encoders as ce

# 1. set aside categorical features
cat_features = ['ip', 'app', 'device', 'os', 'channel']

# 2. instantiate the count encoder
count_encoder = ce.CountEncoder(cols=cat_features)

# 3. Split the dataset before hand
train, valid, test = get_data_splits(clicks)

# 4. Fit only on the train set to avoid data leakage
count_encoder.fit(train[cat_features])

# 5. Do the transformation on both train and valid sets and join them with a suffix
train_encoded = train.join(count_encoder.transform(train[cat_features]).add_suffix('_count'))
valid_encoded = valid.join(count_encoder.transform(valid[cat_features]).add_suffix('_count'))

Let's check our new features. How do they look?

In [16]:
train_encoded.head(1)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,ip_count,app_count,device_count,os_count,channel_count
0,27226,3,1,13,120,2017-11-06 15:13:23,,0,6,15,13,23,68,292254,1648091,370652,26760


Let's retrain our model to see if accuracy improves...

In [17]:
_ = train_model(train=train_encoded, valid=valid_encoded)

Training model...hold on
Validation AUC score: 0.9653051135205329


Congratulations! There is a slight improvement and any improvement is welcome :)

### Target Encoding

Now we will avoid the `ip` feature. Why?
- it has a lot of variance
- its a highly predictive feature so model depends heavily on it
- for newer data we will get unknown ip addresses and our model will not benefit from them
- lot of values have single or very few instances in our dataset

In [22]:
import category_encoders as ce

# 1. set aside categorical features
cat_features = ['app', 'device', 'os', 'channel']

# 2. instantiate the count encoder
target_encoder = ce.TargetEncoder(cols=cat_features)

# 3. Split the dataset before hand
train, valid, test = get_data_splits(clicks)

# 4. Fit only on the train set to avoid data leakage, also supply the label feature. IMP!!!
target_encoder.fit(X=train[cat_features], y=train['is_attributed'])

# 5. Do the transformation on both train and valid sets and join them with a suffix
train_encoded = train.join(target_encoder.transform(train[cat_features]).add_suffix('_target'))
valid_encoded = valid.join(target_encoder.transform(valid[cat_features]).add_suffix('_target'))

Nice! So now let's re train our lgbm model to see how it performs...

In [23]:
_ = train_model(train=train_encoded, valid=valid_encoded)

Training model...hold on
Validation AUC score: 0.9627457957514338


Ever so slight improvement! Again good

### CatBoost Encoding

In [24]:
import category_encoders as ce

# 1. set aside categorical features
cat_features = ['app', 'device', 'os', 'channel']

# 2. instantiate the count encoder
catboost_encoder = ce.CatBoostEncoder(cols=cat_features, random_state=42)

# 3. Split the dataset before hand
train, valid, test = get_data_splits(clicks)

# 4. Fit only on the train set to avoid data leakage, also supply the label feature. IMP!!!
catboost_encoder.fit(X=train[cat_features], y=train['is_attributed'])

# 5. Do the transformation on both train and valid sets and join them with a suffix
train_encoded = train.join(catboost_encoder.transform(train[cat_features]).add_suffix('_cb'))
valid_encoded = valid.join(catboost_encoder.transform(valid[cat_features]).add_suffix('_cb'))

In [25]:
_ = train_model(train=train_encoded, valid=valid_encoded)

Training model...hold on
Validation AUC score: 0.962868024575231


Okay catboost gave the best score. let's use it to transform our `clicks` dataset now...

In [26]:
clicks = clicks.join(catboost_encoder.transform(clicks[cat_features]).add_suffix('_cb'))
clicks.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,app_cb,device_cb,os_cb,channel_cb
0,27226,3,1,13,120,2017-11-06 15:13:23,,0,6,15,13,23,0.028329,0.152087,0.138712,0.034049
1,110007,35,1,13,10,2017-11-06 15:41:07,2017-11-07 08:17:19,1,6,15,41,7,0.995828,0.152087,0.138712,0.950244
2,1047,6,1,13,157,2017-11-06 15:42:32,,0,6,15,42,32,0.009261,0.152087,0.138712,0.019384
3,76270,3,1,13,120,2017-11-06 15:56:17,,0,6,15,56,17,0.028329,0.152087,0.138712,0.034049
4,57862,3,1,13,120,2017-11-06 15:57:01,,0,6,15,57,1,0.028329,0.152087,0.138712,0.034049
