In [1]:
import torch
import pandas as pd
import sys
import gc

pd.set_option('display.max_column', 100)
pd.set_option('display.max_row', 100)

sys.path.append('../../../fastai/')
sys.path.append('../../')

# embeddings
from fastai.column_data import *
from fastai.structured import *

from pathlib import *

In [2]:
train_dtype_dict={"ip":"int32",
      "app":"int16",
      "device":"int16",
      "os":"int16",
      "channel":"int16",
      "is_attributed":"int8"}

test_dtype_dict={"ip":"int32",
      "app":"int16",
      "device":"int16",
      "os":"int16",
      "channel":"int16",
      "click_id":"int32"}

In [3]:
train = pd.read_csv("../../../data/talking/train_sample.csv",
                    parse_dates=["click_time", "attributed_time"],
                    dtype=train_dtype_dict)

In [4]:
test = pd.read_csv("../../../data/talking/test.csv",
                   parse_dates=["click_time"],
                   skiprows=range(1,18700000),
                   dtype=test_dtype_dict)

In [5]:
train.info("deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
ip                 100000 non-null int32
app                100000 non-null int16
device             100000 non-null int16
os                 100000 non-null int16
channel            100000 non-null int16
click_time         100000 non-null datetime64[ns]
attributed_time    227 non-null datetime64[ns]
is_attributed      100000 non-null int8
dtypes: datetime64[ns](2), int16(4), int32(1), int8(1)
memory usage: 2.8 MB


In [6]:
test.info("deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90470 entries, 0 to 90469
Data columns (total 7 columns):
click_id      90470 non-null int32
ip            90470 non-null int32
app           90470 non-null int16
device        90470 non-null int16
os            90470 non-null int16
channel       90470 non-null int16
click_time    90470 non-null datetime64[ns]
dtypes: datetime64[ns](1), int16(4), int32(2)
memory usage: 2.1 MB


In [7]:
test.click_id.tail()

90465    18790464
90466    18790465
90467    18790467
90468    18790466
90469    18790468
Name: click_id, dtype: int32

In [8]:
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

In [9]:
merged = pd.concat([train[common_cols], test[common_cols]]) # use this when necessary

In [10]:
merged.info("deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190470 entries, 0 to 90469
Data columns (total 6 columns):
ip            190470 non-null int32
app           190470 non-null int16
device        190470 non-null int16
os            190470 non-null int16
channel       190470 non-null int16
click_time    190470 non-null datetime64[ns]
dtypes: datetime64[ns](1), int16(4), int32(1)
memory usage: 5.1 MB


In [11]:
# train data length to separate test
train_len = len(train)

In [12]:
# collec target
is_attributed = train.is_attributed

In [13]:
# delete if not needed
del train
del test
gc.collect()

223

#### 1) Get day-hour - datetime components other's are not necessary

In [14]:
def add_dayhour(data):
    # extract time information
    data['click_time'] = pd.to_datetime(data['click_time'])
    data['click_timeHour'] = data.click_time.dt.hour.astype("int8")
    data['click_timeDay'] = data.click_time.dt.day.astype("int8")
    data.drop('click_time', 1, inplace=True)
    return data

In [15]:
merged = add_dayhour(merged)

In [16]:
merged.head()

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
0,87540,12,1,13,497,9,7
1,105560,25,1,17,259,13,7
2,101424,12,1,19,212,18,7
3,94584,13,1,13,477,4,7
4,68413,12,1,1,178,9,9


#### 2) normalize all cats - cats should have indexes starting from 0

In [17]:
cats = ['ip', 'app', 'device', 'os', 'channel',
'click_timeDay', 'click_timeHour']

In [18]:
def norm_cats(data, cats):
    for c in cats:
        cat2emb = {v:k for k, v in enumerate(data[c].unique())}
        data[c] = data[c].map(cat2emb)
    return data

In [19]:
merged = norm_cats(merged, cats)

In [20]:
merged.head()

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,0
2,2,0,0,2,2,2,0
3,3,2,0,0,3,3,0
4,4,0,0,3,4,0,1


#### 3) Separate data to train, val and test

In [21]:
merged.click_timeDay.unique()

array([0, 1, 2, 3, 4])

In [22]:
merged.click_timeHour.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])

In [23]:
train_lower_limits = (2, 10) # from day 2 hour 10 including
train_upper_limits = (3, 18) # to day 3 hour 18 including

train_msk = ((merged.click_timeDay >= train_lower_limits[0]) & (merged.click_timeHour >= train_lower_limits[1]))\
&\
((merged.click_timeDay <= train_upper_limits[0]) & (merged.click_timeHour <= train_upper_limits[1]))

val_lower_limits = (3, 19) # from day 2 hour 10 including
val_upper_limits = (4, 23) # to day 3 hour 18 including

val_msk = ((merged.click_timeDay >= val_lower_limits[0]) & (merged.click_timeHour >= val_lower_limits[1]))\
&\
((merged.click_timeDay <= val_upper_limits[0]) & (merged.click_timeHour <= val_upper_limits[1]))

In [24]:
# get test, train, val 
test = merged[train_len:]
train = merged[train_msk]
val = merged[val_msk]

In [25]:
print(f"Rows train: {len(train)}, val: {len(val)}, test: {len(test)}")

Rows train: 15897, val: 1709, test: 90470


In [26]:
# delete merged
del merged
gc.collect()

16

In [27]:
train.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
15,15,1,1,2,1,11,2


In [28]:
val.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
337,328,13,0,0,51,23,3


In [29]:
test.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
0,5337,0,0,47,79,9,4


#### 4) Target Mean-Count Encoding

In [30]:
# plug target back for calculations
train["is_attributed"] = is_attributed

In [31]:
from sklearn.model_selection import KFold

In [32]:
def reg_mean_encoding(train, col, new_col, target, splits=10, seed=42):
    """ Computes regularize mean encoding.
        Use this to create mean encoding for training data
        
    Inputs:
        train: training dataframe
        col: a single column as string or list of columns to groupby 
        during mean target encoding
        new_col: name of new created column
        splits: splits to use for cv
    Returns:
        train: dataframe with new column added
    """
    # single column to groupby
    train[new_col] = 0
    if isinstance(col, str):
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            groups = train.iloc[trn_idx].groupby(col)[target].mean()
            train.loc[val_idx, new_col] = train.loc[val_idx, col].map(groups)

    # multiple columns to groupby
    elif isinstance(col, list):        
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            stats = train.iloc[trn_idx].groupby(col)[target].mean().reset_index()
            vals = pd.merge(train.iloc[val_idx], stats, "left", on=col, suffixes=["_", ""])[target]
            vals.index = val_idx
            train.loc[val_idx, new_col] = vals 
    
    train[new_col].fillna(train[new_col].mean(), inplace=True)
    return train

In [33]:
def reg_mean_encoding_test(test, train, col, new_col, target):
    """ Computes target enconding for test data.
        Use this to create mean encoding for valdiation and test data
        Inputs:
            train: training dataframe to compute means
            test: training dataframe to create new column
            col: a single column as string or list of columns
            new_col: name of new created column
        Returns:
            test: dataframe with new column added
    This is similar to how we do validation
    """
    # single column to groupby
    test[new_col] = 0
    if isinstance(col, str):
        test[new_col] = test[col].map(train.groupby(col)[target].mean())
        test[new_col].fillna(train[target].mean(), inplace=True)
    # multiple columns to groupby
    elif isinstance(col, list):
        stats = train.groupby(col)[target].mean().reset_index()
        vals = pd.merge(test, stats, "left", on=col, suffixes=["_", ""])[target]
        test[new_col] = vals 
    
    test[new_col].fillna(train[target].mean(), inplace=True)
    return test

In [34]:
# may add more random columns or column combinations
random_encoding_cols = \
    ['ip', 'app', 'device', 'os', 'channel', 'click_timeHour', 'click_timeDay']+\
    [['ip','click_timeDay','click_timeHour'], ['ip','app'], ['ip','app', 'os'], ['ip','app', 'click_timeHour']]

for c in random_encoding_cols:
    if isinstance(c, list): name = "_".join(c)
    else: name = c
    # regularized mean encoding for train
    train = reg_mean_encoding(train.reset_index(drop=True),
                             c,
                             f'random_mean_encode_{name}',
                             'is_attributed')
    #print("done")
    # regularized mean encoding fo validation
    val = reg_mean_encoding_test(val.reset_index(drop=True),
                                 train.reset_index(drop=True),
                                 c,
                                 f'random_mean_encode_{name}',
                                 'is_attributed')
    #print("done")
    # regularized mean encoding fo test
    val = reg_mean_encoding_test(test,
                                 pd.concat([train.reset_index(drop=True), val.reset_index(drop=True)]),
                                 c,
                                 f'random_mean_encode_{name}',
                                 'is_attributed')
    #print("done")

In [35]:
def reg_count_encoding(train, col, new_col, target, splits=10, seed=42):
    """ Computes regularize count encoding.
        Use this to create count encoding for training data
        
    Inputs:
        train: training dataframe
        col: a single column as string or list of columns to groupby 
        during count target encoding
        new_col: name of new created column
        splits: splits to use for cv
    Returns:
        train: dataframe with new column added
    """
    # single column to groupby
    train[new_col] = 0
    if isinstance(col, str):
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            groups = train.iloc[trn_idx].groupby(col)[target].count()
            train.loc[val_idx, new_col] = train.loc[val_idx, col].map(groups)

    # multiple columns to groupby
    elif isinstance(col, list):        
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            stats = train.iloc[trn_idx].groupby(col)[target].count().reset_index()
            vals = pd.merge(train.iloc[val_idx], stats, "left", on=col, suffixes=["_", ""])[target]
            vals.index = val_idx
            train.loc[val_idx, new_col] = vals 
    
    train[new_col].fillna(train[new_col].count(), inplace=True)
    return train

In [36]:
def reg_count_encoding_test(test, train, col, new_col, target):
    """ Computes target enconding for test data.
        Use this to create count encoding for valdiation and test data
        Inputs:
            train: training dataframe to compute counts
            test: training dataframe to create new column
            col: a single column as string or list of columns
            new_col: name of new created column
        Returns:
            test: dataframe with new column added
    This is similar to how we do validation
    """
    # single column to groupby
    test[new_col] = 0
    if isinstance(col, str):
        test[new_col] = test[col].map(train.groupby(col)[target].count())
        test[new_col].fillna(train[target].count(), inplace=True)
    # multiple columns to groupby
    elif isinstance(col, list):
        stats = train.groupby(col)[target].count().reset_index()
        vals = pd.merge(test, stats, "left", on=col, suffixes=["_", ""])[target]
        test[new_col] = vals 
    
    test[new_col].fillna(train[target].count(), inplace=True)
    return test

In [37]:
# may add more random columns or column combinations
random_encoding_cols = \
    ['ip', 'app', 'device', 'os', 'channel', 'click_timeHour', 'click_timeDay']+\
    [['ip','click_timeDay','click_timeHour'], ['ip','app'], ['ip','app', 'os'], ['ip','app', 'click_timeHour']]

for c in random_encoding_cols:
    if isinstance(c, list): name = "_".join(c)
    else: name = c
    # regularized count encoding for train
    train = reg_count_encoding(train.reset_index(drop=True),
                             c,
                             f'random_count_encode_{name}',
                             'is_attributed')
    #print("done")
    # regularized count encoding fo validation
    val = reg_mean_encoding_test(val.reset_index(drop=True),
                                 train.reset_index(drop=True),
                                 c,
                                 f'random_count_encode_{name}',
                                 'is_attributed')
    #print("done")count encoding fo test
    test = reg_mean_encoding_test(test.reset_index(drop=True),
                                 pd.concat([train.reset_index(drop=True), val.reset_index(drop=True)]),
                                 c,
                                 f'random_count_encode_{name}',
                                 'is_attributed')
    #print("done")

In [39]:
train.head(1).T

Unnamed: 0,0
ip,15.0
app,1.0
device,1.0
os,2.0
channel,1.0
click_timeHour,11.0
click_timeDay,2.0
is_attributed,0.0
random_mean_encode_ip,0.001141
random_mean_encode_app,0.0
