In [1]:
import torch
import pandas as pd
import sys
import gc

pd.set_option('display.max_column', 100)
pd.set_option('display.max_row', 100)

sys.path.append('../../../fastai/')
sys.path.append('../../')

# embeddings
from fastai.column_data import *
from fastai.structured import *

from pathlib import *

In [2]:
train_dtype_dict={"ip":"int32",
      "app":"int16",
      "device":"int16",
      "os":"int16",
      "channel":"int16",
      "is_attributed":"int8"}

test_dtype_dict={"ip":"int32",
      "app":"int16",
      "device":"int16",
      "os":"int16",
      "channel":"int16",
      "click_id":"int32"}

In [3]:
train_path =  "../../../data/talking/train_sample.csv"
test_path = "../../../data/talking/test.csv"

In [4]:
train = pd.read_csv(train_path,
                    parse_dates=["click_time", "attributed_time"],
                    skiprows=range(1,18700000),
                    dtype=train_dtype_dict)

In [4]:
test = pd.read_csv(test_path,
                   parse_dates=["click_time"],
                   #skiprows=range(1,18700000),
                   dtype=test_dtype_dict)

In [5]:
train.info("deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
ip                 100000 non-null int32
app                100000 non-null int16
device             100000 non-null int16
os                 100000 non-null int16
channel            100000 non-null int16
click_time         100000 non-null datetime64[ns]
attributed_time    227 non-null datetime64[ns]
is_attributed      100000 non-null int8
dtypes: datetime64[ns](2), int16(4), int32(1), int8(1)
memory usage: 2.8 MB


In [6]:
test.info("deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90470 entries, 0 to 90469
Data columns (total 7 columns):
click_id      90470 non-null int32
ip            90470 non-null int32
app           90470 non-null int16
device        90470 non-null int16
os            90470 non-null int16
channel       90470 non-null int16
click_time    90470 non-null datetime64[ns]
dtypes: datetime64[ns](1), int16(4), int32(2)
memory usage: 2.1 MB


In [7]:
test.click_id.tail()

90465    18790464
90466    18790465
90467    18790467
90468    18790466
90469    18790468
Name: click_id, dtype: int32

In [8]:
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

In [9]:
merged = pd.concat([train[common_cols], test[common_cols]]) # use this when necessary

In [10]:
merged.info("deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190470 entries, 0 to 90469
Data columns (total 6 columns):
ip            190470 non-null int32
app           190470 non-null int16
device        190470 non-null int16
os            190470 non-null int16
channel       190470 non-null int16
click_time    190470 non-null datetime64[ns]
dtypes: datetime64[ns](1), int16(4), int32(1)
memory usage: 5.1 MB


In [11]:
# train data length to separate test
train_len = len(train)

In [12]:
# collec target
is_attributed = train.is_attributed

In [13]:
# delete if not needed
del train
del test
gc.collect()

223

#### 1) Get day-hour - datetime components other's are not necessary

In [14]:
def add_dayhour(data):
    # extract time information
    data['click_time'] = pd.to_datetime(data['click_time'])
    data['click_timeHour'] = data.click_time.dt.hour.astype("int8")
    data['click_timeDay'] = data.click_time.dt.day.astype("int8")
    data.drop('click_time', 1, inplace=True)
    return data

In [15]:
merged = add_dayhour(merged)

In [16]:
merged.head()

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
0,87540,12,1,13,497,9,7
1,105560,25,1,17,259,13,7
2,101424,12,1,19,212,18,7
3,94584,13,1,13,477,4,7
4,68413,12,1,1,178,9,9


In [17]:
merged.info("deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190470 entries, 0 to 90469
Data columns (total 7 columns):
ip                190470 non-null int32
app               190470 non-null int16
device            190470 non-null int16
os                190470 non-null int16
channel           190470 non-null int16
click_timeHour    190470 non-null int8
click_timeDay     190470 non-null int8
dtypes: int16(4), int32(1), int8(2)
memory usage: 4.0 MB


#### 2) normalize all cats - cats should have indexes starting from 0

In [18]:
cats = ['ip', 'app', 'device', 'os', 'channel',
'click_timeDay', 'click_timeHour']

In [19]:
def norm_cats(data, cats, dtype="int16"):
    for c in cats:
        cat2emb = {v:k for k, v in enumerate(data[c].unique())}
        data[c] = data[c].map(cat2emb).astype(dtype)
    return data

In [20]:
merged = norm_cats(merged, cats)

In [21]:
merged.head()

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,0
2,2,0,0,2,2,2,0
3,3,2,0,0,3,3,0
4,4,0,0,3,4,0,1


In [22]:
merged.info("deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190470 entries, 0 to 90469
Data columns (total 7 columns):
ip                190470 non-null int16
app               190470 non-null int16
device            190470 non-null int16
os                190470 non-null int16
channel           190470 non-null int16
click_timeHour    190470 non-null int16
click_timeDay     190470 non-null int16
dtypes: int16(7)
memory usage: 4.0 MB


#### 3) Separate data to train, val and test

In [23]:
merged.click_timeDay.unique()

array([0, 1, 2, 3, 4])

In [24]:
merged.click_timeHour.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])

In [25]:
# get test
test = merged[train_len:].reset_index(drop=True)
rest = merged[:train_len].reset_index(drop=True)

In [26]:
# put target back
rest["is_attributed"] = is_attributed

In [27]:
rest.dtypes

ip                int16
app               int16
device            int16
os                int16
channel           int16
click_timeHour    int16
click_timeDay     int16
is_attributed      int8
dtype: object

In [28]:
test.dtypes

ip                int16
app               int16
device            int16
os                int16
channel           int16
click_timeHour    int16
click_timeDay     int16
dtype: object

In [29]:
train_lower_limits = (2, 10) # from day 2 hour 10 including
train_upper_limits = (3, 18) # to day 3 hour 18 including

In [30]:
train_msk = ((rest.click_timeDay >= train_lower_limits[0]) & (rest.click_timeHour >= train_lower_limits[1]))\
&\
((rest.click_timeDay <= train_upper_limits[0]) & (rest.click_timeHour <= train_upper_limits[1]))

val_lower_limits = (3, 19) # from day 2 hour 10 including
val_upper_limits = (4, 23) # to day 3 hour 18 including

val_msk = ((rest.click_timeDay >= val_lower_limits[0]) & (rest.click_timeHour >= val_lower_limits[1]))\
&\
((rest.click_timeDay <= val_upper_limits[0]) & (rest.click_timeHour <= val_upper_limits[1]))

In [31]:
# get test, train, val 
train = rest[train_msk]
val = rest[val_msk]

In [32]:
print(f"Rows train: {len(train)}, val: {len(val)}, test: {len(test)}")

Rows train: 15897, val: 1210, test: 90470


In [33]:
# delete merged
del merged
del rest
gc.collect()

122

In [34]:
train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [35]:
train_val = pd.concat([train, val]).reset_index(drop=True)

In [36]:
train.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay,is_attributed
0,15,1,1,2,1,11,2,0


In [37]:
val.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay,is_attributed
0,328,13,0,0,51,23,3,0


In [38]:
test.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay
0,5337,0,0,47,79,9,4


In [39]:
train_val.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay,is_attributed
0,15,1,1,2,1,11,2,0


#### 4) Target Mean-Count Encoding

In [66]:
len(is_attributed)

100000

In [67]:
len(train), sum(train_msk)

(15897, 15897)

In [68]:
from sklearn.model_selection import KFold

In [69]:
def reg_mean_encoding(train, col, new_col, target, splits=10, seed=42, dtype="float32"):
    """ Computes regularize mean encoding.
        Use this to create mean encoding for training data
        
    Inputs:
        train: training dataframe
        col: a single column as string or list of columns to groupby 
        during mean target encoding
        new_col: name of new created column
        splits: splits to use for cv
    Returns:
        train: dataframe with new column added
    """
    # single column to groupby
    train[new_col] = 0
    if isinstance(col, str):
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            groups = train.iloc[trn_idx].groupby(col)[target].mean()
            train.loc[val_idx, new_col] = train.loc[val_idx, col].map(groups)
            
    # multiple columns to groupby
    elif isinstance(col, list):        
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            stats = train.iloc[trn_idx].groupby(col)[target].mean().reset_index()
            vals = pd.merge(train.iloc[val_idx], stats, "left", on=col, suffixes=["_", ""])[target]
            vals.index = val_idx
            train.loc[val_idx, new_col] = vals
    
    train[new_col].fillna(train[new_col].mean(), inplace=True)
    train[new_col] = train[new_col].astype(dtype)
    return train

In [70]:
def reg_mean_encoding_test(test, train, col, new_col, target, dtype="float32"):
    """ Computes target enconding for test data.
        Use this to create mean encoding for valdiation and test data
        Inputs:
            train: training dataframe to compute means
            test: training dataframe to create new column
            col: a single column as string or list of columns
            new_col: name of new created column
        Returns:
            test: dataframe with new column added
    This is similar to how we do validation
    """
    # single column to groupby
    test[new_col] = 0
    if isinstance(col, str):
        test[new_col] = test[col].map(train.groupby(col)[target].mean())
        test[new_col].fillna(train[target].mean(), inplace=True)
    # multiple columns to groupby
    elif isinstance(col, list):
        stats = train.groupby(col)[target].mean().reset_index()
        vals = pd.merge(test, stats, "left", on=col, suffixes=["_", ""])[target]
        test[new_col] = vals
    
    test[new_col].fillna(train[target].mean(), inplace=True)
    test[new_col] = test[new_col].astype(dtype)
    return test

In [71]:
# may add more random columns or column combinations
random_encoding_cols = \
    ['ip', 'app', 'device', 'os', 'channel', 'click_timeHour', 'click_timeDay']+\
    [['ip','click_timeDay','click_timeHour'], ['ip','app'], ['ip','app', 'os'], ['ip','app', 'click_timeHour']]


    
for c in random_encoding_cols:
    if isinstance(c, list): name = "_".join(c)
    else: name = c
    # regularized mean encoding for train
    train = reg_mean_encoding(train,
                             c,
                             f'random_mean_encode_{name}',
                             'is_attributed')
    #print("done1")
    # regularized mean encoding fo validation
    val = reg_mean_encoding_test(val,
                                 train,
                                 c,
                                 f'random_mean_encode_{name}',
                                 'is_attributed')
    #print("done2")
    # regularized mean encoding fo test
    test = reg_mean_encoding_test(test,
                                 train_val,
                                 c,
                                 f'random_mean_encode_{name}',
                                 'is_attributed')
    #print("done3")
    # encodings for full train: train + val
    train_val = reg_mean_encoding(train_val,
                             c,
                             f'random_mean_encode_{name}',
                             'is_attributed')
    #print("done4")

In [72]:
def reg_count_encoding(train, col, new_col, target, splits=10, seed=42, dtype="int16"):
    """ Computes regularize count encoding.
        Use this to create count encoding for training data
        
    Inputs:
        train: training dataframe
        col: a single column as string or list of columns to groupby 
        during count target encoding
        new_col: name of new created column
        splits: splits to use for cv
    Returns:
        train: dataframe with new column added
    """
    # single column to groupby
    train[new_col] = 0
    if isinstance(col, str):
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            groups = train.iloc[trn_idx].groupby(col)[target].count()
            train.loc[val_idx, new_col] = train.loc[val_idx, col].map(groups)

    # multiple columns to groupby
    elif isinstance(col, list):        
        for split, (trn_idx, val_idx) in enumerate(KFold(splits, shuffle=True, random_state=seed).split(train)):
            stats = train.iloc[trn_idx].groupby(col)[target].count().reset_index()
            vals = pd.merge(train.iloc[val_idx], stats, "left", on=col, suffixes=["_", ""])[target]
            vals.index = val_idx
            train.loc[val_idx, new_col] = vals 
    
    train[new_col].fillna(train[new_col].count(), inplace=True)
    train[new_col] = train[new_col].astype(dtype)
    return train

In [73]:
def reg_count_encoding_test(test, train, col, new_col, target, dtype="int16"):
    """ Computes target enconding for test data.
        Use this to create count encoding for valdiation and test data
        Inputs:
            train: training dataframe to compute counts
            test: training dataframe to create new column
            col: a single column as string or list of columns
            new_col: name of new created column
        Returns:
            test: dataframe with new column added
    This is similar to how we do validation
    """
    # single column to groupby
    test[new_col] = 0
    if isinstance(col, str):
        test[new_col] = test[col].map(train.groupby(col)[target].count())
        test[new_col].fillna(train[target].count(), inplace=True)
    # multiple columns to groupby
    elif isinstance(col, list):
        stats = train.groupby(col)[target].count().reset_index()
        vals = pd.merge(test, stats, "left", on=col, suffixes=["_", ""])[target]
        test[new_col] = vals 
    
    test[new_col].fillna(train[target].count(), inplace=True)
    test[new_col] = test[new_col].astype(dtype)
    return test

In [74]:
# may add more random columns or column combinations
random_encoding_cols = \
    ['ip', 'app', 'device', 'os', 'channel', 'click_timeHour', 'click_timeDay']+\
    [['ip','click_timeDay','click_timeHour'], ['ip','app'], ['ip','app', 'os'], ['ip','app', 'click_timeHour']]

for c in random_encoding_cols:
    if isinstance(c, list): name = "_".join(c)
    else: name = c
    # regularized count encoding for train
    train = reg_count_encoding(train,
                             c,
                             f'random_count_encode_{name}',
                             'is_attributed')
    #print("done")
    # regularized count encoding for validation
    val = reg_count_encoding_test(val,
                                 train,
                                 c,
                                 f'random_count_encode_{name}',
                                 'is_attributed')
    #print("done")count encoding for test
    test = reg_count_encoding_test(test,
                                 train_val,
                                 c,
                                 f'random_count_encode_{name}',
                                 'is_attributed')
    
    # encodings for full train: train + val
    train_val = reg_count_encoding(train_val,
                             c,
                             f'random_count_encode_{name}',
                             'is_attributed')
    #print("done")

In [None]:
dst = "../../../data/talking/"
train.to_feather(dst+"train_prepd.feather")
val.to_feather(dst+"val_prepd.feather")
train_val.to_feather(dst+"train_val_prepd.feather")
test.to_feather(dst+"test_prepd.feather")

In [81]:
train_val.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay,is_attributed,random_mean_encode_ip,random_mean_encode_app,random_mean_encode_device,random_mean_encode_os,random_mean_encode_channel,random_mean_encode_click_timeHour,random_mean_encode_click_timeDay,random_mean_encode_ip_click_timeDay_click_timeHour,random_mean_encode_ip_app,random_mean_encode_ip_app_os,random_mean_encode_ip_app_click_timeHour,random_count_encode_ip,random_count_encode_app,random_count_encode_device,random_count_encode_os,random_count_encode_channel,random_count_encode_click_timeHour,random_count_encode_click_timeDay,random_count_encode_ip_click_timeDay_click_timeHour,random_count_encode_ip_app,random_count_encode_ip_app_os,random_count_encode_ip_app_click_timeHour
0,15,1,1,2,1,11,2,0,0.001294,0.0,0.0,0.002104,0.0,0.003091,0.001775,0.000451,0.0,0.0,0.0,8114,125,520,3802,489,2265,11267,2216,2272,543,559


In [78]:
train.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay,is_attributed,random_mean_encode_ip,random_mean_encode_app,random_mean_encode_device,random_mean_encode_os,random_mean_encode_channel,random_mean_encode_click_timeHour,random_mean_encode_click_timeDay,random_mean_encode_ip_click_timeDay_click_timeHour,random_mean_encode_ip_app,random_mean_encode_ip_app_os,random_mean_encode_ip_app_click_timeHour,random_count_encode_ip,random_count_encode_app,random_count_encode_device,random_count_encode_os,random_count_encode_channel,random_count_encode_click_timeHour,random_count_encode_click_timeDay,random_count_encode_ip_click_timeDay_click_timeHour,random_count_encode_ip_app,random_count_encode_ip_app_os,random_count_encode_ip_app_click_timeHour
0,15,1,1,2,1,11,2,0,0.001141,0.0,0.0,0.001679,0.0,0.002213,0.001955,0.000469,0.0,0.0,0.0,7325,112,475,3574,438,2259,11255,2130,2061,480,554


In [79]:
val.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay,is_attributed,random_mean_encode_ip,random_mean_encode_app,random_mean_encode_device,random_mean_encode_os,random_mean_encode_channel,random_mean_encode_click_timeHour,random_mean_encode_click_timeDay,random_mean_encode_ip_click_timeDay_click_timeHour,random_mean_encode_ip_app,random_mean_encode_ip_app_os,random_mean_encode_ip_app_click_timeHour,random_count_encode_ip,random_count_encode_app,random_count_encode_device,random_count_encode_os,random_count_encode_channel,random_count_encode_click_timeHour,random_count_encode_click_timeDay,random_count_encode_ip_click_timeDay_click_timeHour,random_count_encode_ip_app,random_count_encode_ip_app_os,random_count_encode_ip_app_click_timeHour
0,328,13,0,0,51,23,3,0,0.0,0.001512,0.00158,0.001149,0.0,0.002076,0.001771,0.002076,0.002076,0.002076,0.002076,2,1323,15193,3481,516,15897,3387,15897,15897,15897,15897


In [80]:
test.head(1)

Unnamed: 0,ip,app,device,os,channel,click_timeHour,click_timeDay,random_mean_encode_ip,random_mean_encode_app,random_mean_encode_device,random_mean_encode_os,random_mean_encode_channel,random_mean_encode_click_timeHour,random_mean_encode_click_timeDay,random_mean_encode_ip_click_timeDay_click_timeHour,random_mean_encode_ip_app,random_mean_encode_ip_app_os,random_mean_encode_ip_app_click_timeHour,random_count_encode_ip,random_count_encode_app,random_count_encode_device,random_count_encode_os,random_count_encode_channel,random_count_encode_click_timeHour,random_count_encode_click_timeDay,random_count_encode_ip_click_timeDay_click_timeHour,random_count_encode_ip_app,random_count_encode_ip_app_os,random_count_encode_ip_app_click_timeHour
0,5337,0,0,47,79,9,4,0.002046,0.000448,0.001594,0.055556,0.0,0.002046,0.002046,0.002046,0.002046,0.002046,0.002046,17107,2232,16316,18,72,17107,17107,17107,17107,17107,17107
