# Load the dataset

In [None]:
import pandas as pd

# Compact unsigned dtypes for the integer columns of the training file.
train_dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Every typed column plus the raw timestamp; click_time gets no explicit dtype
# here and is converted with pd.to_datetime in a later cell.
train_cols = list(train_dtypes) + ['click_time']

train = pd.read_csv(
    '../input/train.csv',
    usecols=train_cols,
    dtype=train_dtypes,
    engine='c',
    low_memory=True,
    # nrows=5,  # uncomment to load only a handful of rows
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

# Feature engineering: training set

In [None]:
# Derive hour / day / second features from the click timestamp.
# The string column is parsed exactly once and the parsed values are reused
# for all three features instead of re-parsing click_time for each of them.
train_click_time = pd.to_datetime(train['click_time'])

train['hour'] = train_click_time.dt.hour.astype('uint8')
train['day'] = train_click_time.dt.day.astype('uint8')
train['second'] = train_click_time.dt.second.astype('uint8')

# The parsed timestamps are only an intermediate; release them.
del train_click_time

train.head()

In [None]:
import ipaddress

# Split the integer-encoded IPv4 address into its four octets.
# ip_1 is the most significant byte and ip_4 the least significant one, which
# is the same order as the dotted-quad string produced by
# ipaddress.IPv4Address(x). Shifting and masking the whole column at once
# replaces building, stringifying and splitting an IPv4Address object per row
# (previously done four times per row).
# 'ip' is loaded as uint32, so every value is a valid 32-bit address.
train_ip_values = train['ip'].to_numpy()

for octet_position, bit_shift in enumerate((24, 16, 8, 0), start=1):
    train['ip_' + str(octet_position)] = (
        (train_ip_values >> bit_shift) & 0xFF
    ).astype('uint8')

train.head()

In [None]:
# Number of rows (non-null 'channel' values) sharing the same (ip, day, hour).
# transform('count') broadcasts the per-group count straight back onto every
# row, so no intermediate frame and no merge are needed.
# The result is stored as uint32: casting to uint16 wraps around silently as
# soon as a single group holds more than 65,535 rows.
train['n_channels'] = (
    train.groupby(['ip', 'day', 'hour'])['channel']
    .transform('count')
    .astype('uint32')
)

train.head()

In [None]:
# Number of rows (non-null 'channel' values) sharing the same (ip, app) pair,
# counted over the whole frame.
# transform('count') broadcasts the per-group count straight back onto every
# row, so no intermediate frame and no merge are needed.
# The result is stored as uint32: casting to uint16 wraps around silently as
# soon as a single (ip, app) pair occurs more than 65,535 times.
train['ip_app_count'] = (
    train.groupby(['ip', 'app'])['channel']
    .transform('count')
    .astype('uint32')
)

train.head()

In [None]:
# Number of rows (non-null 'channel' values) sharing the same (ip, app, os)
# triple, counted over the whole frame.
# transform('count') broadcasts the per-group count straight back onto every
# row, so no intermediate frame and no merge are needed.
# The result is stored as uint32: casting to uint16 wraps around silently as
# soon as a single group holds more than 65,535 rows.
train['ip_app_os_count'] = (
    train.groupby(['ip', 'app', 'os'])['channel']
    .transform('count')
    .astype('uint32')
)

train.head()

In [None]:
# Persist the engineered training features.
# pandas.DataFrame has no save_binary() method, so the previous call raised
# AttributeError. to_pickle() writes the selected columns to the same path and
# keeps their compact dtypes.
# NOTE(review): save_binary() is a LightGBM Dataset method. If a LightGBM
# binary file was intended, build an lgb.Dataset from these columns and call
# save_binary() on that object instead - confirm the intended format.
train[[
    'ip', 'app', 'device', 'os', 'channel', 'is_attributed',
    'ip_1', 'ip_2', 'ip_3', 'ip_4', 'second',
    'hour', 'day', 'n_channels', 'ip_app_count', 'ip_app_os_count'
]].to_pickle('../input/train_v1.bin')

In [None]:
# The four IPv4 octet columns, shared by both lists below.
_ip_octets = ['ip_%d' % octet for octet in range(1, 5)]

# Columns used as model inputs.
predictors = (
    ['ip', 'device', 'app', 'os', 'channel', 'hour']
    + ['n_channels', 'ip_app_count', 'ip_app_os_count']
    + _ip_octets
    + ['second', 'day']
)

# Subset of columns listed as categorical (the three count columns are not).
categorical = (
    ['ip', 'app', 'device', 'os', 'channel', 'hour', 'day']
    + _ip_octets
    + ['second']
)

# Feature engineering: test set

In [None]:
# Load the complete test file; column dtypes are left to pandas' inference.
# (Pass nrows=50000 to work on a small sample instead.)
test = pd.read_csv(filepath_or_buffer='../input/test.csv')

test.head()

In [None]:
# Derive hour / day / second features from the click timestamp.
# The string column is parsed exactly once and the parsed values are reused
# for all three features instead of re-parsing click_time for each of them.
test_click_time = pd.to_datetime(test['click_time'])

test['hour'] = test_click_time.dt.hour.astype('uint8')
test['day'] = test_click_time.dt.day.astype('uint8')
test['second'] = test_click_time.dt.second.astype('uint8')

# The parsed timestamps are only an intermediate; release them.
del test_click_time

test.head()

In [None]:
import ipaddress

# Split the integer-encoded IPv4 address into its four octets.
# ip_1 is the most significant byte and ip_4 the least significant one, which
# is the same order as the dotted-quad string produced by
# ipaddress.IPv4Address(x). Shifting and masking the whole column at once
# replaces building, stringifying and splitting an IPv4Address object per row
# (previously done four times per row).

# The test file is read without an explicit dtype for 'ip', so check the range
# up front: IPv4Address() rejected out-of-range integers, whereas the bit mask
# below would silently truncate them.
assert test['ip'].between(0, 0xFFFFFFFF).all(), "'ip' holds values outside the IPv4 range"

test_ip_values = test['ip'].to_numpy()

for octet_position, bit_shift in enumerate((24, 16, 8, 0), start=1):
    test['ip_' + str(octet_position)] = (
        (test_ip_values >> bit_shift) & 0xFF
    ).astype('uint8')

test.head()

In [None]:
# Number of rows (non-null 'channel' values) sharing the same (ip, day, hour).
# transform('count') broadcasts the per-group count straight back onto every
# row, so no intermediate frame and no merge are needed.
# The result is stored as uint32: casting to uint16 wraps around silently as
# soon as a single group holds more than 65,535 rows.
test['n_channels'] = (
    test.groupby(['ip', 'day', 'hour'])['channel']
    .transform('count')
    .astype('uint32')
)

test.head()

In [None]:
# Number of rows (non-null 'channel' values) sharing the same (ip, app) pair,
# counted over the whole frame.
# transform('count') broadcasts the per-group count straight back onto every
# row, so no intermediate frame and no merge are needed.
# The result is stored as uint32: casting to uint16 wraps around silently as
# soon as a single (ip, app) pair occurs more than 65,535 times.
test['ip_app_count'] = (
    test.groupby(['ip', 'app'])['channel']
    .transform('count')
    .astype('uint32')
)

test.head()

In [None]:
# Number of rows (non-null 'channel' values) sharing the same (ip, app, os)
# triple, counted over the whole frame.
# transform('count') broadcasts the per-group count straight back onto every
# row, so no intermediate frame and no merge are needed.
# The result is stored as uint32: casting to uint16 wraps around silently as
# soon as a single group holds more than 65,535 rows.
test['ip_app_os_count'] = (
    test.groupby(['ip', 'app', 'os'])['channel']
    .transform('count')
    .astype('uint32')
)

test.head()

In [None]:
# Persist the engineered test features.
test_feature_cols = [
    'ip', 'app', 'device', 'os', 'channel', 'is_attributed',
    'ip_1', 'ip_2', 'ip_3', 'ip_4', 'second',
    'hour', 'day', 'n_channels', 'ip_app_count', 'ip_app_os_count'
]

# NOTE(review): 'is_attributed' is the label column of the training data and
# the cell that loads test.csv does not show that file's columns. Selecting a
# missing column raises KeyError, so only columns actually present are kept -
# confirm whether the label is expected in the test file.
test_available_cols = [col for col in test_feature_cols if col in test.columns]

# pandas.DataFrame has no save_binary() method, so the previous call raised
# AttributeError. to_pickle() writes the selected columns to the same path and
# keeps their dtypes.
# NOTE(review): save_binary() is a LightGBM Dataset method. If a LightGBM
# binary file was intended, build an lgb.Dataset from these columns and call
# save_binary() on that object instead - confirm the intended format.
test[test_available_cols].to_pickle('../input/test.bin')