## modules and lgb function 

In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc
import matplotlib as mpl 
mpl.use('Agg')
import matplotlib.pyplot as plt
import os


def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric':metrics,
        'learning_rate': 0.2,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
        'metric':metrics
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[xgtrain, xgvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10, 
                     feval=feval)

    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)
    print(metrics+":", evals_results['valid'][metrics][bst1.best_iteration-1])

    return (bst1,bst1.best_iteration)





In [4]:
debug, gcloud = [1, 0]



if gcloud:
    inputpath = '../data/'
else:
    inputpath = '../../'

if debug:
    val_size = 10000
    frm = 0
    nchunk = 100000
    test_nrows = 100000
else:
    nrows = 184903891 - 1  

    val_size = 2500000
    nchunk = 40000000
    # # nchunk = 90000000 #from 78000000
    frm = nrows - nchunk
    test_nrows = 18790470
    
to = frm + nchunk

# combination = [
#     # (
#     #     [
#     #         [['ip', 'channel'], 4],
#     #         [['app', 'channel'], 4],
#     #         [['ip', 'device', 'os', 'app'], 4],
#     #         [['ip', 'app', 'device', 'os'], 'NC']
#     #     ], 
#     #     [
#     #         # ['app', 'os'],
#     #         # ['app', 'channel'],
#     #         ['channel']
#     #     ],
#     #     'BC8',
#     #     ['ip', 'app','os', 'channel', 'device', 'hour'],
#     # ),
#     # (
#     #     [
#     #         [['ip', 'channel'], 4],
#     #         [['app', 'channel'], 4],
#     #         [['ip', 'device', 'os', 'app'], 4],
#     #         [['ip', 'app', 'device', 'os'], 'NC']
#     #     ], 
#     #     [
#     #         # ['app', 'os'],
#     #         ['app', 'channel'],
#     #         ['channel']
#     #     ],
#     #     'BC9',
#     #     ['ip', 'app','os', 'channel', 'device', 'hour'],
#     # ),
#     # (
#     #     [
#     #         [['ip', 'channel'], 4],
#     #         [['app', 'channel'], 4],
#     #         [['ip', 'device', 'os', 'app'], 4],
#     #         [['ip', 'app', 'device', 'os'], 'NC']
#     #     ], 
#     #     [
#     #         # ['app', 'os'],
#     #         ['app', 'channel'],
#     #         ['channel'],
#     #         ['app']
#     #     ],
#     #     'BC10',
#     #     ['ip', 'app','os', 'channel', 'device', 'hour'],
#     # ),    
#     (
#         [
#             [['ip', 'channel'], 4],
#             [['app', 'channel'], 4],
#             [['ip', 'device', 'os', 'app'], 4],
#             [['ip', 'app', 'device', 'os'], 'NC']
#         ], 
#         [
#             # ['app', 'os'],
#             ['app', 'channel'],
#             ['channel'],
#             ['app'],            
#             ['app', 'device']
#         ],
#         'BC11',
#         ['ip', 'app','os', 'channel', 'device', 'hour'],
#     ),  
#     (
#         [ 
#             [['ip', 'channel'], 4],
#             [['app', 'channel'], 4],
#             [['ip', 'device', 'os', 'app'], 4],
#             [['ip', 'app', 'device', 'os'], 'NC']
#         ], 
#         [
#             # ['app', 'os'],
#             ['app', 'channel'],
#             ['channel'],
#             ['app'],            
#             # ['app', 'device'],
#             ['ip', 'channel']            
#         ],
#         'BC12',
#         ['ip', 'app','os', 'channel', 'device', 'hour'],
#     )      

# ]




# for group, rategroups, fileno, initial_cols in combination:
#     sub=DO(frm,to,test_nrows, group, rategroups, fileno, initial_cols)

groups, rategroups, fileno, initial_cols = (
        [
            [['ip', 'channel'], 4],
            [['app', 'channel'], 4],
            [['ip', 'device', 'os', 'app'], 4],
            [['ip', 'app', 'device', 'os'], 'NC']
        ], 
        [
            # ['app', 'os'],
            ['app', 'channel'],
            ['channel'],
            ['app'],            
            ['app', 'device']
        ],
        'BC11',
        ['ip', 'app','os', 'channel', 'device', 'hour'],
    )

train_frm = frm
train_to = to

    
# def DO(train_frm,train_to, test_nrows, groups, rategroup, fileno, initial_cols=['ip', 'app','device','os', 'channel', 'hour']):


In [12]:
predictors=[]
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32',
        }
print('loading train data...',frm,to)
train_df = pd.read_csv(inputpath + "train.csv", parse_dates=['click_time'], skiprows=range(1,train_frm), nrows=train_to-train_frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

print('loading test data...')
# if debug:
#     test_df = pd.read_csv(inputpath+"test.csv", nrows=100000, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
# else:
test_df = pd.read_csv(inputpath+"test.csv", nrows=test_nrows, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time'
                                                                ])



loading train data... 0 100000
loading test data...


In [6]:
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


In [7]:
test_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time
0,5744,9,1,3,107,2017-11-10 04:00:00
1,119901,9,1,3,466,2017-11-10 04:00:00
2,72287,21,1,19,128,2017-11-10 04:00:00
3,78477,15,1,13,111,2017-11-10 04:00:00
4,123080,12,1,13,328,2017-11-10 04:00:00


In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
ip               100000 non-null uint32
app              100000 non-null uint16
device           100000 non-null uint16
os               100000 non-null uint16
channel          100000 non-null uint16
click_time       100000 non-null datetime64[ns]
is_attributed    100000 non-null uint8
dtypes: datetime64[ns](1), uint16(4), uint32(1), uint8(1)
memory usage: 2.0 MB


In [15]:
file = 'app_confRate.csv'
gp=pd.read_csv(filename)
# train_df = train_df.merge(gp, on=cols, how='left') 
# test_df = test_df.merge(gp, on=cols, how='left') 

In [17]:
gp.head()

Unnamed: 0.1,Unnamed: 0,app,device,app_device_confRate
0,0,0,0,0.0
1,1,1,1,0.0
2,2,1,2,0.0
3,3,1,59,0.0
4,4,2,1,0.000366


In [16]:


# Find frequency of is_attributed for each unique value in column
freqs = {}
for cols in rategroups:
    def rate_calculation(x):
        """Calculate the attributed rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf        

    # New feature name
    new_feature = '_'.join(cols)+'_confRate'  
    predictors.append(new_feature)
    filename = new_feature + '.csv'
    if os.path.exists(filename):
        gp=pd.read_csv(filename)
        print('train cols before: ', train_df.columns)
        train_df = train_df.merge(gp, on=cols, how='left') 
        print('train cols after: ', test_df.columns)
        test_df = test_df.merge(gp, on=cols, how='left') 
        print('train cols before: ', test_df.columns)
        
    else:
        # Perform the groupby
        group_object = train_df.groupby(cols)

        # Group sizes    
        group_sizes = group_object.size()
        log_group = np.log(100000) # 1000 views -> 60% confidence, 100 views -> 40% confidence 
        print(">> Calculating confidence-weighted rate for: {}.\n   Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
            cols, new_feature, 
            group_sizes.max(), 
            np.round(group_sizes.mean(), 2),
            np.round(group_sizes.median(), 2),
            group_sizes.min()
        ))

        # Aggregation function

        gp = group_object['is_attributed'].apply(rate_calculation).reset_index().rename( index=str, columns={'is_attributed': new_feature})[cols + [new_feature]]
        # Perform the merge
        print('train cols before: ', train_df.columns)
        train_df = train_df.merge(gp, on=cols, how='left')
        print('train cols after: ', train_df.columns)
        test_df = test_df.merge(gp, on=cols, how='left')
        gp.to_csv(filename, index=False)
        del gp 

print(train_df.shape)
gc.collect()

print('shape of train: ', train_df.shape)
print('shape of test: ', test_df.shape)
#     print('train.head: ')
#     print(train_df.head())
#     print('test head: ')
#     print(test_df.head())


# del test_df
# gc.collect()


train cols before:  Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed',
       'Unnamed: 0_x', 'app_channel_confRate', 'Unnamed: 0_y',
       'channel_confRate', 'Unnamed: 0_x', 'app_confRate', 'Unnamed: 0_y',
       'app_device_confRate'],
      dtype='object')
train cols after:  Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'Unnamed: 0_x',
       'app_channel_confRate', 'Unnamed: 0_y', 'channel_confRate',
       'Unnamed: 0_x', 'app_confRate', 'Unnamed: 0_y', 'app_device_confRate'],
      dtype='object')
train cols before:  Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'Unnamed: 0_x',
       'app_channel_confRate_x', 'Unnamed: 0_y', 'channel_confRate',
       'Unnamed: 0_x', 'app_confRate', 'Unnamed: 0_y', 'app_device_confRate',
       'Unnamed: 0', 'app_channel_confRate_y'],
      dtype='object')
train cols before:  Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed',
       'Unnamed: 0_x', 'app_channel

In [24]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 15 columns):
ip                      100000 non-null uint32
app                     100000 non-null uint16
device                  100000 non-null uint16
os                      100000 non-null uint16
channel                 100000 non-null uint16
click_time              100000 non-null datetime64[ns]
is_attributed           100000 non-null uint8
Unnamed: 0_x            100000 non-null int64
app_channel_confRate    100000 non-null float64
Unnamed: 0_y            100000 non-null int64
channel_confRate        100000 non-null float64
Unnamed: 0_x            100000 non-null int64
app_confRate            100000 non-null float64
Unnamed: 0_y            100000 non-null int64
app_device_confRate     100000 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(4), uint16(4), uint32(1), uint8(1)
memory usage: 8.9 MB


In [26]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
ip                      100000 non-null uint32
app                     100000 non-null uint16
device                  100000 non-null uint16
os                      100000 non-null uint16
channel                 100000 non-null uint16
click_time              100000 non-null datetime64[ns]
Unnamed: 0_x            89481 non-null float64
app_channel_confRate    89481 non-null float64
Unnamed: 0_y            96481 non-null float64
channel_confRate        96481 non-null float64
Unnamed: 0_x            99694 non-null float64
app_confRate            99694 non-null float64
Unnamed: 0_y            99362 non-null float64
app_device_confRate     99362 non-null float64
dtypes: datetime64[ns](1), float64(8), uint16(4), uint32(1)
memory usage: 8.8 MB


In [28]:
test_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,Unnamed: 0_x,app_channel_confRate,Unnamed: 0_y,channel_confRate,Unnamed: 0_x.1,app_confRate,Unnamed: 0_y.1,app_device_confRate
0,5744,9,1,3,107,2017-11-10 04:00:00,76.0,0.0,11.0,0.000508,8.0,0.000745,17.0,0.000789
1,119901,9,1,3,466,2017-11-10 04:00:00,89.0,0.0,118.0,0.0,8.0,0.000745,17.0,0.000789
2,72287,21,1,19,128,2017-11-10 04:00:00,193.0,0.0,25.0,0.00035,20.0,0.00035,80.0,0.000424
3,78477,15,1,13,111,2017-11-10 04:00:00,153.0,0.0,14.0,0.0,14.0,0.0,32.0,0.0
4,123080,12,1,13,328,2017-11-10 04:00:00,122.0,0.0,75.0,0.0,11.0,0.0,24.0,0.0


In [29]:
len_train = len(train_df)
# train_df=train_df.append(test_df)
# train_df = pd.concat([train_df, test_df], 0)

AssertionError: Number of manager items must equal union of block items
# manager items: 13, # tot_items: 15