In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import time
import numpy as np
import lightgbm as lgb
import os
os.environ['OMP_NUM_THREADS'] = '4'

max_rounds = 1000
early_stop = 50
opt_rounds = 680

output_file = 'lgbm_submit3.csv'

path = "/home/luanhongwei/talk/"

dtypes = {
    'ip'		:'uint32',
    'app'		:'uint16',
	'device'	:'uint16',
	'os'		:'uint16',
	'channel'	:'uint16',
	'is_attributed'	:'uint8',
	'click_id'	:'uint32',
	}

print('Loading train.csv...')

train_cols = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
train_df = pd.read_csv(path + 'train.csv', skiprows=range(1,64903891), nrows=120000000, dtype=dtypes, usecols=train_cols)
#train_df = pd.read_csv(path + 'train.csv', dtype=dtypes, usecols=train_cols)

print('Load test.csv...')
test_cols = ['ip', 'app', 'device', 'os', 'click_time', 'channel', 'click_id']
test_df = pd.read_csv(path + "test.csv", dtype=dtypes, usecols=test_cols)
test_supplement_df = pd.read_csv(path + "test_supplement.csv", dtype=dtypes, usecols=test_cols)

import gc

len_train = len(train_df)

print('Preprocessing...')

most_freq_hours_in_test_data = [4,5,9,10,13,14]
least_freq_hours_in_test_data = [6, 11, 15]

def add_counts(df, cols):
    arr_slice = df[cols].values
    unq, unqtags, counts = np.unique(np.ravel_multi_index(arr_slice.T, arr_slice.max(axis=0)+1),
                                     return_inverse=True, return_counts=True)
    df["_".join(cols)+"_count"] = counts[unqtags]

def add_next_click(df):
    D = 2**26
    df['category'] = (df['ip'].astype(str) + "_" + df['app'].astype(str) + "_" + df['device'].astype(str) \
                      + "_" + df['os'].astype(str)).apply(hash) % D
    click_buffer = np.full(D, 3000000000, dtype=np.uint32)
    df['epochtime'] = df['click_time'].astype(np.int64) // 10 ** 9
    next_clicks = []
    for category, time in zip(reversed(df['category'].values), reversed(df['epochtime'].values)):
        next_clicks.append(click_buffer[category] - time)
        click_buffer[category] = time
    del click_buffer
    df['next_click'] = list(reversed(next_clicks))
    df.drop(['category', 'epochtime'], axis=1, inplace=True)
    
## Below a function is written to extract count feature by aggregating different cols
def do_count( df, group_cols, agg_type='uint32', show_max=False, show_agg=True ):
    agg_name='{}count'.format('_'.join(group_cols))  
    if show_agg:
        print( "\nAggregating by ", group_cols ,  '... and saved in', agg_name )
    gp = df[group_cols][group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
#     print('predictors',predictors)
    gc.collect()
    return( df )

#### Extracting next click feature 
    ### Taken help from https://www.kaggle.com/nanomathias/feature-engineering-importance-testing
    ###Did some Cosmetic changes 
predictors=[]
def do_next_Click( df,agg_suffix='nextClick', agg_type='float32'):
    
    print(f">> \nExtracting {agg_suffix} time calculation features...\n")
    
    GROUP_BY_NEXT_CLICKS = [
    
    # V1
    # {'groupby': ['ip']},
    # {'groupby': ['ip', 'app']},
    # {'groupby': ['ip', 'channel']},
    # {'groupby': ['ip', 'os']},
    
    # V3
    {'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    {'groupby': ['ip', 'os', 'device']},
    {'groupby': ['ip', 'os', 'device', 'app']}
    ]

    # Calculate the time to next click for each group
    for spec in GROUP_BY_NEXT_CLICKS:
    
       # Name of new feature
        new_feature = '{}_{}'.format('_'.join(spec['groupby']),agg_suffix)    
    
        # Unique list of features to select
        all_features = spec['groupby'] + ['click_time']

        # Run calculation
        print(f">> Grouping by {spec['groupby']}, and saving time to {agg_suffix} in: {new_feature}")
        df[new_feature] = (df[all_features].groupby(spec[
            'groupby']).click_time.shift(-1) - df.click_time).dt.seconds.astype(agg_type)
        
        predictors.append(new_feature)
        gc.collect()
    return (df)

def do_prev_Click( df,agg_suffix='prevClick', agg_type='float32'):

    print(f">> \nExtracting {agg_suffix} time calculation features...\n")
    
    GROUP_BY_NEXT_CLICKS = [
    
    # V1
    # {'groupby': ['ip']},
    # {'groupby': ['ip', 'app']},
    {'groupby': ['ip', 'channel']},
    {'groupby': ['ip', 'os']},
    
    # V3
    #{'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    #{'groupby': ['ip', 'os', 'device']},
    #{'groupby': ['ip', 'os', 'device', 'app']}
    ]

    # Calculate the time to next click for each group
    for spec in GROUP_BY_NEXT_CLICKS:
    
       # Name of new feature
        new_feature = '{}_{}'.format('_'.join(spec['groupby']),agg_suffix)    
    
        # Unique list of features to select
        all_features = spec['groupby'] + ['click_time']

        # Run calculation
        print(f">> Grouping by {spec['groupby']}, and saving time to {agg_suffix} in: {new_feature}")
        df[new_feature] = (df.click_time - df[all_features].groupby(spec[
                'groupby']).click_time.shift(+1) ).dt.seconds.astype(agg_type)
        
        predictors.append(new_feature)
        gc.collect()
    return (df)    

##  Below a function is written to extract unique count feature from different cols
def do_countuniq( df, group_cols, counted, agg_type='uint32', show_max=False, show_agg=True ):
    agg_name= '{}_by_{}_countuniq'.format(('_'.join(group_cols)),(counted))  
    if show_agg:
        print( "\nCounting unqiue ", counted, " by ", group_cols ,  '... and saved in', agg_name )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted:agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
#     print('predictors',predictors)
    gc.collect()
    return( df )
### Below a function is written to extract cumulative count feature  from different cols    
def do_cumcount( df, group_cols, counted,agg_type='uint32', show_max=False, show_agg=True ):
    agg_name= '{}_by_{}_cumcount'.format(('_'.join(group_cols)),(counted)) 
    if show_agg:
        print( "\nCumulative count by ", group_cols , '... and saved in', agg_name  )
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name]=gp.values
    del gp
    if show_max:
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    predictors.append(agg_name)
#     print('predictors',predictors)
    gc.collect()
    return( df )
### Below a function is written to extract mean feature  from different cols



def preproc_data(df):
    
    #Extrace date info
    df['click_time']= pd.to_datetime(df['click_time'])
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
#     df['day'] = df['click_time'].dt.day.astype('uint8')
#     df['wday'] = df['click_time'].dt.dayofweek.astype('uint8')
    gc.collect()

    #Groups
#     df['in_test_hh'] = ( 3
# 	    		 - 2 * df['hour'].isin( most_freq_hours_in_test_data )
# 			 - 1 * df['hour'].isin( least_freq_hours_in_test_data )).astype('uint8')

    print('Adding next_click...')
#     add_next_click(df)
    df = do_next_Click(df)
    
    print('Grouping...')
    
    add_counts(df, ['ip'])
    add_counts(df, ['os', 'device'])
    add_counts(df, ['os', 'app', 'channel'])

    add_counts(df, ['ip', 'device'])
    add_counts(df, ['app', 'channel'])

#     add_counts(df, ['ip', 'wday', 'in_test_hh'])
#     add_counts(df, ['ip', 'wday', 'hour'])
#     add_counts(df, ['ip', 'os', 'wday', 'hour'])
#     add_counts(df, ['ip', 'app', 'wday', 'hour'])
#     add_counts(df, ['ip', 'device', 'wday', 'hour'])
#     add_counts(df, ['ip', 'app', 'os'])
#     add_counts(df, ['wday', 'hour', 'app'])
    df = do_countuniq( df, ['ip'], 'channel' ) 
    gc.collect() 
    df = do_countuniq( df, ['ip', 'device', 'os'], 'app')
    gc.collect()
    df = do_countuniq( df, ['ip'], 'app')
    gc.collect() 
    df = do_countuniq( df, ['ip', 'app'], 'os') 
    gc.collect() 
    df = do_countuniq( df, ['ip'], 'device')
    gc.collect() 
    df = do_countuniq( df, ['app'], 'channel')
    gc.collect()

    add_counts(df, ['ip', 'hour']) 
    gc.collect()
    add_counts(df, ['ip', 'os', 'hour']) 
    gc.collect()
    add_counts(df, ['ip', 'app', 'hour']) 
    add_counts(df, ['ip', 'device', 'hour']) 
    gc.collect()
    add_counts(df, ['ip', 'app', 'os']) 
    add_counts(df, ['hour', 'app']) 
    add_counts(df, ['hour', 'channel'])
    gc.collect()
#     df.drop(['ip', 'click_time'], axis=1, inplace=True )
    gc.collect()


    return df

y = train_df.is_attributed.values

submit = pd.DataFrame()
submit['click_id'] = test_df['click_id']

train_len = len(train_df)
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
# train_df = pd.concat([train_df[common_cols], test_df[common_cols]])
train_df = pd.concat([train_df[common_cols], test_supplement_df[common_cols]])
train_df = preproc_data(train_df)

# test_df = train_df.iloc[train_len:]
test_supplement_df = train_df.iloc[train_len:]
train_df = train_df.iloc[:train_len]

gc.collect()

metrics = 'auc'
lgb_params = {
	'boosting_type': 'gbdt',
	'objective': 'binary',
	'metric': metrics,
	'learning_rate': .1,
	'num_leaves': 7,
	'max_depth': 4,
	'min_child_samples': 100,
	'max_bin': 100,
	'subsample': 0.7,
	'subsample_freq': 1,
	'colsample_bytree': 0.7,
	'min_child_weight': 0,
	'min_split_gain': 0,
	'nthread': 4,
	'verbose': 1,
	'scale_pos_weight': 99.7
	#'scale_pos_weight': 400
}

target = 'is_attributed'

inputs = list(set(train_df.columns) - set([target]) - set(['ip']) - set( ['click_time']))
cat_vars = ['app', 'device', 'os', 'channel', 'hour']

train_df, val_df = train_test_split(train_df, train_size=.95, shuffle=False)
y_train, y_val = train_test_split(y, train_size=.95, shuffle=False)

print('Train size:', len(train_df))
print('Valid size:', len(val_df))

gc.collect()

print('Training...')

num_boost_round=max_rounds
early_stopping_rounds=early_stop

xgtrain = lgb.Dataset(train_df[inputs].values, label=y_train,
		      feature_name=inputs,
		      categorical_feature=cat_vars)
del train_df
gc.collect()

xgvalid = lgb.Dataset(val_df[inputs].values, label=y_val,
		      feature_name=inputs,
		      categorical_feature=cat_vars)
del val_df
gc.collect()

evals_results = {}

model = lgb.train(lgb_params,
		  xgtrain,
		  valid_sets= [xgvalid],
		  valid_names=['valid'],
		  evals_result=evals_results,
		  num_boost_round=num_boost_round,
		  early_stopping_rounds=early_stopping_rounds,
		  verbose_eval=1,
		  feval=None)
n_estimators = model.best_iteration

print('\nModel Info:')
print('n_estimators:', n_estimators)
print(metrics+':', evals_results['valid'][metrics][n_estimators-1])

del xgvalid
del xgtrain
gc.collect()


print('Predicting...')
submit['is_attributed'] = model.predict(test_df[inputs],num_iteration=model.best_iteration)
test_supplement_df['is_attributed'] = model.predict(test_supplement_df[predictors], num_iteration=model.best_iteration)
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']
test_df['click_time'] = pd.to_datetime(test_df['click_time'])
test_df = test_df.merge(test_supplement_df[all_cols], how='left', on=join_cols)

test_df = test_df.drop_duplicates(subset=['click_id'])

print("Writing the submission data into a csv file...")

test_df[['click_id', 'is_attributed']].to_csv('subtalk_final.csv', index=False, float_format='%.9f')
model.save_model('model_final.txt')
# print('Creating:', output_file)
# submit.to_csv(output_file, index=False, float_format='%.9f')
print('Done!')

Loading train.csv...
Load test.csv...
Preprocessing...
Adding next_click...
>> 
Extracting nextClick time calculation features...

>> Grouping by ['ip', 'app', 'device', 'os', 'channel'], and saving time to nextClick in: ip_app_device_os_channel_nextClick
>> Grouping by ['ip', 'os', 'device'], and saving time to nextClick in: ip_os_device_nextClick
>> Grouping by ['ip', 'os', 'device', 'app'], and saving time to nextClick in: ip_os_device_app_nextClick
Grouping...

Counting unqiue  channel  by  ['ip'] ... and saved in ip_by_channel_countuniq

Counting unqiue  app  by  ['ip', 'device', 'os'] ... and saved in ip_device_os_by_app_countuniq

Counting unqiue  app  by  ['ip'] ... and saved in ip_by_app_countuniq

Counting unqiue  os  by  ['ip', 'app'] ... and saved in ip_app_by_os_countuniq

Counting unqiue  device  by  ['ip'] ... and saved in ip_by_device_countuniq

Counting unqiue  channel  by  ['app'] ... and saved in app_by_channel_countuniq




Train size: 11400000
Valid size: 600000
Training...




[1]	valid's auc: 0.938313
Training until validation scores don't improve for 50 rounds.
[2]	valid's auc: 0.96411
[3]	valid's auc: 0.967192
[4]	valid's auc: 0.96846
[5]	valid's auc: 0.968129
[6]	valid's auc: 0.968451
[7]	valid's auc: 0.969063
[8]	valid's auc: 0.969823
[9]	valid's auc: 0.970249
[10]	valid's auc: 0.970589
[11]	valid's auc: 0.970934
[12]	valid's auc: 0.971039
[13]	valid's auc: 0.971331
[14]	valid's auc: 0.971451
[15]	valid's auc: 0.971214
[16]	valid's auc: 0.971921
[17]	valid's auc: 0.972052
[18]	valid's auc: 0.972485
[19]	valid's auc: 0.972545
[20]	valid's auc: 0.973474
[21]	valid's auc: 0.974217
[22]	valid's auc: 0.974607
[23]	valid's auc: 0.97506
[24]	valid's auc: 0.975217
[25]	valid's auc: 0.975354
[26]	valid's auc: 0.975193
[27]	valid's auc: 0.975222
[28]	valid's auc: 0.975451
[29]	valid's auc: 0.975873
[30]	valid's auc: 0.975936
[31]	valid's auc: 0.975936
[32]	valid's auc: 0.976187
[33]	valid's auc: 0.97632
[34]	valid's auc: 0.976373
[35]	valid's auc: 0.976842
[36]	v

In [2]:
test_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,hour,ip_app_device_os_channel_nextClick,ip_os_device_nextClick,ip_os_device_app_nextClick,...,ip_app_by_os_countuniq,ip_by_device_countuniq,app_by_channel_countuniq,ip_hour_count,ip_os_hour_count,ip_app_hour_count,ip_device_hour_count,ip_app_os_count,hour_app_count,hour_channel_count
12000000,5744,9,1,3,107,2017-11-10 04:00:00,4,,6.0,,...,16,1,42,34,3,8,34,1,475418,214877
12000001,119901,9,1,3,466,2017-11-10 04:00:00,4,18377.0,10.0,399.0,...,29,8,42,403,17,50,400,6,475418,84232
12000002,72287,21,1,19,128,2017-11-10 04:00:00,4,88.0,66.0,88.0,...,22,8,7,229,69,13,229,31,77207,83619
12000003,78477,15,1,13,111,2017-11-10 04:00:00,4,3181.0,5.0,425.0,...,17,5,29,239,98,10,239,49,209436,14481
12000004,123080,12,1,13,328,2017-11-10 04:00:00,4,1208.0,4.0,1208.0,...,9,3,30,60,27,7,59,7,396028,38063
