#### Data

In [69]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from time import time
from tqdm import tqdm_notebook as tqdm
import datetime

In [70]:
train = pd.read_csv('./train/train.csv')

In [71]:
view_log = pd.read_csv('./train/view_log.csv')

In [72]:
item_data = pd.read_csv('./train/item_data.csv')

In [73]:
test = pd.read_csv('./test/test.csv')

In [74]:
def label_encode(df_train,df_test):
    for col in list(df_train.columns):
        if col in ['impression_id','impression_time']:
            continue
        if df_train[col].dtype == 'object':
            try:
                lbenc = LabelEncoder()
                lbenc.fit(df_train[col])
                df_train[col] = pd.Series(lbenc.transform(df_train[col].values))
                df_test[col] = pd.Series(lbenc.transform(df_test[col].values))
            except:
                print(col)
                raise ValueError('Erro')
    return df_train, df_test

In [75]:
def find_time_of_day(x):
    y = int(x.split()[1].split(':')[0])
    if y >= 6 and y < 10:
        return 0
    elif y >= 10 and y < 14:
        return 1
    elif y >= 14 and y < 18:
        return 2
    elif y >= 18 and y < 22:
        return 3
    elif y >= 22 and y < 2:
        return 4
    else:
        return 5

In [76]:
def cleaning(df):
    df['month'] = df['impression_time'].apply(lambda x: int(x.split()[0].split('-')[1]))
    df['day'] = df['impression_time'].apply(lambda x: int(x.split()[0].split('-')[2]))
    df['time_of_day'] = df['impression_time'].apply(find_time_of_day)
    return df

In [77]:
train = cleaning(train)
test = cleaning(test)

In [78]:
train.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,month,day,time_of_day
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0,11,15,5
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1,11,15,5
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0,11,15,5
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0,11,15,5
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0,11,15,5


In [79]:
item_data.head()

Unnamed: 0,item_id,item_price,category_1,category_2,category_3,product_type
0,26880,4602,11,35,20,3040
1,54939,3513,12,57,85,6822
2,40383,825,17,8,279,1619
3,8777,2355,13,58,189,5264
4,113705,1267,17,39,151,10239


In [80]:
view_log.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id
0,2018-10-15 08:58:00,android,112333,4557,32970
1,2018-10-15 08:58:00,android,503590,74788,7640
2,2018-10-15 08:58:00,android,573960,23628,128855
3,2018-10-15 08:58:00,android,121691,2430,12774
4,2018-10-15 08:58:00,android,218564,19227,28296


In [81]:
test.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,month,day,time_of_day
0,a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,latest,1,12,13,0
1,caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,latest,0,12,13,0
2,13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,latest,1,12,13,0
3,39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,latest,1,12,13,0
4,bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,latest,1,12,13,0


In [82]:
test.shape

(90675, 9)

In [83]:
merged_data = view_log.merge(item_data,how='left',on='item_id')

In [84]:
train.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,month,day,time_of_day
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0,11,15,5
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1,11,15,5
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0,11,15,5
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0,11,15,5
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0,11,15,5


In [85]:
merged_data.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id,item_price,category_1,category_2,category_3,product_type
0,2018-10-15 08:58:00,android,112333,4557,32970,54685.0,16.0,56.0,253.0,3184.0
1,2018-10-15 08:58:00,android,503590,74788,7640,1376.0,7.0,71.0,228.0,545.0
2,2018-10-15 08:58:00,android,573960,23628,128855,4544.0,4.0,38.0,62.0,5609.0
3,2018-10-15 08:58:00,android,121691,2430,12774,904.0,17.0,39.0,252.0,2740.0
4,2018-10-15 08:58:00,android,218564,19227,28296,2304.0,12.0,57.0,54.0,7422.0


In [86]:
gp = merged_data.groupby('user_id')

In [87]:
cat_1 = merged_data.groupby('user_id')['category_1'].agg(pd.Series.mode)

In [88]:
prod = merged_data.groupby('user_id')['product_type'].agg(pd.Series.mode)

In [93]:
train,test = label_encode(train,test)

In [94]:
def feat_engg(df_train, df_test):
    mean_ = pd.Series(merged_data.groupby('user_id')['item_price'].agg(np.mean))
    max_ = pd.Series(merged_data.groupby('user_id')['item_price'].agg(np.max))
    min_ = pd.Series(merged_data.groupby('user_id')['item_price'].agg(np.min))
    # mode_ = pd.DataFrame(merged_data.groupby('user_id')['device_type'].agg(pd.Series.mode))
    df_train = df_train.merge(mean_,how='left',on='user_id')
    df_train = df_train.merge(max_,how='left',on='user_id')
    df_train = df_train.merge(min_,how='left',on='user_id')
    # df_train = df_train.merge(mode_,how='left',on='user_id')
    df_test = df_test.merge(mean_,how='left',on='user_id')
    df_test = df_test.merge(max_,how='left',on='user_id')
    df_test = df_test.merge(min_,how='left',on='user_id')
    # df_test = df_test.merge(mode_,how='left',on='user_id')
    return df_train, df_test

In [95]:
train,test = feat_engg(train,test)

In [96]:
train.dtypes

impression_id       object
impression_time     object
user_id              int64
app_code             int64
os_version           int64
is_4G                int64
is_click             int64
month                int64
day                  int64
time_of_day          int64
item_price_x       float64
item_price_y       float64
item_price_x       float64
item_price_y       float64
item_price_x       float64
item_price_y       float64
dtype: object

In [98]:
for i in train.columns:
    print(type(train[i]), i)

<class 'pandas.core.series.Series'> impression_id
<class 'pandas.core.series.Series'> impression_time
<class 'pandas.core.series.Series'> user_id
<class 'pandas.core.series.Series'> app_code
<class 'pandas.core.series.Series'> os_version
<class 'pandas.core.series.Series'> is_4G
<class 'pandas.core.series.Series'> is_click
<class 'pandas.core.series.Series'> month
<class 'pandas.core.series.Series'> day
<class 'pandas.core.series.Series'> time_of_day
<class 'pandas.core.frame.DataFrame'> item_price_x
<class 'pandas.core.frame.DataFrame'> item_price_y
<class 'pandas.core.frame.DataFrame'> item_price_x
<class 'pandas.core.frame.DataFrame'> item_price_y
<class 'pandas.core.frame.DataFrame'> item_price_x
<class 'pandas.core.frame.DataFrame'> item_price_y


In [99]:
train.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,month,day,time_of_day,item_price_x,item_price_y,item_price_x.1,item_price_y.1,item_price_x.2,item_price_y.2
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,2,0,0,11,15,5,2350.0,2350.0,2350.0,2350.0,2350.0,2350.0
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,1,1,1,11,15,5,4452.833333,14166.0,1024.0,4452.833333,14166.0,1024.0
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,0,1,0,11,15,5,1598.5,2224.0,973.0,1598.5,2224.0,973.0
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,1,1,0,11,15,5,9963.388889,93568.0,249.0,9963.388889,93568.0,249.0
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,1,0,0,11,15,5,8618.934783,117376.0,288.0,8618.934783,117376.0,288.0


In [100]:
train.shape

(237609, 16)

In [101]:
test.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,month,day,time_of_day,item_price_x,item_price_y,item_price_x.1,item_price_y.1,item_price_x.2,item_price_y.2
0,a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,1,1,12,13,0,23424.0,23424.0,23424.0,23424.0,23424.0,23424.0
1,caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,1,0,12,13,0,10752.375,54681.0,207.0,10752.375,54681.0,207.0
2,13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,1,1,12,13,0,4974.555556,20309.0,1238.0,4974.555556,20309.0,1238.0
3,39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,1,1,12,13,0,15009.297297,62976.0,249.0,15009.297297,62976.0,249.0
4,bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,1,1,12,13,0,8778.285714,43392.0,435.0,8778.285714,43392.0,435.0


In [102]:
test.shape

(90675, 15)

In [103]:
train.drop(['impression_id','impression_time'],axis=1,inplace=True)
test_impression_id = test.impression_id.values
test.drop(['impression_id','impression_time'],axis=1,inplace=True)

In [104]:
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237609 entries, 0 to 237608
Data columns (total 14 columns):
user_id         237609 non-null int64
app_code        237609 non-null int64
os_version      237609 non-null int64
is_4G           237609 non-null int64
is_click        237609 non-null int64
month           237609 non-null int64
day             237609 non-null int64
time_of_day     237609 non-null int64
item_price_x    237606 non-null float64
item_price_y    237606 non-null float64
item_price_x    237606 non-null float64
item_price_y    237606 non-null float64
item_price_x    237606 non-null float64
item_price_y    237606 non-null float64
dtypes: float64(6), int64(8)
memory usage: 27.2 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 90675 entries, 0 to 90674
Data columns (total 13 columns):
user_id         90675 non-null int64
app_code        90675 non-null int64
os_version      90675 non-null int64
is_4G           90675 non-null int64
month           90675 non-null i

#### Model

In [105]:
X_train, X_val, y_train, y_val = train_test_split(train.drop(['is_click'],axis=1).values,
                                                train.is_click.values,
                                                test_size = 0.5,
                                                random_state=42,
                                                shuffle=True)

In [106]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((118804, 13), (118805, 13), (118804,), (118805,))

In [107]:
values,counts = np.unique(y_train, return_counts=True)
print('\ty_train:\n',list(zip(values,counts)))
values,counts = np.unique(y_val, return_counts=True)
print('\ty_val:\n',list(zip(values,counts)))

	y_train:
 [(0, 113389), (1, 5415)]
	y_val:
 [(0, 113358), (1, 5447)]


In [132]:
_start = time()
d_train = lgb.Dataset(X_train, label=y_train)
d_val = lgb.Dataset(X_val, label=y_val)
# d_train = lgb.Dataset(train.drop(['is_click'],axis=1).values, label=train.is_click.values)
params = {}
params['learning_rate'] = 0.05
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'auc'
params['sub_feature'] = 0.8
params['num_leaves'] = 50
params['min_data'] = 50
params['max_depth'] = 15
params['bagging_fraction'] = 0.7
params['bagging_freq'] = 2
params['max_bin'] = 64
model = lgb.train(params, train_set=d_train, valid_sets=d_val, num_boost_round=300)
#model = lgb.train(params, train_set=d_train, num_boost_round=300)
print('Training took {} sec'.format(time()-_start))

[1]	valid_0's auc: 0.562911
[2]	valid_0's auc: 0.566059
[3]	valid_0's auc: 0.657385
[4]	valid_0's auc: 0.649115
[5]	valid_0's auc: 0.676165
[6]	valid_0's auc: 0.672175
[7]	valid_0's auc: 0.68233
[8]	valid_0's auc: 0.687044
[9]	valid_0's auc: 0.690604
[10]	valid_0's auc: 0.689332
[11]	valid_0's auc: 0.693385
[12]	valid_0's auc: 0.695052
[13]	valid_0's auc: 0.696996
[14]	valid_0's auc: 0.698271
[15]	valid_0's auc: 0.69811
[16]	valid_0's auc: 0.69851
[17]	valid_0's auc: 0.698794
[18]	valid_0's auc: 0.698688
[19]	valid_0's auc: 0.698531
[20]	valid_0's auc: 0.699347
[21]	valid_0's auc: 0.69942
[22]	valid_0's auc: 0.700197
[23]	valid_0's auc: 0.700661
[24]	valid_0's auc: 0.700889
[25]	valid_0's auc: 0.70104
[26]	valid_0's auc: 0.701433
[27]	valid_0's auc: 0.701509
[28]	valid_0's auc: 0.701032
[29]	valid_0's auc: 0.701376
[30]	valid_0's auc: 0.701696
[31]	valid_0's auc: 0.70186
[32]	valid_0's auc: 0.702014
[33]	valid_0's auc: 0.70206
[34]	valid_0's auc: 0.702022
[35]	valid_0's auc: 0.701893
[

[286]	valid_0's auc: 0.704835
[287]	valid_0's auc: 0.704801
[288]	valid_0's auc: 0.704864
[289]	valid_0's auc: 0.704907
[290]	valid_0's auc: 0.704961
[291]	valid_0's auc: 0.704896
[292]	valid_0's auc: 0.704806
[293]	valid_0's auc: 0.704797
[294]	valid_0's auc: 0.704779
[295]	valid_0's auc: 0.704777
[296]	valid_0's auc: 0.704893
[297]	valid_0's auc: 0.704855
[298]	valid_0's auc: 0.70484
[299]	valid_0's auc: 0.704775
[300]	valid_0's auc: 0.704732
Training took 6.435048341751099 sec


In [133]:
print(roc_auc_score(y_train,model.predict(X_train)))

0.890364068937396


In [134]:
print(roc_auc_score(y_val,model.predict(X_val)))

0.7047318141177706


In [121]:
submission = pd.DataFrame()

In [122]:
submission['impression_id'] = test_impression_id

In [123]:
submission['is_click'] = model.predict(test.values)

In [124]:
submission.to_csv('./overfit(?).csv',index=False)