In [1]:
import gc
import json
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.cluster import KMeans
from chinese_calendar import is_holiday
from tqdm import tqdm_notebook
from utils import FlattenDataSimple, loadJSON, loadpkl

%matplotlib inline



In [2]:
# Read every phase-1 CSV into its own DataFrame.
# The names on the left are bound in the same order as the paths listed below.
profiles, test_plans, test_queries, train_clicks, train_plans, train_queries = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/data_set_phase1/profiles.csv',
        '../input/data_set_phase1/test_plans.csv',
        '../input/data_set_phase1/test_queries.csv',
        '../input/data_set_phase1/train_clicks.csv',
        '../input/data_set_phase1/train_plans.csv',
        '../input/data_set_phase1/train_queries.csv',
    )
)

In [3]:
# --- Build the flattened plans table (train + test) ---
# NOTE: train_plans / test_plans are deleted part-way through this cell, so
# re-running it requires re-running the CSV loading cell first.

# merge click: attach the clicked mode to every training session
train_plans = pd.merge(train_plans, train_clicks[['sid','click_mode']], on='sid', how='left')

# fill na (no click)
# Plain assignment instead of a chained `inplace=True` call: the chained form
# is deprecated and does not update the frame under pandas copy-on-write.
train_plans['click_mode'] = train_plans['click_mode'].fillna(0)

# set test target as nan
test_plans['click_mode'] = np.nan

# merge train & test with a fresh 0..n-1 index
# (DataFrame.append was removed in pandas 2.0; pd.concat is its replacement,
# and ignore_index=True replaces the separate reset_index(drop=True) step)
plans = pd.concat([train_plans, test_plans], ignore_index=True)

del train_plans, test_plans
gc.collect()

# convert json: pull each attribute out of the raw `plans` column
# (loadJSON is a project helper from utils; its exact return shape is not visible here)
plan_keys = ['distance', 'price', 'eta', 'transport_mode']
for key in tqdm_notebook(plan_keys):
    plans[key] = plans['plans'].apply(lambda x: loadJSON(x,key))

# flatten: one frame per attribute, concatenated side by side
# (FlattenDataSimple is a project helper; presumably it returns plan_<i>_<key>
# columns indexed by sid — confirm in utils)
flattened = [FlattenDataSimple(plans, key) for key in tqdm_notebook(plan_keys)]
plans_df = pd.concat(flattened,axis=1)

# merge plan_time & click_mode
plans_df = pd.merge(plans_df.reset_index(), plans[['sid','plan_time', 'click_mode']], on='sid',how='outer')

# cleaning: empty-string prices become 0
for c in plans_df.columns.to_list():
    if 'price' in c:
        plans_df[c] = plans_df[c].replace('',0)

plans_df['plan_time'] = pd.to_datetime(plans_df['plan_time'])

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [4]:
# Column names of the flattened plan attributes, one name per plan slot 0..6.
cols_distance = list(map('plan_{}_distance'.format, range(7)))
cols_price = list(map('plan_{}_price'.format, range(7)))
cols_eta = list(map('plan_{}_eta'.format, range(7)))
cols_transport_mode = list(map('plan_{}_transport_mode'.format, range(7)))

In [5]:
# preview the first five rows of the flattened plans table
plans_df.head(n=5)

Unnamed: 0,sid,plan_0_distance,plan_1_distance,plan_2_distance,plan_3_distance,plan_4_distance,plan_5_distance,plan_6_distance,plan_0_price,plan_1_price,...,plan_6_eta,plan_0_transport_mode,plan_1_transport_mode,plan_2_transport_mode,plan_3_transport_mode,plan_4_transport_mode,plan_5_transport_mode,plan_6_transport_mode,plan_time,click_mode
0,1709112,32303,33678.0,33678.0,32099.0,30446.0,,,600,0.0,...,,9,3.0,4.0,2.0,1.0,,,2018-11-04 11:45:04,9.0
1,3327773,3427,3251.0,3251.0,3227.0,3227.0,,,300,0.0,...,,2,3.0,4.0,6.0,5.0,,,2018-10-16 19:09:29,2.0
2,3285959,36234,29545.0,29545.0,33295.0,,,,1100,0.0,...,,7,3.0,4.0,1.0,,,,2018-11-25 15:05:22,7.0
3,1616170,11450,9100.0,10870.0,10870.0,11722.0,,,400,0.0,...,,2,6.0,3.0,4.0,7.0,,,2018-10-03 14:42:40,2.0
4,351369,18495,19111.0,19111.0,17117.0,,,,600,0.0,...,,7,3.0,4.0,1.0,,,,2018-10-05 19:52:37,4.0


In [6]:
# preview the first five rows of the profiles table
profiles.head(n=5)

Unnamed: 0,pid,p0,p1,p2,p3,p4,p5,p6,p7,p8,...,p56,p57,p58,p59,p60,p61,p62,p63,p64,p65
0,196356,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,204083,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,170667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,115511,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,129719,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# --- Build the query-level feature table (train + test) ---
# NOTE: train_queries / test_queries are deleted below, so re-running this
# cell requires reloading the CSVs first.

# merge click
train_queries = pd.merge(train_queries, train_clicks[['sid','click_mode']], on='sid', how='left')

# fill na (no click)
# Plain assignment instead of a chained `inplace=True` call: the chained form
# is deprecated and does not update the frame under pandas copy-on-write.
train_queries['click_mode'] = train_queries['click_mode'].fillna(0)

# set test target as nan
test_queries['click_mode'] = np.nan

# merge train & test
# (DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# The index is deliberately NOT reset, matching what `append` produced.)
queries_df = pd.concat([train_queries, test_queries])

del train_queries, test_queries
gc.collect()

# to datetime
queries_df['req_time'] = pd.to_datetime(queries_df['req_time'])

# features distance: 'o' and 'd' are comma-separated strings whose first two
# fields are parsed as floats
o_parts = queries_df['o'].str.split(',')
d_parts = queries_df['d'].str.split(',')
queries_df['x_o'] = o_parts.str[0].astype(float)
queries_df['y_o'] = o_parts.str[1].astype(float)
queries_df['x_d'] = d_parts.str[0].astype(float)
queries_df['y_d'] = d_parts.str[1].astype(float)

# frequency encoding: how often each value occurs in the combined train+test queries
for col in ['o', 'd', 'x_o', 'y_o', 'x_d', 'y_d']:
    queries_df['queries_{}_count'.format(col)] = queries_df[col].map(queries_df[col].value_counts())

# straight-line distance between origin and destination in coordinate units
queries_df['queries_distance'] = np.sqrt((queries_df['x_o']-queries_df['x_d'])**2 + (queries_df['y_o']-queries_df['y_d'])**2)

# origin-destination pair as a single key, plus its frequency
queries_df['o_d'] = queries_df['o'].astype(str)+'_'+queries_df['d'].astype(str)
queries_df['queries_o_d_count'] = queries_df['o_d'].map(queries_df['o_d'].value_counts())

# datetime features
queries_df['queries_weekday'] = queries_df['req_time'].dt.weekday
queries_df['queries_hour'] = queries_df['req_time'].dt.hour
queries_df['queries_is_holiday'] = queries_df['req_time'].apply(is_holiday).astype(int)

queries_df['queries_weekday_count'] = queries_df['queries_weekday'].map(queries_df['queries_weekday'].value_counts())
queries_df['queries_hour_count'] = queries_df['queries_hour'].map(queries_df['queries_hour'].value_counts())

In [9]:
# Cross features: (time bucket) x (location key), and how often each
# combination occurs in queries_df.
time_parts = ['is_holiday', 'weekday', 'hour']
location_keys = ['o_d', 'o', 'd']

# combined string keys, e.g. o_d_weekday = '<queries_weekday>_<o_d>'
for loc in location_keys:
    for part in time_parts:
        queries_df['{}_{}'.format(loc, part)] = (
            queries_df['queries_{}'.format(part)].astype(str) + '_' + queries_df[loc]
        )

# frequency of each combined key
# FIX: the queries_o_*_count columns used to be computed from the o_d_* keys
# (a copy-paste slip), and the d_* keys never got a count at all because the
# o_d_* counts were simply computed a second time. Every count column now maps
# its own key: queries_<key>_count <- value_counts of <key>.
for loc in location_keys:
    for part in time_parts:
        key = '{}_{}'.format(loc, part)
        queries_df['queries_{}_count'.format(key)] = queries_df[key].map(queries_df[key].value_counts())

In [5]:
# show only the first ten rows instead of dumping the whole frame
plans_df.head(10)

Unnamed: 0,sid,plan_0_distance,plan_1_distance,plan_2_distance,plan_3_distance,plan_4_distance,plan_5_distance,plan_6_distance,plan_0_price,plan_1_price,...,plan_6_eta,plan_0_transport_mode,plan_1_transport_mode,plan_2_transport_mode,plan_3_transport_mode,plan_4_transport_mode,plan_5_transport_mode,plan_6_transport_mode,plan_time,click_mode
0,1709112,32303,33678.0,33678.0,32099.0,30446.0,,,600,0.0,...,,9,3.0,4.0,2.0,1.0,,,2018-11-04 11:45:04,9.0
1,3327773,3427,3251.0,3251.0,3227.0,3227.0,,,300,0.0,...,,2,3.0,4.0,6.0,5.0,,,2018-10-16 19:09:29,2.0
2,3285959,36234,29545.0,29545.0,33295.0,,,,1100,0.0,...,,7,3.0,4.0,1.0,,,,2018-11-25 15:05:22,7.0
3,1616170,11450,9100.0,10870.0,10870.0,11722.0,,,400,0.0,...,,2,6.0,3.0,4.0,7.0,,,2018-10-03 14:42:40,2.0
4,351369,18495,19111.0,19111.0,17117.0,,,,600,0.0,...,,7,3.0,4.0,1.0,,,,2018-10-05 19:52:37,4.0
5,749203,32486,,,,,,,0,,...,,3,,,,,,,2018-10-01 07:03:52,3.0
6,3005789,4935,4434.0,4434.0,4228.0,3711.0,,,300,0.0,...,,2,3.0,4.0,6.0,5.0,,,2018-10-22 11:45:20,4.0
7,1897386,22103,18601.0,18601.0,24067.0,19733.0,,,500,0.0,...,,2,3.0,4.0,9.0,1.0,,,2018-11-13 18:51:27,2.0
8,845203,12422,12548.0,12548.0,12542.0,8641.0,13657.0,,400,0.0,...,,2,3.0,4.0,9.0,6.0,1.0,,2018-11-26 12:43:56,2.0
9,536341,2162,2647.0,2176.0,2647.0,2176.0,,,200,0.0,...,,1,3.0,6.0,4.0,5.0,,,2018-11-23 19:34:38,1.0


In [2]:
# Load the saved predictions via the project helper loadpkl (from utils).
# Later cells rely on this frame having `sid`, `click_mode` and
# `pred_lgbm_plans<k>` columns.
# NOTE(review): presumably a pickle of LightGBM class scores — confirm; only load pickles from trusted sources.
pred_lgbm = loadpkl('../features/lgbm_pred.pkl')

In [96]:
# attach each session's plan transport modes to its predictions (left join on sid)
df = pred_lgbm.merge(
    plans_df[cols_transport_mode+['sid']],
    how='left',
    on='sid',
)

In [10]:
# names of the twelve prediction columns, pred_lgbm_plans0 .. pred_lgbm_plans11
cols_pred = list(map('pred_lgbm_plans{}'.format, range(12)))

In [99]:
# recommended mode = position of the highest value among the cols_pred columns in each row
df['recommend_mode'] = df[cols_pred].values.argmax(axis=1)

In [44]:
# For each plan slot, print the number of rows whose transport mode in that
# slot equals the clicked mode.
# NOTE(review): train_df is not defined in any cell visible here — confirm where it comes from.
for slot_col in cols_transport_mode:
    n_match = train_df[slot_col].eq(train_df['click_mode']).sum()
    print(n_match)

331782
27716
32869
56347
43406
11051
264


In [66]:
# number of rows per clicked mode (rows with a missing click_mode are not counted)
df['click_mode'].value_counts()

2.0     136491
7.0      78209
1.0      70369
9.0      48864
5.0      47480
0.0      37718
3.0      24626
10.0     14882
4.0      12606
6.0      11863
11.0      6089
8.0       1857
Name: click_mode, dtype: int64

In [60]:
# The nested for-loop that stood here had no body and was a SyntaxError.
# The cell's recorded output is this list, so display it directly:
# the prediction columns for modes 1..11 (index 0 is skipped).
cols_pred[1:]

['pred_lgbm_plans1',
 'pred_lgbm_plans2',
 'pred_lgbm_plans3',
 'pred_lgbm_plans4',
 'pred_lgbm_plans5',
 'pred_lgbm_plans6',
 'pred_lgbm_plans7',
 'pred_lgbm_plans8',
 'pred_lgbm_plans9',
 'pred_lgbm_plans10',
 'pred_lgbm_plans11']

In [69]:
# per row: add 1 for every mode 1..11 that equals the first plan's transport mode
tmp = np.zeros(len(df))
for mode in range(1, 12):
    tmp += df['plan_0_transport_mode'].eq(mode).astype(int)

In [97]:
# Post-processing: for every mode i in 1..11, zero the predicted score on rows
# where i does not appear in any plan_*_transport_mode column.
# Column 0 is never masked. The mask is 0/1, so re-running this cell is harmless.
for i in range(1,12):
    # True where at least one plan slot offers mode i (NaN slots never match)
    offered = df[cols_transport_mode].eq(i).any(axis=1)
    pred_col = 'pred_lgbm_plans{}'.format(i)
    df[pred_col] = df[pred_col] * offered

# Recompute the recommendation from the masked scores. In file order the
# earlier argmax cell runs before this mask, so without this line a
# top-to-bottom run would write unmasked recommendations to the submission.
df['recommend_mode'] = np.argmax(df[cols_pred].values, axis=1)

In [100]:
# show only the first ten rows instead of dumping the whole frame
df.head(10)

Unnamed: 0,pred_lgbm_plans0,pred_lgbm_plans1,pred_lgbm_plans2,pred_lgbm_plans3,pred_lgbm_plans4,pred_lgbm_plans5,pred_lgbm_plans6,pred_lgbm_plans7,pred_lgbm_plans8,pred_lgbm_plans9,...,sid,click_mode,plan_0_transport_mode,plan_1_transport_mode,plan_2_transport_mode,plan_3_transport_mode,plan_4_transport_mode,plan_5_transport_mode,plan_6_transport_mode,recommend_mode
0,0.078033,0.033743,0.150198,0.038961,0.012308,0.000000,0.000000,0.000000,0.000000,0.686472,...,1709112,9.0,9,3.0,4.0,2.0,1.0,,,9
1,0.023421,0.000000,0.944652,0.007875,0.005612,0.003620,0.008827,0.000000,0.000000,0.000000,...,3327773,2.0,2,3.0,4.0,6.0,5.0,,,2
2,0.068866,0.075084,0.000000,0.040265,0.020967,0.000000,0.000000,0.794640,0.000000,0.000000,...,3285959,7.0,7,3.0,4.0,1.0,,,,7
3,0.063395,0.000000,0.858796,0.010179,0.010284,0.000000,0.007450,0.049479,0.000000,0.000000,...,1616170,2.0,2,6.0,3.0,4.0,7.0,,,2
4,0.041466,0.091718,0.000000,0.025590,0.019135,0.000000,0.000000,0.821941,0.000000,0.000000,...,351369,4.0,7,3.0,4.0,1.0,,,,7
5,0.167772,0.000000,0.000000,0.828111,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,749203,3.0,3,,,,,,,3
6,0.036769,0.000000,0.912787,0.014090,0.012581,0.009169,0.008839,0.000000,0.000000,0.000000,...,3005789,4.0,2,3.0,4.0,6.0,5.0,,,2
7,0.066431,0.057497,0.767268,0.047226,0.020997,0.000000,0.000000,0.000000,0.000000,0.040297,...,1897386,2.0,2,3.0,4.0,9.0,1.0,,,2
8,0.036442,0.034136,0.848875,0.020425,0.016496,0.000000,0.004146,0.000000,0.000000,0.039224,...,845203,2.0,2,3.0,4.0,9.0,6.0,1.0,,2
9,0.070231,0.734768,0.000000,0.027899,0.015137,0.071332,0.079258,0.000000,0.000000,0.000000,...,536341,1.0,1,3.0,6.0,4.0,5.0,,,1


In [6]:
# keep only the rows whose click_mode is missing
# (presumably the test rows — confirm against how pred_lgbm was built)
test_df = df.loc[df['click_mode'].isna()]

NameError: name 'df' is not defined

In [105]:
# write the sid / recommend_mode pairs to CSV without the index column
test_df.loc[:, ['sid','recommend_mode']].to_csv(
    '../output/submission_postprecessed.csv',
    index=False,
)

In [123]:
# sanity check: argmax over an all-zero row returns index 0
np.zeros((len(df), 12)).argmax(axis=1)

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
# share of each clicked mode among rows with a known click_mode
pred_lgbm['click_mode'].value_counts(normalize=True)

2.0     0.277955
7.0     0.159268
1.0     0.143302
9.0     0.099508
5.0     0.096690
0.0     0.076810
3.0     0.050149
10.0    0.030306
4.0     0.025671
6.0     0.024158
11.0    0.012400
8.0     0.003782
Name: click_mode, dtype: float64