In [1]:
import gc
import json
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.cluster import KMeans
from chinese_calendar import is_holiday
from tqdm import tqdm_notebook
from utils import FlattenDataSimple, loadJSON, loadpkl

%matplotlib inline



In [2]:
# Read the raw phase-2 tables from ../input/data_set_phase2/.
# User profiles.
profiles = pd.read_csv('../input/data_set_phase2/profiles.csv')

# Training side: queries, candidate plans and the recorded clicks.
train_queries = pd.read_csv('../input/data_set_phase2/train_queries_phase2.csv')
train_plans = pd.read_csv('../input/data_set_phase2/train_plans_phase2.csv')
train_clicks = pd.read_csv('../input/data_set_phase2/train_clicks_phase2.csv')

# "Test" side.
# NOTE(review): both test_* frames are read from the *train* files, so every
# sid ends up in the combined table once with a label and once without.
# Presumably a deliberate dry run of the submission pipeline -- confirm, or
# point these two reads at the real test files if they exist.
test_queries = pd.read_csv('../input/data_set_phase2/train_queries_phase2.csv')
test_plans = pd.read_csv('../input/data_set_phase2/train_plans_phase2.csv')

In [3]:
# Build one flat plans table covering both the labelled (train) and the
# unlabelled (test) sessions.

# Attach the clicked mode to each train session; the left join keeps
# sessions that have no row in train_clicks.
train_plans = pd.merge(train_plans, train_clicks[['sid', 'click_mode']], on='sid', how='left')

# Sessions without a click get mode 0. The result is assigned back instead of
# calling fillna(..., inplace=True) on the column selection: that form is
# chained assignment and does not update the frame under copy-on-write.
train_plans['click_mode'] = train_plans['click_mode'].fillna(0)

# Test rows carry no label; NaN in click_mode is what later cells use to
# tell them apart from train rows.
test_plans['click_mode'] = np.nan

# Stack train and test. pd.concat replaces DataFrame.append (removed in
# pandas 2.0); ignore_index=True yields a fresh 0..n-1 index, which is what
# the former reset_index(drop=True) produced.
plans = pd.concat([train_plans, test_plans], ignore_index=True)

# The per-split frames are no longer needed; release them before the
# heavier JSON expansion below.
del train_plans, test_plans
gc.collect()

# Fields pulled out of the raw `plans` column.
plan_fields = ['distance', 'price', 'eta', 'transport_mode']

# One new column per field. loadJSON comes from utils; presumably it returns
# the values of `key` across the plans encoded in each string -- confirm.
for key in tqdm_notebook(plan_fields):
    plans[key] = plans.plans.apply(lambda x: loadJSON(x, key))

# Expand each field into per-plan columns and put them side by side.
# FlattenDataSimple comes from utils; the displayed result suggests it
# returns plan_<i>_<field> columns indexed by sid -- confirm.
plans_df = [FlattenDataSimple(plans, key) for key in tqdm_notebook(plan_fields)]
plans_df = pd.concat(plans_df, axis=1)

# Bring plan_time and click_mode back next to the flattened columns.
# NOTE(review): this joins on sid only, so rows multiply whenever a sid
# occurs more than once on either side; the stored plans_df output shows each
# sid four times. Confirm sid uniqueness (see the test_* inputs).
plans_df = pd.merge(plans_df.reset_index(), plans[['sid', 'plan_time', 'click_mode']], on='sid', how='outer')

# Empty-string prices are replaced by 0 in every price column.
for c in plans_df.columns.to_list():
    if 'price' in c:
        plans_df[c] = plans_df[c].replace('', 0)

# Parse plan_time into datetimes.
plans_df['plan_time'] = pd.to_datetime(plans_df['plan_time'])

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [6]:
# Dry-run submission table: for every row without a label (click_mode is
# NaN), recommend the transport mode stored in the plan_0 column.
unlabelled = plans_df.click_mode.isnull()

test_sub = pd.DataFrame()
test_sub['sid'] = plans_df.loc[unlabelled, 'sid']
test_sub['recommend_mode'] = plans_df.loc[unlabelled, 'plan_0_transport_mode']

In [8]:
# Write the sid / recommend_mode table to disk; index=False keeps the
# DataFrame index out of the CSV.
test_sub.to_csv('../output/submission_test.csv', index=False)

In [12]:
# Show a bounded preview instead of rendering the whole submission frame.
test_sub.head(10)

Unnamed: 0,sid,recommend_mode
1,955169,7
3,955169,7
5,1262759,1
7,1262759,1
9,1579883,5
11,1579883,5
13,739077,7
15,739077,7
17,1048542,5
19,1048542,5


In [14]:
# Show a bounded preview instead of rendering the whole flattened frame.
plans_df.head(10)

Unnamed: 0,sid,plan_0_distance,plan_1_distance,plan_2_distance,plan_3_distance,plan_4_distance,plan_5_distance,plan_6_distance,plan_7_distance,plan_0_price,...,plan_0_transport_mode,plan_1_transport_mode,plan_2_transport_mode,plan_3_transport_mode,plan_4_transport_mode,plan_5_transport_mode,plan_6_transport_mode,plan_7_transport_mode,plan_time,click_mode
0,955169,10148,9663.0,9663.0,9809.0,9774.0,9421.0,,,600,...,7,3.0,4.0,9.0,6.0,1.0,,,2018-10-20 20:23:55,7.0
1,955169,10148,9663.0,9663.0,9809.0,9774.0,9421.0,,,600,...,7,3.0,4.0,9.0,6.0,1.0,,,2018-10-20 20:23:55,
2,955169,10148,9663.0,9663.0,9809.0,9774.0,9421.0,,,600,...,7,3.0,4.0,9.0,6.0,1.0,,,2018-10-20 20:23:55,7.0
3,955169,10148,9663.0,9663.0,9809.0,9774.0,9421.0,,,600,...,7,3.0,4.0,9.0,6.0,1.0,,,2018-10-20 20:23:55,
4,1262759,11372,11738.0,11738.0,,,,,,200,...,1,3.0,4.0,,,,,,2018-11-09 13:16:11,1.0
5,1262759,11372,11738.0,11738.0,,,,,,200,...,1,3.0,4.0,,,,,,2018-11-09 13:16:11,
6,1262759,11372,11738.0,11738.0,,,,,,200,...,1,3.0,4.0,,,,,,2018-11-09 13:16:11,1.0
7,1262759,11372,11738.0,11738.0,,,,,,200,...,1,3.0,4.0,,,,,,2018-11-09 13:16:11,
8,1579883,823,908.0,914.0,952.0,1601.0,,,,0,...,5,6.0,3.0,1.0,1.0,,,,2018-10-27 22:19:21,0.0
9,1579883,823,908.0,914.0,952.0,1601.0,,,,0,...,5,6.0,3.0,1.0,1.0,,,,2018-10-27 22:19:21,
