In [1]:
import gc
import json
import pandas as pd
import numpy as np
import seaborn as sns

from tqdm import tqdm_notebook
from utils import FlattenDataSimple, loadJSON, loadpkl

%matplotlib inline



In [2]:
# load csv
profiles = pd.read_csv('../input/data_set_phase1/profiles.csv')
test_plans = pd.read_csv('../input/data_set_phase1/test_plans.csv')
test_queries = pd.read_csv('../input/data_set_phase1/test_queries.csv')
train_clicks = pd.read_csv('../input/data_set_phase1/train_clicks.csv')
train_plans = pd.read_csv('../input/data_set_phase1/train_plans.csv')
train_queries = pd.read_csv('../input/data_set_phase1/train_queries.csv')

In [3]:
# merge click
train_plans = pd.merge(train_plans, train_clicks[['sid','click_mode']], on='sid', how='left')

# fill na (no click)
train_plans['click_mode'].fillna(0, inplace=True)

# set test target as nan
test_plans['click_mode'] = np.nan

# merge train & test
plans = train_plans.append(test_plans)

del train_plans, test_plans
gc.collect()

# reset index
plans.reset_index(inplace=True,drop=True)

# convert json
for key in tqdm_notebook(['distance', 'price', 'eta', 'transport_mode']):
    plans[key] = plans.plans.apply(lambda x: loadJSON(x,key))

# flatten
plans_df = [FlattenDataSimple(plans, key) for key in tqdm_notebook(['distance', 'price', 'eta', 'transport_mode'])]
plans_df = pd.concat(plans_df,axis=1)

# merge plan_time & click_mode
plans_df = pd.merge(plans_df.reset_index(), plans[['sid','plan_time', 'click_mode']], on='sid',how='outer')

# cleaning
for c in plans_df.columns.to_list():
    if 'price' in c:
        plans_df[c] = plans_df[c].replace('',0)

plans_df['plan_time'] = pd.to_datetime(plans_df['plan_time'])

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [5]:
cols_distance = ['plan_{}_distance'.format(i) for i in range(0,7)]
cols_price = ['plan_{}_price'.format(i) for i in range(0,7)]
cols_eta = ['plan_{}_eta'.format(i) for i in range(0,7)]
cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0,7)]

In [4]:
plans_df.head()

Unnamed: 0,sid,plan_0_distance,plan_1_distance,plan_2_distance,plan_3_distance,plan_4_distance,plan_5_distance,plan_6_distance,plan_0_price,plan_1_price,...,plan_6_eta,plan_0_transport_mode,plan_1_transport_mode,plan_2_transport_mode,plan_3_transport_mode,plan_4_transport_mode,plan_5_transport_mode,plan_6_transport_mode,plan_time,click_mode
0,1709112,32303,33678.0,33678.0,32099.0,30446.0,,,600,0.0,...,,9,3.0,4.0,2.0,1.0,,,2018-11-04 11:45:04,9.0
1,3327773,3427,3251.0,3251.0,3227.0,3227.0,,,300,0.0,...,,2,3.0,4.0,6.0,5.0,,,2018-10-16 19:09:29,2.0
2,3285959,36234,29545.0,29545.0,33295.0,,,,1100,0.0,...,,7,3.0,4.0,1.0,,,,2018-11-25 15:05:22,7.0
3,1616170,11450,9100.0,10870.0,10870.0,11722.0,,,400,0.0,...,,2,6.0,3.0,4.0,7.0,,,2018-10-03 14:42:40,2.0
4,351369,18495,19111.0,19111.0,17117.0,,,,600,0.0,...,,7,3.0,4.0,1.0,,,,2018-10-05 19:52:37,4.0


In [6]:
profiles.head()

Unnamed: 0,pid,p0,p1,p2,p3,p4,p5,p6,p7,p8,...,p56,p57,p58,p59,p60,p61,p62,p63,p64,p65
0,196356,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,204083,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,170667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,115511,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,129719,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# merge train & test
queries_df = train_queries.append(test_queries)

del train_queries, test_queries
gc.collect()

# to datetime
queries_df['req_time'] = pd.to_datetime(queries_df['req_time'])

# features distance
queries_df['x_o']=queries_df['o'].apply(lambda x: x.split(',')[0]).astype(float)
queries_df['y_o']=queries_df['o'].apply(lambda x: x.split(',')[1]).astype(float)
queries_df['x_d']=queries_df['d'].apply(lambda x: x.split(',')[0]).astype(float)
queries_df['y_d']=queries_df['d'].apply(lambda x: x.split(',')[1]).astype(float)

queries_df['queries_distance'] = np.sqrt((queries_df['x_o']-queries_df['x_d'])**2 + (queries_df['y_o']-queries_df['y_d'])**2)

queries_df['queries_x_o_count']=queries_df['x_o'].map(queries_df['x_o'].value_counts())
queries_df['queries_y_o_count']=queries_df['y_o'].map(queries_df['y_o'].value_counts())
queries_df['queries_x_d_count']=queries_df['x_d'].map(queries_df['x_d'].value_counts())
queries_df['queries_y_d_count']=queries_df['y_d'].map(queries_df['y_d'].value_counts())

In [8]:
queries_df.head()

Unnamed: 0,sid,pid,req_time,o,d,x_o,y_o,x_d,y_d,queries_distance,queries_x_o_count,queries_y_o_count,queries_x_d_count,queries_y_d_count
0,3000821,,2018-11-02 17:54:30,"116.29,39.97","116.32,39.96",116.29,39.97,116.32,39.96,0.031623,15792,20412,37054,16937
1,3085857,210736.0,2018-11-16 10:53:10,"116.39,39.84","116.33,39.79",116.39,39.84,116.33,39.79,0.078102,33319,12712,20339,4843
2,2944522,,2018-10-06 10:33:58,"116.31,39.93","116.27,40.00",116.31,39.93,116.27,40.0,0.080623,27853,33989,14120,23584
3,559931,202427.0,2018-11-23 14:54:11,"116.27,39.88","116.39,39.90",116.27,39.88,116.39,39.9,0.121655,11881,15663,44759,46675
4,2819352,172251.0,2018-10-30 11:48:41,"116.34,39.96","116.37,39.86",116.34,39.96,116.37,39.86,0.104403,18655,17994,35540,25996


In [19]:

plans_price_df = plans_df[cols_price]
#plans_df[cols_price].idxmax(axis=1).index #.map(plans_df[cols_transport_mode])

In [21]:
plans_price_df.columns = [i for i in range(0,7)]

In [36]:
max_list =plans_price_df.idxmax(axis=1).to_list()

In [53]:
plans_df['plan_distance_max_plan'] = plans_df[cols_distance].idxmax(axis=1).apply(lambda x: x[:6]+'_transport_mode')
plans_df['plan_distance_min_plan'] = plans_df[cols_distance].idxmin(axis=1).apply(lambda x: x[:6]+'_transport_mode')
plans_df['plan_price_max_plan'] = plans_df[cols_price].idxmax(axis=1).apply(lambda x: x[:6]+'_transport_mode')
plans_df['plan_price_min_plan'] = plans_df[cols_price].idxmin(axis=1).apply(lambda x: x[:6]+'_transport_mode')
plans_df['plan_eta_max_plan'] = plans_df[cols_eta].idxmax(axis=1).apply(lambda x: x[:6]+'_transport_mode')
plans_df['plan_eta_min_plan'] = plans_df[cols_eta].idxmin(axis=1).apply(lambda x: x[:6]+'_transport_mode')

In [59]:
plans_df['plan_distance_max_plan'][plans_df['plan_distance_max_plan']=='plan_0_transport_mode'] = plans_df['plan_0_transport_mode'][plans_df['plan_distance_max_plan']=='plan_0_transport_mode']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
for i in range(1,7):
    plans_df['plan_{}_distance_ratio_0'.format(i)] = plans_df['plan_{}_distance'.format(i)]/plans_df['plan_0_distance']
    plans_df['plan_{}_price_ratio_0'.format(i)] = plans_df['plan_{}_price'.format(i)]/plans_df['plan_0_price']
    plans_df['plan_{}_eta_ratio_0'.format(i)] = plans_df['plan_{}_eta'.format(i)]/plans_df['plan_0_eta']

In [65]:
cols_distance_ratio_0 = ['plan_{}_distance_ratio_0'.format(i) for i in range(1,7)]
cols_price_ratio_0 = ['plan_{}_price_ratio_0'.format(i) for i in range(1,7)]
cols_eta_ratio_0 = ['plan_{}_eta_ratio_0'.format(i) for i in range(1,7)]

In [72]:
plans_df[cols_distance_ratio_0].idxmax(axis=1).apply(lambda x: x[:6]+'_transport_mode' if type(x)==str else np.nan)

0         plan_1_transport_mode
1         plan_1_transport_mode
2         plan_3_transport_mode
3         plan_4_transport_mode
4         plan_1_transport_mode
5                           NaN
6         plan_1_transport_mode
7         plan_3_transport_mode
8         plan_5_transport_mode
9         plan_1_transport_mode
10        plan_4_transport_mode
11        plan_2_transport_mode
12        plan_1_transport_mode
13        plan_4_transport_mode
14        plan_1_transport_mode
15        plan_1_transport_mode
16        plan_1_transport_mode
17        plan_3_transport_mode
18        plan_5_transport_mode
19        plan_3_transport_mode
20        plan_1_transport_mode
21        plan_4_transport_mode
22        plan_1_transport_mode
23        plan_3_transport_mode
24        plan_1_transport_mode
25        plan_2_transport_mode
26        plan_3_transport_mode
27        plan_5_transport_mode
28        plan_4_transport_mode
29        plan_4_transport_mode
                  ...          
583595  

In [71]:
type(np.nan)==float

True