In [1]:
import os
from dateutil.parser import parse as dateutil_parse
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.cross_validation import train_test_split,cross_val_score

PATH = 'season_1/'
CLEAN_PATH = PATH+'clean/'

%matplotlib inline

In [2]:
class myStandardScaler(skStandardScaler):
    '''
    Only use fit_transform and tranform, specific for dealing with pd.DataFrame.
    Only scale the numerical features.
    '''

    def fit_transform(self, X):
        Xnumerical = X[X.columns[X.dtypes != bool]]
        Xdummy = X[X.columns[X.dtypes == bool]]
        scaledXnumerical = super(myStandardScaler, self).fit_transform(Xnumerical)
        Xnumerical = pd.DataFrame(scaledXnumerical, index=Xnumerical.index, columns=Xnumerical.columns)
        return pd.concat([Xnumerical, Xdummy], axis=1)

    def transform(self, X):
        Xnumerical = X[X.columns[X.dtypes != bool]]
        Xdummy = X[X.columns[X.dtypes == bool]]
        scaledXnumerical = super(myStandardScaler, self).transform(Xnumerical)
        Xnumerical = pd.DataFrame(scaledXnumerical, index=Xnumerical.index, columns=Xnumerical.columns)
        return pd.concat([Xnumerical, Xdummy], axis=1)

In [3]:
cluster_map = pd.read_csv(CLEAN_PATH+'cluster_map.csv',index_col=0)
poi = pd.read_csv(CLEAN_PATH+'poi.csv',index_col=0)
train_order = pd.read_pickle(CLEAN_PATH+'train_order.pickle')
test_order = pd.read_pickle(CLEAN_PATH+'test_order.pickle')
train_traffic = pd.read_pickle(CLEAN_PATH+'train_traffic.pickle')
test_traffic = pd.read_pickle(CLEAN_PATH+'test_traffic.pickle')
train_weather = pd.read_pickle(CLEAN_PATH+'train_weather.pickle')
test_weather = pd.read_pickle(CLEAN_PATH+'test_weather.pickle')
test_target = pd.read_csv(CLEAN_PATH+'test_target.csv',index_col=0,parse_dates=True)

In [4]:
train_order_index = pd.Series(range(len(train_order)),index=train_order['Time'])
test_order_index = pd.Series(range(len(test_order)),index=test_order['Time'])
train_traffic_index = pd.Series(range(len(train_traffic)),index=train_traffic['datetime'])
test_traffic_index = pd.Series(range(len(test_traffic)),index=test_traffic['datetime'])

In [5]:
def process_order(order):
    def pclass(p):
        class_set = range(5,20,5)+range(20,100,10)+range(100,501,100)
        idx_set = [p>cls for cls in class_set]
        return idx_set.index(False) if sum(idx_set)!=len(idx_set) else len(idx_set)
    order['timeslot'] = order['Time'].map(lambda x: (x.hour*60+x.minute)/10+1)
    order['datetimeslot'] = order['Time'].map(lambda x: x.year*10000+x.month*100+x.day)*1000+order['timeslot']
    order['pclass'] = order['Price'].map(lambda x: pclass(x))
    order = pd.concat([order,pd.get_dummies(order['pclass'],'pclass').applymap(lambda x: {1.0: True, 0.0: False}[x])],axis=1)
    return order

In [6]:
train_order = process_order(train_order)
test_order = process_order(test_order)

In [8]:
def map_group(group):
    res = pd.Series()
    res['request'] = group['request'].count()
    res['answer'] = group['request'].sum()
    res['price_avg'] = group['Price'].mean()
    pclass_cols = filter(lambda x: x[:7]=='pclass_',group.columns)
    pclass_values = group[pclass_cols].sum()
    res = pd.concat([res,pclass_values])
    return res

In [13]:
districts = train_order['start_district_hash'].unique()

In [14]:
set(districts) == set(test_order['start_district_hash'].unique())

True

In [16]:
def get_order_group(order):
    order_group = dict()
    for district in districts:
        tmp = order[order['start_district_hash']==district]
        order_group[district] = tmp.groupby('datetimeslot').apply(lambda g: map_group(g))      
    return order_group

In [17]:
train_order_group = get_order_group(train_order)

In [18]:
test_order_group = get_order_group(test_order)

In [31]:
def train_test_order(order, dts):
    '''
    This funciton will not consider the first 3 time slot, for all the days. 
    It could be changed when test data set changes.
    '''
    timeslot = pd.get_dummies(dts.map(lambda x: int(x%1000)),'timeslot')
    #     target_timeslot = ['timeslot_'+str(i+1) for i in range(144)]
    target_timeslot = ['timeslot_'+str(i+1) for i in range(144)]
    timeslot = pd.DataFrame(timeslot, columns=target_timeslot).fillna(0)
    timeslot.index = dts
    timeslot = timeslot.applymap(lambda x: {1.0: True, 0.0: False}[x])
    train = pd.concat([order.ix[dts-1].rename(columns=lambda x: '1_'+x,index=lambda x: x+1),
                       order.ix[dts-2].rename(columns=lambda x: '2_'+x,index=lambda x: x+2),
                       order.ix[dts-3].rename(columns=lambda x: '3_'+x,index=lambda x: x+3),
                       timeslot], axis=1)
    train = train.dropna()
    test = (order['request'].ix[train.index]-order['answer'].ix[train.index])
    return [train,test]

In [32]:
train_XYgroup = dict()
for district in train_order_group:
    train_XYgroup[district] = train_test_order(train_order_group[district],train_order_group[district].index)

In [33]:
test_XYgroup = dict()
for district in test_order_group:
    test_XYgroup[district] = train_test_order(test_order_group[district],test_target['datetimeslot'])

In [34]:
for district in train_order_group:
    sample_scaler = myStandardScaler()
    train_XYgroup[district][0] = sample_scaler.fit_transform(train_XYgroup[district][0])
    test_XYgroup[district][0] = sample_scaler.transform(test_XYgroup[district][0])

In [36]:
rfr_models = {district: RandomForestRegressor(50, max_depth=12) for district in districts}

In [37]:
def evaluate(target, predict):
    return ((target-predict).abs()/target).replace(np.inf,0).mean()

In [39]:
test_prediction = dict()
for district, model in rfr_models.items():
    model.fit(*train_XYgroup[district])
    train_prediction = pd.Series(np.floor(model.predict(train_XYgroup[district][0])),index=train_XYgroup[district][1].index)
    test_prediction[district] = np.floor(model.predict(test_XYgroup[district][0])) - test_XYgroup[district][1].fillna(0)
    print district,'r2 score', model.score(*train_XYgroup[district])
    print district,'metric', evaluate(model.predict(train_XYgroup[district][0]),train_prediction)

38d5ad2d22b61109fd8e7b43cd0e8901 r2 score 0.976206929144
38d5ad2d22b61109fd8e7b43cd0e8901 metric 0.16656541348
08f5b445ec6b29deba62e6fd8b0325a6 r2 score 0.772898555692
08f5b445ec6b29deba62e6fd8b0325a6 metric 0.7344593218
364bf755f9b270f0f9141d1a61de43ee r2 score 0.852443876634
364bf755f9b270f0f9141d1a61de43ee metric 0.360543876501
49ac89aa860c27e26c0836cb8dab2df2 r2 score 0.57171773452
49ac89aa860c27e26c0836cb8dab2df2 metric 0.868592688746
8bb37d24db1ad665e706c2655d9c4c72 r2 score 0.717480497063
8bb37d24db1ad665e706c2655d9c4c72 metric 0.417789602114
08232402614a9b48895cc3d0aeb0e9f2 r2 score 0.580240614214
08232402614a9b48895cc3d0aeb0e9f2 metric 0.901099510105
b702e920dcd2765e624dc1ce3a770512 r2 score 0.81876105287
b702e920dcd2765e624dc1ce3a770512 metric 0.254310995455
52d7b69796362a8ed1691a6cc02ddde4 r2 score 0.478174339602
52d7b69796362a8ed1691a6cc02ddde4 metric 0.898683187479
62afaf3288e236b389af9cfdc5206415 r2 score 0.96693969669
62afaf3288e236b389af9cfdc5206415 metric 0.06179650498

In [44]:
pd.DataFrame({'prediction':test_prediction['d524868ce69cb9db10fc5af177fb9423'],'district':1})

Unnamed: 0_level_0,district,prediction
datetimeslot,Unnamed: 1_level_1,Unnamed: 2_level_1
20160122046,1,5.0
20160122058,1,0.0
20160122082,1,0.0
20160122118,1,0.0
20160122130,1,0.0
20160122142,1,0.0
20160124070,1,0.0
20160124082,1,0.0
20160124106,1,0.0
20160124118,1,0.0


In [97]:
def prediction2submit(prediction):
    res = []
    for district in prediction:
        res.append(pd.DataFrame({'prediction':test_prediction[district],'district':district}))
    res_1 = pd.concat(res)
#     res_1['district'] = res_1['district'].map(lambda x: cluster_map.ix[x,0])
    res_2 = pd.DataFrame()
    res_2['district'] = res_1['district'].map(lambda x: cluster_map.loc[x,'district_id'])
    res_2['dts'] = res_1.index.map(lambda x: '{0}-{1}'.format(dateutil_parse(str(x/1000)).date(),x%1000))
    res_2 ['prediction'] = res_1['prediction']
    res_2['dts_sort'] = res_2.index
    return res_2.sort_values(['dts_sort','district']).drop('dts_sort',axis=1)

In [98]:
prediction2submit(test_prediction).to_csv(PATH+'submit/order_rf.csv',header=False,index=None)