In [197]:
import os
from dateutil.parser import parse as dateutil_parse
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.cross_validation import train_test_split,cross_val_score

PATH = 'season_1/'
CLEAN_PATH = PATH+'clean/'

from mylib import myStandardScaler,process_order,process_traffic,get_order_group,get_traffic_group,XY_order_traffic, prediction2submit, metrics, DISTRICTS

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
cluster_map = pd.read_csv(CLEAN_PATH+'cluster_map.csv',index_col=0)
poi = pd.read_csv(CLEAN_PATH+'poi.csv',index_col=0)
train_order = pd.read_pickle(CLEAN_PATH+'train_order.pickle')
test_order = pd.read_pickle(CLEAN_PATH+'test_order.pickle')
train_traffic = pd.read_pickle(CLEAN_PATH+'train_traffic.pickle')
test_traffic = pd.read_pickle(CLEAN_PATH+'test_traffic.pickle')
train_weather = pd.read_pickle(CLEAN_PATH+'train_weather.pickle')
test_weather = pd.read_pickle(CLEAN_PATH+'test_weather.pickle')
test_target = pd.read_csv(CLEAN_PATH+'test_target.csv',index_col=0,parse_dates=True)

In [5]:
train_order = process_order(train_order)
test_order = process_order(test_order)
train_traffic = process_traffic(train_traffic)
test_traffic = process_traffic(test_traffic)

In [28]:
train_order_group = get_order_group(train_order)
test_order_group = get_order_group(test_order)
train_traffic_group = get_traffic_group(train_traffic)
test_traffic_group = get_traffic_group(test_traffic)

In [156]:
train_slot = pd.Index(sorted(train_order['datetimeslot'].unique()))
train_slot = pd.Index(filter(lambda x: x%1000 >4,train_slot))
test_slot = test_target['datetimeslot']

In [157]:
tmp = XY_order_traffic('c4ec24e0a58ebedaa1661e5c09e47bb5',train_order_group,train_traffic_group,train_slot)
print tmp[0].shape
print tmp[1].shape

(2940, 204)
(2940L,)


In [152]:
tmp = XY_order_traffic('4725c39a5e5f4c188d382da3910b3f3f',test_order_group,test_traffic_group,test_slot)
print tmp[0].shape
print tmp[1].shape

(43, 216)
(43L,)


In [192]:
train_XY_group = dict()
for district in DISTRICTS:
    train_XY_group[district] = XY_order_traffic(district,train_order_group,train_traffic_group,train_slot)

In [193]:
test_XY_group = dict()
for district in DISTRICTS:
    test_XY_group[district] = XY_order_traffic(district,test_order_group,test_traffic_group,test_slot)

In [194]:
train_XY_group.values()[0][1].isnull().sum()

0

In [195]:
for district in DISTRICTS:
    scaler = myStandardScaler()
    train_XY_group[district][0] = scaler.fit_transform(train_XY_group[district][0])
    test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])

In [None]:
rfr_models = {district: RandomForestRegressor(100, max_depth=14) for district in DISTRICTS}
test_prediction = dict()
for district, model in rfr_models.items():
    model.fit(*train_XY_group[district])
    train_prediction = pd.Series(np.floor(model.predict(train_XY_group[district][0])),index=train_XY_group[district][1].index)
    test_prediction[district] = np.floor(model.predict(test_XY_group[district][0])) - test_XY_group[district][1].fillna(0)
    print district,'r2 score', model.score(*train_XY_group[district])
    print district,'metric', metrics(model.predict(train_XY_group[district][0]),train_prediction)

38d5ad2d22b61109fd8e7b43cd0e8901 r2 score 0.980448340084
38d5ad2d22b61109fd8e7b43cd0e8901 metric 0.167567164954
08f5b445ec6b29deba62e6fd8b0325a6 r2 score 0.782500009358
08f5b445ec6b29deba62e6fd8b0325a6 metric 0.910008773245
364bf755f9b270f0f9141d1a61de43ee r2 score 0.87857769232
364bf755f9b270f0f9141d1a61de43ee metric 0.393285438549
49ac89aa860c27e26c0836cb8dab2df2 r2 score 0.630291058383
49ac89aa860c27e26c0836cb8dab2df2 metric 0.9475257432
8bb37d24db1ad665e706c2655d9c4c72 r2 score 0.764883070798
8bb37d24db1ad665e706c2655d9c4c72 metric 0.564738345421
dd8d3b9665536d6e05b29c2648c0e69a r2 score 0.79049450771
dd8d3b9665536d6e05b29c2648c0e69a metric 0.691325251872
08232402614a9b48895cc3d0aeb0e9f2 r2 score 0.59539870614
08232402614a9b48895cc3d0aeb0e9f2 metric 0.952734821734
b702e920dcd2765e624dc1ce3a770512 r2 score 0.867620697189
b702e920dcd2765e624dc1ce3a770512 metric 0.36466319282
52d7b69796362a8ed1691a6cc02ddde4 r2 score 0.624243407443
52d7b69796362a8ed1691a6cc02ddde4 metric 0.89933351013

In [None]:
prediction2submit(test_prediction, cluster_map).to_csv(PATH+'submit/rf14_order_traffic.csv',header=False,index=None)