In [1]:
import os
from dateutil.parser import parse as dateutil_parse
import time
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.cross_validation import train_test_split,cross_val_score
from six.moves import cPickle as pickle

# Root directory of the season-1 data, relative to the notebook's working directory.
PATH = 'season_1/'
# Sub-directory holding the preprocessed CSV / pickle inputs that the cells below read.
CLEAN_PATH = PATH+'clean/'

from mylib import myStandardScaler,process_order,process_traffic,get_order_group,get_traffic_group,XY_order_traffic, prediction2submit, Search_Model, DISTRICTS

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load the preprocessed inputs from CLEAN_PATH.
# CSV lookup tables; the first column of each file becomes the index.
cluster_map = pd.read_csv(CLEAN_PATH + 'cluster_map.csv', index_col=0)
poi = pd.read_csv(CLEAN_PATH + 'poi.csv', index_col=0)

# Pickled order / traffic groups: dicts keyed by district hash (they are
# indexed with `[district]` further down).
# NOTE(review): unpickling executes arbitrary code -- only load files that
# were produced locally by the cleaning step.
(train_order_group,
 test_order_group,
 train_traffic_group,
 test_traffic_group) = (
    pd.read_pickle(CLEAN_PATH + pickle_name)
    for pickle_name in ('train_order_group.pickle',
                        'test_order_group.pickle',
                        'train_traffic_group.pickle',
                        'test_traffic_group.pickle')
)

# Slots to predict; the index column is parsed as dates.
test_target = pd.read_csv(CLEAN_PATH + 'test_target.csv',
                          index_col=0, parse_dates=True)

In [3]:
# Build the list of training slots from the slot ids present in one district's
# order table.  Slot ids are integers of the form YYYYMMDDSSS (date followed by
# a three-digit slot number), so `% 1000` extracts the slot number.
# NOTE(review): this takes whichever district the dict yields first --
# presumably every district covers the same slots; confirm.
first_order_table = next(iter(train_order_group.values()))  # works on Python 2 and 3
all_train_slots = pd.Index(sorted(first_order_table.index.unique()))

# Keep only slots numbered 5 and above.
# NOTE(review): presumably because the features need a few preceding slots -- confirm
# against XY_order_traffic.
train_slot = all_train_slots[all_train_slots % 1000 > 4]

# Slots for which a prediction has to be produced.
test_slot = test_target['datetimeslot']

In [4]:
# Remove every slot of 2016-01-01 from the training slots.
# Slot ids look like YYYYMMDDSSS, so floor-dividing by 1000 yields the date.
# `//` is required here: with true division (Python 3, or
# `from __future__ import division`) `x / 1000` is never equal to an integer
# date and the filter would silently keep everything.
train_slot = train_slot[train_slot // 1000 != 20160101]
train_slot

Int64Index([20160102005, 20160102006, 20160102007, 20160102008, 20160102009,
            20160102010, 20160102011, 20160102012, 20160102013, 20160102014,
            ...
            20160121135, 20160121136, 20160121137, 20160121138, 20160121139,
            20160121140, 20160121141, 20160121142, 20160121143, 20160121144],
           dtype='int64', length=2791)

# Delete the days 20160101 and 20160116 from the training data

In [5]:
# Remove two whole days from the training order tables and from the slot list.
# Slot ids are integers of the form YYYYMMDDSSS with slot numbers 1..144, so
# the rows of one day are exactly those whose id lies in
# [day * 1000 + 1, day * 1000 + 144].
excluded_days = (20160101, 20160116)

for district in DISTRICTS:
    orders = train_order_group[district]
    for day in excluded_days:
        # Boolean mask instead of the removed `.ix` label slice; selects the
        # same rows and is a no-op when the day is already gone, so the cell
        # can be re-run safely.
        in_day = (orders.index >= day * 1000 + 1) & (orders.index <= day * 1000 + 144)
        orders = orders[~in_day]
    train_order_group[district] = orders

# Keep the slot list consistent with the tables.  Floor division extracts the
# date part regardless of the Python version's `/` semantics.
train_slot = train_slot[~(train_slot // 1000).isin(excluded_days)]

In [6]:
# Remove the same two days (2016-01-01 and 2016-01-16) from the training
# traffic tables.  Slot ids are integers of the form YYYYMMDDSSS with slot
# numbers 1..144.
excluded_days = (20160101, 20160116)

for district in DISTRICTS:
    # This one district is skipped on purpose.
    # NOTE(review): presumably it has no traffic table at all -- confirm.
    if district == 'c4ec24e0a58ebedaa1661e5c09e47bb5':
        continue
    traffic = train_traffic_group[district]
    for day in excluded_days:
        # Boolean mask instead of the removed `.ix` label slice; a no-op when
        # the day is already absent.
        in_day = (traffic.index >= day * 1000 + 1) & (traffic.index <= day * 1000 + 144)
        traffic = traffic[~in_day]
    train_traffic_group[district] = traffic

In [7]:
# Sanity check: display the traffic tables after the two days were removed.
# The dict is keyed by district hash; each table is indexed by datetimeslot
# and has the columns level_1 .. level_4.
# NOTE(review): this prints every district's full table; showing `.head()` of
# a single district would keep the notebook output readable.
train_traffic_group

{'08232402614a9b48895cc3d0aeb0e9f2':               level_1  level_2  level_3  level_4
 datetimeslot                                    
 20160102002        95        4        8        1
 20160102003        94        2        4        1
 20160102004        98       12        3        4
 20160102005       129       20        4        8
 20160102006       108       24        3        1
 20160102007        51        1        0        3
 20160102008        68        6        0        3
 20160102009        58       10        5        0
 20160102010        43       15        2        4
 20160102011        57        9        2        2
 20160102012        29        6        8        2
 20160102013        42        4        2        1
 20160102014        62        8        7        2
 20160102015        76        2        3        4
 20160102016        77        3        0        6
 20160102017        71        2        0        6
 20160102018       106        5        2        1
 20160102019  

In [8]:
# Build the per-district model inputs for train and test, then standardise the
# features.  XY_order_traffic returns an indexable, mutable pair whose element
# 0 is scaled below; element 1 is left untouched.
now = time.time()

train_XY_group = {
    district: XY_order_traffic(district, train_order_group, train_traffic_group, train_slot)
    for district in DISTRICTS
}
test_XY_group = {
    district: XY_order_traffic(district, test_order_group, test_traffic_group, test_slot)
    for district in DISTRICTS
}

for district in DISTRICTS:
    # One scaler per district, fitted on the training features only and then
    # applied unchanged to the test features.
    scaler = myStandardScaler()
    train_XY_group[district][0] = scaler.fit_transform(train_XY_group[district][0])
    test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])

stop = time.time()
# Split the elapsed seconds into h:m:s.  The minutes field must wrap at 60;
# formatting (stop-now)/60 directly prints the *total* minutes once a run
# takes an hour or longer.
elapsed_minutes, elapsed_seconds = divmod(int(stop - now), 60)
elapsed_hours, elapsed_minutes = divmod(elapsed_minutes, 60)
print('Take %02d:%02d:%02d' % (elapsed_hours, elapsed_minutes, elapsed_seconds))

Take 00:00:15


In [9]:
# Hyper-parameter grid searched for every district's random forest.
grid_params = {
    'n_estimators': [80],
    'max_depth': np.arange(10, 18),
    'min_samples_leaf': [2, 6],
    'min_samples_split': [2, 6],
    'max_features': ['log2', 'sqrt', None],
}

# One independent search per district.
# NOTE(review): no random seed is set, so the fitted forests (and the scores
# printed by the search) will differ between runs.
search_models = {district: Search_Model(RandomForestRegressor) for district in DISTRICTS}
test_prediction = dict()

# list(...) makes the reversed iteration work on Python 3 as well, where
# dict.items() is a view and cannot be sliced.
for district, model in list(search_models.items())[::-1]:
    now = time.time()
    print('Searching %s...' % district)
    model.fit(grid_params, *train_XY_group[district])

    # NOTE(review): the second element of the test pair (NaNs treated as 0) is
    # subtracted from the model output -- presumably the model predicts a
    # quantity that includes it; confirm against XY_order_traffic.
    test_prediction[district] = model.predict(test_XY_group[district][0]) - test_XY_group[district][1].fillna(0)

    # Persist each district's prediction as soon as it is available so a later
    # failure does not lose the finished searches.
    with open(CLEAN_PATH+'prediction/test_prediction_%s.pickle'%(district),'wb') as f:
        pickle.dump(test_prediction[district], f)

    stop = time.time()
    # Minutes must wrap at 60; formatting (stop-now)/60 directly prints the
    # total minutes for searches that run an hour or longer.
    elapsed_minutes, elapsed_seconds = divmod(int(stop - now), 60)
    elapsed_hours, elapsed_minutes = divmod(elapsed_minutes, 60)
    print('Take %02d:%02d:%02d' % (elapsed_hours, elapsed_minutes, elapsed_seconds))

Searching 2350be163432e42270d2670cb3c02f80...
Best Params: {'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 6, 'n_estimators': 80, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'log2', 'max_depth': 10}
Fit score: 0.206789723019
The metrics: 0.181179850463
Take 00:04:00
Searching 4b7f6f4e2bf237b6cc58f57142bea5c0...
Best Params: {'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 6, 'n_estimators': 80, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'criterion': 'mse', 'random_state': None, 'max_features': 'log2', 'max_depth': 10}
Fit score: 0.196233353094
The metrics: 0.168569945089
Take 00:04:14
Searching 82cc4851f9e4faa4e54309f8bb73fd7c...
Best Params: {'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'verbose': 0, 'max_leaf_nodes': None, 'boo

In [11]:
# Turn the per-district predictions into the submission table.
# NOTE(review): presumably cluster_map is used to translate district hashes
# into the ids expected in the submission -- confirm in mylib.prediction2submit.
submit = prediction2submit(test_prediction, cluster_map)

In [13]:
# Write the submission without index column and without header row.
# `index` is a boolean parameter; pass False explicitly instead of relying on
# None being falsy.
# NOTE(review): the file name says "delete1and14" while this notebook removes
# the days 20160101 and 20160116 -- confirm which one is intended.
submit_path = PATH + 'submit/searchrf_order_traffic_delete1and14.csv'
submit.to_csv(submit_path, index=False, header=False)