In [1]:
import os
from dateutil.parser import parse as dateutil_parse
import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.cross_validation import train_test_split,cross_val_score
from six.moves import cPickle as pickle

from mylib import myStandardScaler,process_order,process_traffic,get_order_group,get_traffic_group,XY_order_traffic, prediction2submit, \
    Search_Model, DISTRICTS, request_answer_count, XY_order_cluster

# Root data directory; CLEAN_PATH and the submission file path are built from it.
PATH = 'season_1/'
# Directory the cleaned CSV / pickle inputs are read from.
CLEAN_PATH = PATH+'clean/'
# Directory the per-district test predictions of the model search are pickled into.
SEARCH_PATH = 'clf_rf/'

# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Reload imported modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
# CSV inputs: the first column of each file is used as the index.
cluster_map = pd.read_csv(CLEAN_PATH + 'cluster_map.csv', index_col=0)
poi = pd.read_csv(CLEAN_PATH + 'poi.csv', index_col=0)
test_target = pd.read_csv(CLEAN_PATH + 'test_target.csv', index_col=0, parse_dates=True)

# Pickled order / traffic groups for the train and test periods.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by this project's own cleaning step.
train_order_group = pd.read_pickle(CLEAN_PATH + 'train_order_group.pickle')
train_traffic_group = pd.read_pickle(CLEAN_PATH + 'train_traffic_group.pickle')
test_order_group = pd.read_pickle(CLEAN_PATH + 'test_order_group.pickle')
test_traffic_group = pd.read_pickle(CLEAN_PATH + 'test_traffic_group.pickle')

In [3]:
# Number of keys in the training traffic group (presumably one key per district
# that has traffic data -- confirm against the cleaning step).
len(train_traffic_group.keys())

65

In [4]:
# Derive request/answer counts from the training orders (helper from mylib).
ra_count = request_answer_count(train_order_group)
# Feature table used for clustering: POI columns and the counts side by side,
# aligned on their index.
train_cluster = pd.concat([poi, ra_count], axis = 1)

In [5]:
# Number of K-Means clusters; also the number of cluster-level models trained later.
label_range = 8
# A fixed random_state makes the cluster ids reproducible across kernel restarts.
# This matters because the model search below is keyed by cluster id and its
# predictions are cached on disk per district: without a seed, a second run
# (or a notebook sharing the same cache directory) would assign districts to
# differently numbered clusters.
clf = KMeans(n_clusters=label_range, max_iter=100000, random_state=0)
# fit() returns the estimator itself; binding it keeps the repr out of the cell output.
s = clf.fit(train_cluster)

In [6]:
# One-column frame ('label') holding the fitted cluster id of every row of
# train_cluster, indexed exactly like train_cluster.
district_label = pd.DataFrame(clf.labels_, index=train_cluster.index, columns=['label'])

In [7]:
# Sorted, de-duplicated slot ids taken from the index of the first order group.
first_order_group = list(train_order_group.values())[0]
all_slots = sorted(first_order_group.index.unique())
# Keep only slots whose value modulo 1000 is greater than 4
# (presumably the last three digits are the slot number within a day, so this
# skips the earliest slots -- TODO confirm the slot encoding).
train_slot = pd.Index([slot for slot in all_slots if slot % 1000 > 4])
# Slots to predict, as listed in the test target file.
test_slot = test_target['datetimeslot']

In [8]:
now = time.time()

# Training data: one [X, Y] entry per cluster label.
train_XY_group = dict()
for label in range(label_range):
    train_XY_group[label] = XY_order_cluster(label, district_label,train_order_group,train_traffic_group,train_slot)
    # Sort the feature columns by name (presumably so train and test share one column order).
    train_XY_group[label][0] = train_XY_group[label][0].sort_index(axis=1)

# Test data: one [X, Y] entry per district, with the same column ordering.
test_XY_group = dict()
for district in DISTRICTS:
    test_XY_group[district] = XY_order_traffic(district,test_order_group,test_traffic_group,test_slot)
    test_XY_group[district][0] = test_XY_group[district][0].sort_index(axis=1)

# Fit one scaler per cluster on its training features and apply it to the test
# features of every district that carries that cluster label and has traffic data.
for label in range(label_range):
    scaler = myStandardScaler()
    train_XY_group[label][0] = scaler.fit_transform(train_XY_group[label][0])
    for district in DISTRICTS:
        # .loc is the label-based replacement for the deprecated .ix indexer.
        if district_label.loc[district].values[0] == label and district in train_traffic_group:
            test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])

stop = time.time()
# Split the elapsed seconds into H:M:S; minutes must be taken within the hour,
# otherwise runs longer than an hour report total minutes in the middle field.
elapsed = int(stop - now)
print('Take %02d:%02d:%02d' % (elapsed // 3600, elapsed % 3600 // 60, elapsed % 60))

Take 00:00:08


# 单独处理没有交通信息的那个地点 (Handle the one district that has no traffic data separately)

In [9]:
# Districts absent from train_traffic_group get their own training set and
# their own scaler, stored in train_XY_group under the district id.
for district in DISTRICTS:
    if district not in train_traffic_group:
        train_XY_group[district] = XY_order_traffic(district,train_order_group,train_traffic_group,train_slot)
        # The test features of every district were column-sorted when they were
        # built; sort the training features the same way so that train and test
        # columns are in the same order for the scaler and the model.
        train_XY_group[district][0] = train_XY_group[district][0].sort_index(axis=1)
        scaler = myStandardScaler()
        train_XY_group[district][0] = scaler.fit_transform(train_XY_group[district][0])
        test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])

In [None]:
# Show every training group key: cluster labels plus any district keyed separately.
list(train_XY_group)

[0, 1, 2, 3, 4, 5, 6, 7, 'c4ec24e0a58ebedaa1661e5c09e47bb5']

# Cluster + Searcher + Random Forest

In [None]:
# Hyper-parameter grid handed to every Search_Model.fit call below.
grid_params = {'n_estimators': [80] ,'max_depth': np.arange(10, 18), 'min_samples_leaf': [2, 6], 
                     'min_samples_split': [2, 6], 'max_features': ['log2', 'sqrt',None]}
# One searcher per training group (cluster labels and separately keyed districts).
search_models = {key: Search_Model(RandomForestRegressor) for key in train_XY_group.keys()}
test_prediction = dict()
# Groups are visited in reverse key order.
for key, model in list(search_models.items())[::-1]:
    # Stop as soon as SEARCH_PATH holds 66 files (presumably one cached
    # prediction per district, possibly written by another run sharing the
    # directory -- confirm the expected count).
    if len(os.listdir(SEARCH_PATH))==66:
        print('We can together! :)')
        break
    now = time.time()
    if key in DISTRICTS:
        # Group keyed by a district id: the model serves that district only.
        print('Searching %s...' % key)
        targets = [key]
    else:
        # Group keyed by a cluster label: the model serves the districts with
        # that label. Districts without traffic data are skipped, mirroring the
        # scaling step: their test features were scaled by their own scaler and
        # they are served by their dedicated model, which the cluster model must
        # not overwrite.
        print('Searching cluster %d...' % key)
        targets = [district for district in DISTRICTS
                   if district_label.loc[district].values[0] == key
                   and district in train_traffic_group]
    model.fit(grid_params, *train_XY_group[key])
    for district in targets:
        test_prediction[district] = model.predict(test_XY_group[district][0]) - test_XY_group[district][1].fillna(0)
        with open(SEARCH_PATH + 'test_prediction_%s.pickle'%(district),'wb') as f:
            pickle.dump(test_prediction[district],f)
    stop = time.time()
    # Minutes are taken within the hour so runs longer than an hour print correctly.
    elapsed = int(stop - now)
    print('Take %02d:%02d:%02d' % (elapsed // 3600, elapsed % 3600 // 60, elapsed % 60))

# When the loop stopped early, some districts were never predicted in this
# session; pick their predictions up from the cache files so that the
# submission step sees every district that is available.
# NOTE(review): only unpickle files written by this pipeline.
for district in DISTRICTS:
    if district not in test_prediction:
        cache_file = SEARCH_PATH + 'test_prediction_%s.pickle'%(district)
        if os.path.exists(cache_file):
            with open(cache_file,'rb') as f:
                test_prediction[district] = pickle.load(f)

Searching c4ec24e0a58ebedaa1661e5c09e47bb5...


In [None]:
# Turn the per-district predictions into the submission table (helper from mylib);
# the cells below expect it to carry a 'prediction' column.
submit = prediction2submit(test_prediction, cluster_map)

In [None]:
def ceil_gap(x):
    """Clamp ``x`` from below at 1.

    Values smaller than 1 are replaced by the integer 1; every other value is
    returned unchanged. Despite the name, no rounding is performed.
    """
    return 1 if x < 1 else x

In [None]:
# Raise every prediction below 1 up to 1, store the result as the 'gap'
# column, then discard the raw 'prediction' column.
newgap = submit['prediction'].apply(ceil_gap)
submit['gap'] = newgap
submit = submit.drop('prediction', axis=1)

In [None]:
# Write the submission as a bare CSV: no index column and no header row.
submission_file = PATH + 'submit/cluster_searchrf_order_traffic_1.csv'
submit.to_csv(submission_file, index=False, header=False)