In [47]:
import os
from dateutil.parser import parse as dateutil_parse
import time
import pandas as pd
import numpy as np
import graphlab as gl
from graphlab import SFrame

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.cross_validation import train_test_split,cross_val_score
from six.moves import cPickle as pickle

from mylib import myStandardScaler,process_order,process_traffic,get_order_group,get_traffic_group,XY_order_traffic, prediction2submit, \
    Search_Model, DISTRICTS, request_answer_count, XY_order_cluster, mymetrics

PATH = 'season_1/'
CLEAN_PATH = PATH+'clean/'
SEARCH_PATH = 'clf_gl/'

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
# Load every input stored under CLEAN_PATH.

# Pickled order / traffic groups for the train and test splits.
train_order_group = pd.read_pickle(CLEAN_PATH + 'train_order_group.pickle')
train_traffic_group = pd.read_pickle(CLEAN_PATH + 'train_traffic_group.pickle')
test_order_group = pd.read_pickle(CLEAN_PATH + 'test_order_group.pickle')
test_traffic_group = pd.read_pickle(CLEAN_PATH + 'test_traffic_group.pickle')

# CSV tables; the first column of each file becomes the index.
cluster_map = pd.read_csv(CLEAN_PATH + 'cluster_map.csv', index_col=0)
poi = pd.read_csv(CLEAN_PATH + 'poi.csv', index_col=0)
# parse_dates=True additionally asks pandas to parse the index as dates.
test_target = pd.read_csv(CLEAN_PATH + 'test_target.csv', index_col=0, parse_dates=True)

In [49]:
# Summary computed by mylib.request_answer_count from the training orders
# (presumably per-district request/answer counts -- confirm in mylib).
ra_count = request_answer_count(train_order_group)
# Clustering input: POI table and the counts side by side, aligned on the index.
train_cluster = pd.concat([poi, ra_count], axis = 1)

In [50]:
# Number of K-means clusters; later cells loop over range(label_range)
# to build one training set per cluster label.
label_range = 12
# Fix the seed: K-means initialisation is random, and without random_state the
# cluster labels (and everything derived from them) change on every re-run.
# The value 1 mirrors the random_seed used for the boosted-tree models.
clf = KMeans(n_clusters=label_range, max_iter=100000, random_state=1)
# fit() returns the estimator itself; assigning it keeps the cell output empty.
s = clf.fit(train_cluster)

In [51]:
# Single-column frame ('label') holding the K-means cluster id of each row of
# train_cluster, indexed exactly like the clustering input.
district_label = pd.DataFrame(clf.labels_, index=train_cluster.index, columns=['label'])
# Display the raw label array.
clf.labels_

array([ 3,  3,  3,  4,  3,  3,  3,  3,  3,  4, 11,  3, 11, 11,  7,  3,  3,
        3,  1,  3,  3,  3,  3,  3,  7,  3,  3,  3,  8,  3,  3,  9,  3,  3,
        3,  3,  1,  3,  7,  3,  4, 10,  6,  7,  3,  3,  5,  7,  7,  3,  7,
        7,  3,  0,  3,  3,  2,  3,  3,  7,  3,  0,  3,  3,  0,  3])

In [52]:
# Sorted unique index values of the first training order group.
sorted_slots = sorted(train_order_group.values()[0].index.unique())
# Keep only slots whose value modulo 1000 exceeds 4.
# NOTE(review): presumably the last three digits encode the slot of the day and
# the earliest slots are dropped for lack of history -- confirm against mylib.
train_slot = pd.Index([slot for slot in sorted_slots if slot % 1000 > 4])
# Slots to predict come straight from the test target table.
test_slot = test_target['datetimeslot']

In [53]:
now = time.time()

train_XY_group = dict()
for label in range(label_range):
    train_XY_group[label] = XY_order_cluster(label, district_label,train_order_group,train_traffic_group,train_slot)
    train_XY_group[label][0] = train_XY_group[label][0].sort_index(axis=1)
test_XY_group = dict()
for district in DISTRICTS:
    test_XY_group[district] = XY_order_traffic(district,test_order_group,test_traffic_group,test_slot)
    test_XY_group[district][0] = test_XY_group[district][0].sort_index(axis=1)
for label in range(label_range):
    scaler = myStandardScaler()
    train_XY_group[label][0] = scaler.fit_transform(train_XY_group[label][0])
    for district in DISTRICTS:
        if(district_label.ix[district].values[0] == label and district in train_traffic_group):
            Xnumerical = test_XY_group[district][0][test_XY_group[district][0].columns[test_XY_group[district][0].dtypes != bool]]
            test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])    
stop = time.time()
print 'Take %02d:%02d:%02d' % ((stop-now)/3600,(stop-now)/60,(stop-now)%60)

Take 00:00:09


# Handle the one district that has no traffic data separately (单独处理没有交通信息的那个地点)

In [54]:
# Districts absent from train_traffic_group were skipped by the per-cluster
# scaling, so each one gets its own training set and scaler, stored in
# train_XY_group under the district id itself.
for district in DISTRICTS:
    if district in train_traffic_group:
        continue
    district_XY = XY_order_traffic(district, train_order_group, train_traffic_group, train_slot)
    train_XY_group[district] = district_XY
    scaler = myStandardScaler()
    district_XY[0] = scaler.fit_transform(district_XY[0])
    # Scale the district's test features with the scaler fitted on its own training data.
    test_XY_group[district][0] = scaler.transform(test_XY_group[district][0])

In [55]:
# List the keys of train_XY_group: the integer cluster labels plus the id of
# any district that was added separately because it is missing from train_traffic_group.
train_XY_group.keys()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 'c4ec24e0a58ebedaa1661e5c09e47bb5']

In [56]:
all_now = time.time()
test_prediction = dict()
for key in train_XY_group.keys():
    if key in DISTRICTS:
        now = time.time()
        print 'Searching %s...'%key
        trainSet = SFrame(train_XY_group[key][0])
        targetSet = SFrame(train_XY_group[key][1])
        trainSet = trainSet.add_columns(targetSet)
        model = gl.boosted_trees_regression.create(trainSet, target='X1', features=train_XY_group[key][0].columns,
                                          max_iterations=5000, validation_set='auto', max_depth=5, step_size=0.03, 
                                          min_loss_reduction=1.0, min_child_weight=0.1, row_subsample=1.0, column_subsample=1.0, verbose=False, 
                                          random_seed=1, metric='auto')
        sPredict = model.predict(SFrame(test_XY_group[key][0]))
        myPredict = pd.Series(sPredict.to_numpy(), index = test_XY_group[key][0].index)
        test_prediction[key] = myPredict - test_XY_group[key][1].fillna(0)
        with open(SEARCH_PATH+'test_prediction_%s.pickle'%(key),'wb') as f:
            pickle.dump(test_prediction[key],f)
        print "The metrics:", mymetrics(pd.Series(model.predict(SFrame(train_XY_group[key][0])).to_numpy()), pd.Series(trainSet['X1'].to_numpy()))
        stop = time.time()
        print 'Take %02d:%02d:%02d' % ((stop-now)/3600,(stop-now)/60,(stop-now)%60)
    else:
        now = time.time()
        print 'Searching %d...'%key
        trainSet = SFrame(train_XY_group[key][0])
        targetSet = SFrame(train_XY_group[key][1])
        trainSet = trainSet.add_columns(targetSet)
        model = gl.boosted_trees_regression.create(trainSet, target='X1', features=train_XY_group[key][0].columns,
                                          max_iterations=5000, validation_set='auto', max_depth=5, step_size=0.03, 
                                          min_loss_reduction=1.0, min_child_weight=0.1, row_subsample=1.0, column_subsample=1.0, verbose=False, 
                                          random_seed=1, metric='auto')
        for district in DISTRICTS:
            if(district_label.ix[district].values[0] == key and district in train_traffic_group): 
                sPredict = model.predict(SFrame(test_XY_group[district][0]))
                myPredict = pd.Series(sPredict.to_numpy(), index = test_XY_group[district][0].index)
                test_prediction[district] = myPredict - test_XY_group[district][1].fillna(0)
                with open(SEARCH_PATH+'test_prediction_%s.pickle'%(district),'wb') as f:
                    pickle.dump(test_prediction[district],f)
        print "The metrics:", mymetrics(pd.Series(model.predict(SFrame(train_XY_group[key][0])).to_numpy()), \
                                                pd.Series(trainSet['X1'].to_numpy()))
        stop = time.time()
        print 'Take %02d:%02d:%02d' % ((stop-now)/3600,(stop-now)/60,(stop-now)%60)
all_stop = time.time()
print 'Totally take %02d:%02d:%02d' % ((all_stop-all_now)/3600,(all_stop-all_now)/60,(all_stop-all_now)%60)

Searching 0...
The metrics: 0.469801871863
Take 00:01:49
Searching 1...
The metrics: 0.148137409084
Take 00:01:22
Searching 2...
The metrics: 0.073780754724
Take 00:00:45
Searching 3...
The metrics: 1.29889032318
Take 00:45:51
Searching 4...
The metrics: 0.27095566053
Take 00:01:50
Searching 5...
The metrics: 0.228791366806
Take 00:00:40
Searching 6...
The metrics: 0.132574374631
Take 00:00:40
Searching 7...
The metrics: 0.660470792026
Take 00:04:46
Searching 8...
The metrics: 0.125653856081
Take 00:00:43
Searching 9...
The metrics: 0.257804671502
Take 00:00:44
Searching 10...
The metrics: 0.397719063981
Take 00:00:42
Searching 11...
The metrics: 0.385025134996
Take 00:02:07
Searching c4ec24e0a58ebedaa1661e5c09e47bb5...
The metrics: 1.07140233709
Take 00:00:44
Totally take 01:62:49


In [57]:
# Build the submission table from the per-district predictions with
# mylib.prediction2submit; later cells read its 'prediction' column.
submit = prediction2submit(test_prediction, cluster_map)

In [58]:
def ceil_gap(x):
    """Clamp a predicted gap from below at 1.

    Returns 1 when ``x`` is strictly less than 1, otherwise ``x`` unchanged
    (a NaN input is therefore passed through as NaN).
    """
    return 1 if x < 1 else x

In [59]:
# Clamp every prediction to at least 1 with ceil_gap.
newgap = submit['prediction'].apply(ceil_gap)
# Store the clamped values as 'gap', then discard the raw 'prediction' column.
submit['gap'] = newgap
submit = submit.drop('prediction', axis=1)

In [61]:
# to_csv does not create missing folders, so make sure the target directory
# exists instead of failing at the very last step of the notebook.
submit_dir = PATH + 'submit/'
if not os.path.isdir(submit_dir):
    os.makedirs(submit_dir)
# Written without the index column and without a header row.
submit.to_csv(submit_dir + 'cluster_rl_boosted_tree_12.csv', index=False, header=False)