In [77]:
import os
from dateutil.parser import parse as dateutil_parse
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler as skStandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.cross_validation import train_test_split,cross_val_score

PATH = 'season_1/'
CLEAN_PATH = PATH+'clean/'

%matplotlib inline

In [78]:
class myStandardScaler(skStandardScaler):
    '''
    Only use fit_transform and tranform, specific for dealing with pd.DataFrame.
    Only scale the numerical features.
    '''

    def fit_transform(self, X):
        Xnumerical = X[X.columns[X.dtypes != bool]]
        Xdummy = X[X.columns[X.dtypes == bool]]
        scaledXnumerical = super(myStandardScaler, self).fit_transform(Xnumerical)
        Xnumerical = pd.DataFrame(scaledXnumerical, index=Xnumerical.index, columns=Xnumerical.columns)
        return pd.concat([Xnumerical, Xdummy], axis=1)

    def transform(self, X):
        Xnumerical = X[X.columns[X.dtypes != bool]]
        Xdummy = X[X.columns[X.dtypes == bool]]
        scaledXnumerical = super(myStandardScaler, self).transform(Xnumerical)
        Xnumerical = pd.DataFrame(scaledXnumerical, index=Xnumerical.index, columns=Xnumerical.columns)
        return pd.concat([Xnumerical, Xdummy], axis=1)

In [79]:
cluster_map = pd.read_csv(CLEAN_PATH+'cluster_map.csv',index_col=0)
poi = pd.read_csv(CLEAN_PATH+'poi.csv',index_col=0)
train_order = pd.read_pickle(CLEAN_PATH+'train_order.pickle')
test_order = pd.read_pickle(CLEAN_PATH+'test_order.pickle')
train_traffic = pd.read_pickle(CLEAN_PATH+'train_traffic.pickle')
test_traffic = pd.read_pickle(CLEAN_PATH+'test_traffic.pickle')
train_weather = pd.read_pickle(CLEAN_PATH+'train_weather.pickle')
test_weather = pd.read_pickle(CLEAN_PATH+'test_weather.pickle')
test_target = pd.read_csv(CLEAN_PATH+'test_target.csv',index_col=0,parse_dates=True)

In [80]:
test_target['datetimeslot'] = np.array(test_target.index.map(lambda x: x.year*10000+x.month*100+x.day),dtype=np.int64)*1000+test_target['timeslot'].values
test_target.to_csv(CLEAN_PATH+'test_target.csv')
test_target = pd.read_csv(CLEAN_PATH+'test_target.csv',index_col=0,parse_dates=True)
test_target.head()

Unnamed: 0_level_0,timeslot,datetimeslot
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-22,46,20160122046
2016-01-22,58,20160122058
2016-01-22,70,20160122070
2016-01-22,82,20160122082
2016-01-22,94,20160122094


In [81]:
train_order_index = pd.Series(range(len(train_order)),index=train_order['Time'])
test_order_index = pd.Series(range(len(test_order)),index=test_order['Time'])
train_traffic_index = pd.Series(range(len(train_traffic)),index=train_traffic['datetime'])
test_traffic_index = pd.Series(range(len(test_traffic)),index=test_traffic['datetime'])

In [82]:
def process_order(order):
    def pclass(p):
        class_set = range(5,20,5)+range(20,100,10)+range(100,501,100)
        idx_set = [p>cls for cls in class_set]
        return idx_set.index(False) if sum(idx_set)!=len(idx_set) else len(idx_set)
    order['timeslot'] = order['Time'].map(lambda x: (x.hour*60+x.minute)/10+1)
    order['datetimeslot'] = order['Time'].map(lambda x: x.year*10000+x.month*100+x.day)*1000+order['timeslot']
    order['pclass'] = order['Price'].map(lambda x: pclass(x))
    order = pd.concat([order,pd.get_dummies(order['pclass'],'pclass').applymap(lambda x: {1.0: True, 0.0: False}[x])],axis=1)
    return order

In [83]:
test_order = process_order(test_order)

In [84]:
train_order = process_order(train_order)

In [85]:
def map_group(group):
    res = pd.Series()
    res['request'] = group['request'].count()
    res['answer'] = group['request'].sum()
    res['price_avg'] = group['Price'].mean()
    pclass_cols = filter(lambda x: x[:7]=='pclass_',group.columns)
    pclass_values = group[pclass_cols].sum()
    res = pd.concat([res,pclass_values])
    return res

In [86]:
tmp = train_order[train_order['start_district_hash']=='b05379ac3f9b7d99370d443cfd5dcc28']

In [87]:
tmp_group = tmp.groupby('datetimeslot').apply(lambda g: map_group(g))

In [88]:
# test_order[test_order['start_district_hash']=='b05379ac3f9b7d99370d443cfd5dcc28']

In [89]:
test_tmp = test_order[test_order['start_district_hash']=='b05379ac3f9b7d99370d443cfd5dcc28'].groupby('datetimeslot').apply(lambda g: map_group(g))

In [90]:
def train_test_order(order, dts):
    '''
    This funciton will not consider the first 3 time slot, for all the days. 
    It could be change when test data set changes.
    '''
    timeslot = pd.get_dummies(dts.map(lambda x: int(x%1000)),'timeslot')
    timeslot = pd.DataFrame(timeslot, columns=['timeslot_'+str(i+1) for i in range(144)]).fillna(0)
    timeslot.index = dts
    timeslot = timeslot.applymap(lambda x: {1.0: True, 0.0: False}[x])
    train = pd.concat([order.ix[dts-1].rename(columns=lambda x: '1_'+x,index=lambda x: x+1),
                       order.ix[dts-2].rename(columns=lambda x: '2_'+x,index=lambda x: x+2),
                       order.ix[dts-3].rename(columns=lambda x: '3_'+x,index=lambda x: x+3),
                       timeslot], axis=1)
    train = train.dropna()
    test = (order['request'].ix[train.index]-order['answer'].ix[train.index])
    return train,test

In [91]:
sample_testX, sample_testY = train_test_order(test_tmp, test_target['datetimeslot'])

In [92]:
sample_trainX, sample_trainY = train_test_order(tmp_group, tmp_group.index)

In [93]:
sample_scaler = myStandardScaler()
sample_trainX = sample_scaler.fit_transform(sample_trainX)
sample_testX = sample_scaler.transform(sample_testX)

In [94]:
print sample_trainX.shape
print sample_testX.shape
print sample_trainY.shape
print sample_testY.shape

(2961, 204)
(43, 204)
(2961L,)
(43L,)


In [96]:
rfr.fit(sample_trainX,sample_trainY)
rfr.score(sample_trainX,sample_trainY)

0.98083052364416212

In [97]:
def evaluate(target, predict):
    return ((target-predict).abs()/target).replace(np.inf,0).mean()

In [102]:
prediction = pd.Series(rfr.predict(sample_trainX),index=sample_trainY.index)
evaluate(sample_trainY,prediction)

0.4581520586868448

In [115]:
prediction = pd.Series(np.floor(rfr.predict(sample_trainX)),index=sample_trainY.index)
# prediction[prediction<10]=0
evaluate(sample_trainY,prediction)

0.482533949623496

In [117]:
evaluate(sample_trainY,m)

0.5388705039514087

In [132]:
rfr = RandomForestRegressor(50, max_depth=None)

In [133]:
rfr.fit(sample_trainX,sample_trainY)
rfr.score(sample_trainX,sample_trainY)

0.98192688610110135

In [136]:
prediction = pd.Series(np.floor(rfr.predict(sample_trainX)),index=sample_trainY.index)
# prediction[prediction<3]=0
evaluate(sample_trainY,prediction)

0.3301811924264464

In [137]:
rfr = RandomForestRegressor(50, max_depth=None)

In [138]:
scores = cross_val_score(RandomForestRegressor(50, max_depth=None), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.83192855  0.83665273  0.81880207]
0.829127786867
0.00755180849321


In [302]:
scores = cross_val_score(RandomForestRegressor(100, max_depth=None), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.83232479  0.84234057  0.82456352]
0.833076292241
0.00727687787112


In [303]:
scores = cross_val_score(RandomForestRegressor(100, max_depth=15), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.84480065  0.83339436  0.81948089]
0.832558634403
0.0103536293373


In [306]:
scores = cross_val_score(RandomForestRegressor(100, max_depth=12), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.84230125  0.84026807  0.82606753]
0.836212283281
0.00722128827981


In [305]:
scores = cross_val_score(RandomForestRegressor(100, max_depth=10), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.82900214  0.83565789  0.82290252]
0.829187520419
0.00520900793899


In [310]:
params = {'n_estimators': 100}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.84919958  0.83410134  0.80653142]
0.82994411611
0.0176654990323


In [311]:
params = {'n_estimators': 200}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.85834383  0.84526779  0.807821  ]
0.837144205245
0.0214107980611


In [312]:
params = {'n_estimators': 300}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86356984  0.84781064  0.80441386]
0.838598112194
0.0250134664899


In [313]:
params = {'n_estimators': 500}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86747855  0.84982819  0.80352634]
0.840277693026
0.0269676380948


In [314]:
params = {'n_estimators': 666}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86641821  0.84908533  0.80368104]
0.839728193757
0.0264531661697


In [315]:
params = {'n_estimators': 600}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86590738  0.85088148  0.80008195]
0.838956933151
0.0281649065607


In [364]:
params = {'n_estimators': 600, 'max_depth':4}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87208529  0.85081425  0.81085309]
0.844584207304
0.0253831395665


In [365]:
params = {'n_estimators': 300, 'max_depth':4}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86829028  0.85028311  0.80984489]
0.842806092893
0.0244389741667


In [366]:
params = {'n_estimators': 200, 'max_depth':4}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86958425  0.84880533  0.8103415 ]
0.842910359575
0.0245423296572


In [367]:
params = {'n_estimators': 200, 'max_depth':5}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.85692664  0.84002017  0.78111598]
0.826020931105
0.0324940843419


In [368]:
params = {'n_estimators': 700, 'max_depth':4}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87746777  0.85067103  0.81381337]
0.847317391685
0.0260947728903


In [369]:
params = {'n_estimators': 1000, 'max_depth':4}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87404269  0.85084484  0.80878474]
0.844557423996
0.0270098600407


In [370]:
params = {'n_estimators': 888, 'max_depth':4}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87525401  0.85203513  0.81231287]
0.846533999411
0.0259883756135


In [316]:
params = {'n_estimators': 600, 'max_depth':2}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87354901  0.85247506  0.80706674]
0.844363602576
0.027740701252


In [317]:
params = {'n_estimators': 700, 'max_depth':2}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.8752715   0.85331158  0.81066214]
0.846415073692
0.0268236657135


In [318]:
params = {'n_estimators': 888, 'max_depth':2}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87543448  0.85565737  0.8079698 ]
0.846353880718
0.0283170974111


In [359]:
params = {'n_estimators': 1000, 'max_depth':2}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87542978  0.85562066  0.80884644]
0.846632296444
0.0279156865447


In [360]:
params = {'n_estimators': 1000, 'max_depth':3}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86786678  0.85086573  0.8031216 ]
0.840618037631
0.027407371921


In [319]:
params = {'n_estimators': 888, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86450854  0.84279917  0.85309296]
0.853466892334
0.00886675745556


In [321]:
params = {'n_estimators': 888}
scores = cross_val_score(AdaBoostRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.10925884  0.02875245  0.53539137]
0.224467551866
0.22229939982


In [322]:
params = {'n_estimators': 777}
scores = cross_val_score(AdaBoostRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.13549915  0.0656864   0.5553354 ]
0.252173650873
0.216254079293


In [325]:
params = {'n_estimators': 1000, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.86727646  0.84397538  0.85336329]
0.854871711193
0.00957223340941


In [326]:
params = {'n_estimators': 1200, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87029286  0.84592558  0.85564097]
0.857286469614
0.0100157163159


In [327]:
params = {'n_estimators': 1400, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87317326  0.84591354  0.85569217]
0.858259657546
0.0112758449


In [334]:
params = {'n_estimators': 1600, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87579618  0.84787489  0.85608119]
0.859917421588
0.0117171420647


In [329]:
params = {'n_estimators': 1800, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87795925  0.84698918  0.85831164]
0.861086693785
0.0127948432795


In [330]:
params = {'n_estimators': 2000, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.87973784  0.84722323  0.85729653]
0.861419197126
0.0135903709951


In [331]:
params = {'n_estimators': 2200, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.88126799  0.84783497  0.85781834]
0.862307101227
0.014013167848


In [332]:
params = {'n_estimators': 2400, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.8825581   0.84846721  0.85930876]
0.86344469058
0.0142215036666


In [140]:
params = {'n_estimators': 5000, 'max_depth':1}
scores = cross_val_score(GradientBoostingRegressor(**params), sample_trainX, sample_trainY)
print scores
print np.mean(scores)
print np.std(scores)

[ 0.88870327  0.84988769  0.86230754]
0.866966167533
0.0161851688886


In [336]:
params = {'n_estimators': 3000, 'max_depth':1}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
gbr.score(sample_trainX,sample_trainY)

0.9533831068471843

In [338]:
params = {'n_estimators': 1600, 'max_depth':1}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
gbr.score(sample_trainX,sample_trainY)

0.94257934298836632

In [351]:
params = {'n_estimators': 1600, 'max_depth':1}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.8209407312872649

In [352]:
params = {'n_estimators': 2400, 'max_depth':1}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.949781305332


0.8032686794685913

In [353]:
params = {'n_estimators': 3000, 'max_depth':1}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.953383106847


0.8006455656425373

In [355]:
params = {'n_estimators': 3000, 'max_depth':2}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.997025445715


0.4097701965628836

In [356]:
params = {'n_estimators': 2000, 'max_depth':2}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.994189976054


0.4917444061553089

In [357]:
params = {'n_estimators': 1000, 'max_depth':2}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.985239505045


0.5741488088999931

In [358]:
params = {'n_estimators': 1000, 'max_depth':3}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.996728555719


0.40699437687917683

In [361]:
params = {'n_estimators': 500, 'max_depth':3}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.988829646961


0.5260184266078598

In [362]:
params = {'n_estimators': 666, 'max_depth':3}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.992900930099


0.48519406829139844

In [363]:
params = {'n_estimators': 333, 'max_depth':4}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<1]=0
evaluate(sample_trainY,prediction)

0.992868356957


0.4566300980020484

In [372]:
params = {'n_estimators': 700, 'max_depth':4}
gbr = GradientBoostingRegressor(**params).fit(sample_trainX,sample_trainY)
print gbr.score(sample_trainX,sample_trainY)
prediction = pd.Series(np.floor(gbr.predict(sample_trainX)),index=sample_trainY.index)
prediction[prediction<3]=0
evaluate(sample_trainY,prediction)

0.998262600445


0.45444835015029017

In [373]:
trainX, testX, trainY, testY = train_test_split(sample_trainX,sample_trainY)

In [374]:
params = {'n_estimators': 700, 'max_depth':4}
gbr = GradientBoostingRegressor(**params).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
# prediction[prediction<3]=0
evaluate(testY,prediction)

0.998931044445


0.9628558854962641

In [376]:
prediction[prediction<1]=0
evaluate(testY,prediction)

1.103515434427381

In [377]:
params = {'n_estimators': 666, 'max_depth':3}
gbr = GradientBoostingRegressor(**params).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
# prediction[prediction<3]=0
evaluate(testY,prediction)

0.995240496178


1.014553732914814

In [378]:
params = {'n_estimators': 666, 'max_depth':3}
gbr = GradientBoostingRegressor(**params).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
prediction[prediction<3]=0
evaluate(testY,prediction)

0.995240496178


1.1812284668022275

In [389]:
params = {'n_estimators': 666, 'max_depth':3}
gbr = GradientBoostingRegressor(**params).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
prediction[prediction<200]=0
evaluate(testY,prediction)

0.995240496178


0.9685886082386577

In [390]:
prediction[prediction<300]=0
evaluate(testY,prediction)

0.9888775835229554

In [391]:
params = {'n_estimators': 666, 'max_depth':3}
gbr = GradientBoostingRegressor(**params).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
prediction[prediction<100]=0
evaluate(testY,prediction)

0.995240496178


0.9298437725185077

In [395]:
# params = {'n_estimators': 100, 'max_depth':12}
gbr = RandomForestRegressor(100, max_depth=12).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
# prediction[prediction<100]=0
evaluate(testY,prediction)

0.978959514333


0.9141660397864777

In [397]:
# params = {'n_estimators': 100, 'max_depth':12}
gbr = RandomForestRegressor(100, max_depth=12).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
prediction[prediction<3]=0
evaluate(testY,prediction)

0.977740712774


1.1239701280277377

In [401]:
# params = {'n_estimators': 100, 'max_depth':12}
gbr = RandomForestRegressor(200, max_depth=12).fit(trainX,trainY)
print gbr.score(trainX,trainY)
prediction = pd.Series(np.floor(gbr.predict(testX)),index=testY.index)
# prediction[prediction<1]=0
evaluate(testY,prediction)

0.979098134466


0.9142699088744388

In [400]:
prediction[prediction<1]

Series([], dtype: float64)