In [28]:
import pandas as pd
import numpy as np
from sklearn import tree, ensemble
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score, KFold
import sys

In [29]:
df = pd.read_csv('../data/sample_data_cleaned.csv', sep=',', header=0, index_col=0)

features = ['site_id', 'strategy_id', 'name', 'goal', 'price', 'avg_bid', 'max_bid', 'win_rate_site', 'win_rate_strat', 'hist_zscore', 'overlap', 'target']
df_subset = df[features]

#define training and testing set
df['is_train'] = np.random.uniform(0,1, len(df)) > 0.25
train = df[df['is_train']==True][features]
test = df[df['is_train']==False][features]

#vectorize the feature set and one-hot encode adv_vertical
vec = DictVectorizer(sparse=False)
feats_train = train.T.to_dict().values()
features_train = vec.fit_transform(feats_train)
feats_test = test.T.to_dict().values()
features_test = vec.fit_transform(feats_test)

#convert back to dataframe and check for null values
train = pd.DataFrame(features_train, columns = vec.get_feature_names())
test = pd.DataFrame(features_test, columns = vec.get_feature_names())

x_train = train.drop(['target'], axis = 1)
y_train = train['target']
x_test = test.drop(['target'], axis = 1)
y_test = test['target']

In [30]:
rf = ensemble.RandomForestClassifier(max_features='sqrt', max_depth=6, min_samples_leaf=20, n_estimators=100, oob_score=True).fit(x_train, y_train)
print "test accuracy: %0.3f \n" % rf.score(x_test, y_test)
print "train accuracy: %0.3f \n" % rf.score(x_train, y_train)
print "oob score: %0.3f \n" % rf.oob_score_
print ' '.join([x[0] + ': ' + str(x[1]) + '\n' for x in zip(x_train.columns, rf.feature_importances_)])
print "cross-validation scores: %s \n" % ', '.join([str(x) for x in cross_val_score(rf, x_test,y_test)])

test accuracy: 0.986 

train accuracy: 0.969 

oob score: 0.969 

avg_bid: 0.0578468762968
 goal: 0.0244124363884
 hist_zscore: 0.372561468687
 max_bid: 0.0190890046501
 name=Andrew: 0.0138828653585
 name=Edward: 0.0759293160668
 name=Ian: 0.00107545524508
 name=Josie: 0.00289150256529
 name=Lori: 0.0
 name=Nicole: 9.74055809528e-05
 name=Travis: 0.00260494499071
 overlap: 0.257049894484
 price: 0.0516344433055
 site_id: 0.0270675665029
 strategy_id: 0.053724778798
 win_rate_site: 0.018005041491
 win_rate_strat: 0.0221269995894

cross-validation scores: 0.982456140351, 0.988235294118, 0.988235294118 



In [36]:
kf = KFold(n_splits=5, shuffle=True)

'''
#vectorize the feature set and hot encode adv_vertical
vec = DictVectorizer(sparse=False)
feats= df_subset.T.to_dict().values()
features_coded = vec.fit_transform(feats)

#convert back to dataframe and check for null values
features_df = pd.DataFrame(features_coded, columns = vec.get_feature_names())'''
df_subset.dropna(inplace=True)

for train_index, test_index in kf.split(df_subset):
    x_train, x_test = features_df.loc[train_index, feature_cols], features_df.loc[test_index, feature_cols]
    y_train, y_test = features_df.loc[train_index, 'target'], features_df.loc[test_index, 'target']
    
    rf = ensemble.RandomForestClassifier(max_features='sqrt', max_depth=6, min_samples_leaf=20, n_estimators=100, oob_score=True).fit(x_train, y_train)
    print "test accuracy: %0.3f \n" % rf.score(x_test, y_test)
    print "train accuracy: %0.3f \n" % rf.score(x_train, y_train)
    print "oob score: %0.3f \n" % rf.oob_score_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
feature_importances = []
for i, est_tree in enumerate(rf.estimators_):
    feature_importances.append(numpy.append(est_tree.feature_importances_, est_tree.score(x_test, y_test)))

fig = figure(1, figsize=(9,6))
ax = fig.add_subplot(111)
bp = ax.boxplot(imp_by_feat)
_ = ax.set_xticklabels(features, rotation=90)