In [1]:
import pandas as pd
from sklearn import tree
import numpy as np
from sklearn.feature_extraction import DictVectorizer
import sys

# Create train, test

In [2]:
# Load the cleaned sample data; the first CSV column becomes the row index.
df = pd.read_csv(
    '../data/sample_data_cleaned.csv',
    sep=',',
    header=0,
    index_col=0,
)

# Columns fed to the model, and the name of the label column.
feature_cols = [
    'adv_vertical',
    'goal',
    'price',
    'avg_bid',
    'max_bid',
    'win_rate_site_table',
    'win_rate_strat_table',
    'hist_zscore',
    'overlap',
]
target = 'target'

# Preview the model inputs together with the label.
df[feature_cols + [target]].head()

Unnamed: 0,adv_vertical,goal,price,avg_bid,max_bid,win_rate_site_table,win_rate_strat_table,hist_zscore,overlap,target
0,Travel,0.0,3.95,2.75,300.0,0.450094,0.249479,2.708366,0.001066,0
1,Travel,0.0,3.95,2.75,300.0,0.15805,0.249479,1.188635,0.000703,0
2,Travel,0.0,3.95,2.75,300.0,0.360591,0.249479,1.503285,0.000873,0
3,Travel,0.0,3.95,2.75,300.0,0.113717,0.249479,35.153628,0.004614,0
4,Travel,0.0,3.95,2.75,300.0,0.019308,0.249479,-0.091378,0.000344,0


In [3]:
# Define the training and testing sets: each row is assigned at random,
# roughly 75% to train and 25% to test.
# A dedicated, seeded generator keeps the split identical across kernel
# restarts without touching NumPy's global random state.
split_rng = np.random.RandomState(42)
df['is_train'] = split_rng.uniform(0, 1, len(df)) > 0.25

# Select rows and columns in a single .loc call per subset instead of
# chained indexing; `~` inverts the boolean mask to pick the test rows.
x_train, y_train = df.loc[df['is_train'], feature_cols], df.loc[df['is_train'], target]
x_test, y_test = df.loc[~df['is_train'], feature_cols], df.loc[~df['is_train'], target]

In [4]:
# Vectorize the feature set and one-hot encode adv_vertical.
vec = DictVectorizer(sparse=False)

# 'records' yields one {column: value} dict per row, in row order. Going
# through a dict keyed by index label would not guarantee row order and would
# silently drop rows that share an index label, misaligning X with y.
feats_train = x_train.to_dict('records')
features_train = vec.fit_transform(feats_train)

# Only transform the test set: the feature vocabulary must be the one learned
# from the training data. Refitting here could change the number or order of
# columns whenever the test split contains a different set of categories.
feats_test = x_test.to_dict('records')
features_test = vec.transform(feats_test)

# Convert back to DataFrames so the encoded columns carry their feature names.
x_train = pd.DataFrame(features_train, columns = vec.get_feature_names())
x_test = pd.DataFrame(features_test, columns = vec.get_feature_names())

# Make decision tree

In [5]:
# Fit a depth-limited decision tree on the vectorized training features.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits could otherwise be broken
# differently from one run to the next.
clf = tree.DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=15,
    random_state=42,
).fit(x_train, y_train)

# Single-argument print(...) behaves the same as the print statement on
# Python 2 and is also valid on Python 3.
print("test accuracy: %0.3f \n" % clf.score(x_test, y_test))
print("train accuracy: %0.3f \n" % clf.score(x_train, y_train))

# One "feature: importance" pair per line. Joining on '\n' avoids the stray
# leading space that a ' ' separator put on every line after the first.
print('\n'.join(
    name + ': ' + str(importance)
    for name, importance in zip(x_train.columns, clf.feature_importances_)
))

test accuracy: 0.976 

train accuracy: 0.982 

adv_vertical=Automotive: 0.0
 adv_vertical=Finance: 0.0
 adv_vertical=Home & Garden: 0.0
 adv_vertical=Philanthropy: 0.0
 adv_vertical=Travel: 0.0
 adv_vertical=Utilities: 0.0
 avg_bid: 0.44712359503
 goal: 0.0
 hist_zscore: 0.254006475622
 max_bid: 0.293528662673
 overlap: 0.00295811499769
 price: 0.0
 win_rate_site_table: 0.0
 win_rate_strat_table: 0.00238315167777



## Plot feature importances

## Draw tree

- There are documented difficulties installing pydot
- When installing this package:
   
   $ pip install .
   
   make note of where pydotplus is installed, then add that path to Python's module search path (`sys.path`), as in the cell below:

In [6]:
import sys

# Directory where pydotplus was installed; it is added to the module search
# path so the import below can find it.
# NOTE(review): this is a machine-specific absolute path - change it to match
# your own installation.
pydotplus_dir = '/anaconda/lib/python2.7/site-packages'

# Only append when missing, so re-running this cell does not keep adding
# duplicate entries to sys.path.
if pydotplus_dir not in sys.path:
    sys.path.append(pydotplus_dir)
import pydotplus

#### Now proceed to draw the tree:

In [7]:
from sklearn.externals.six import StringIO

# Write the fitted tree as Graphviz DOT text into an in-memory buffer,
# labelling the split nodes with the training column names.
dot_data = StringIO()
tree.export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=x_train.columns,
)

# Parse the DOT text into a graph and render it to a PDF file. The write call
# is left as the last expression so the cell displays its return value.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf('../figures/draw_a_tree.pdf')

True