In [1]:
import pydot
import pandas as pd
from sklearn.externals.six import StringIO 
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
#from sklearn.ensemble import RandomForestClassifier

### Tree / Forest Challenges  

You can examine the decision paths of an `sklearn` tree by generating `pydot` graphs as in the `sklearn` [documentation](http://scikit-learn.org/stable/modules/tree.html). It's sometimes tricky to get `pydot` working; see below for a possible install plan.  


### Challenge 1  

For the House of Representatives voting-records data set, fit and evaluate a decision tree classifier. Examine the rules your tree uses.  

In [2]:
# Column labels supplied to read_csv: the party label followed by V1..V16.
col_names = ["Party"] + ["V%d" % i for i in range(1, 17)]
# Parenthesised single-argument print works under both Python 2 and Python 3.
print(col_names)
# "?" entries in the raw file are read as NaN.
votes = pd.read_csv("house-votes-84.data", sep=",", na_values="?", names=col_names)
# Encode the vote strings numerically: "y" -> 1, "n" -> 0.
votes = votes.replace(["y", "n"], [1, 0])
# Fill each missing vote with that column's mean. numeric_only=True keeps the
# string-valued "Party" column out of the mean computation, which otherwise
# raises a TypeError on pandas versions that no longer drop such columns silently.
votes = votes.fillna(votes.mean(numeric_only=True))
votes.head()

['Party', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16']


Unnamed: 0,Party,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
0,republican,0.0,1,0,1.0,1.0,1,0,0,0,1,0.362319,1.0,1,1,0,1.0
1,republican,0.0,1,0,1.0,1.0,1,0,0,0,0,0.0,1.0,1,1,0,0.812689
2,democrat,0.44208,1,1,0.417453,1.0,1,0,0,0,0,1.0,0.0,1,1,0,0.0
3,democrat,0.0,1,1,0.0,0.504762,1,0,0,0,0,1.0,0.0,1,0,0,1.0
4,democrat,1.0,1,1,0.0,1.0,1,0,0,0,0,1.0,0.423267,1,1,1,1.0


In [3]:
# Target is the party label; features are every column after the first one.
y = votes["Party"]
X = votes[votes.columns[1:]]

# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2,
)

In [4]:
# Fit a tree with all-default hyperparameters on the training split.
# fit() returns the estimator, so the chained call leaves the fitted model in `dectree`.
dectree = DecisionTreeClassifier().fit(X_train, y_train)
# Bare expression so the cell still displays the estimator's repr.
dectree

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [5]:
# Dump the fitted house-votes tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
tree.export_graphviz(dectree, out_file=dot_data)

# Parse the DOT text with pydot and render it to a PDF in the working directory.
dot_source = dot_data.getvalue()
graph = pydot.graph_from_dot_data(dot_source)
graph.write_pdf("party.pdf")

True

### Challenge 2  

Fit and evaluate a decision tree classifier for your movie dataset. Examine the rules your tree uses.


In [6]:
# Read the movie table and keep only rows with no missing value in any column.
movies = pd.read_csv("2013_movies.csv", sep=",")
moviesb = movies.dropna()

# Features are three columns of the cleaned frame; the target is the "Rating" column.
movie_features = ["Budget", "DomesticTotalGross", "Runtime"]
ym = moviesb["Rating"]
Xm = moviesb[movie_features]

# Same 75/25 split and seed as used for the votes data.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm,
    ym,
    test_size=0.25,
    random_state=2,
)

In [7]:
# Fit a default-parameter tree on the movie training split.
# fit() returns the estimator, so `movietree` holds the fitted model.
movietree = DecisionTreeClassifier().fit(Xm_train, ym_train)
# Bare expression so the cell still displays the estimator's repr.
movietree

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [8]:
# Dump the fitted movie tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
tree.export_graphviz(movietree, out_file=dot_data)

# Parse the DOT text with pydot and render it to a PDF in the working directory.
dot_source = dot_data.getvalue()
graph = pydot.graph_from_dot_data(dot_source)
graph.write_pdf("movie.pdf")

True

### Challenge 3 (Optional but recommended)  

Tackle the [Titanic survivors Kaggle competition](https://www.kaggle.com/c/titanic-gettingStarted) with decision trees. Look at your splits; how does your tree decide?


In [9]:
# Load the training file and discard every row that has a missing value in any column.
# NOTE(review): dropna() is not restricted to the columns used for modelling, so
# rows lacking a value only in unused columns are removed too (the surviving index
# labels shown by head() are 1, 3, 6, 10, 11). Consider dropna(subset=[...]) if
# more training rows are wanted.
titanic = pd.read_csv("titanictrain.csv", sep=",").dropna()
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S


In [10]:
# One-hot encode the "Sex" column, then keep only the "female" indicator column.
sex_dummies = pd.get_dummies(titanic["Sex"])
sex2 = sex_dummies["female"]
sex2.head()

1     1
3     1
6     0
10    1
11    1
Name: female, dtype: float64

In [11]:
# Attach the "female" indicator series (sex2) to the frame as a new "Sex10" column.
# This assigns into the existing `titanic` frame rather than building a new one.
titanic["Sex10"] = sex2
# Preview the frame to confirm the new column is present.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex10
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S,0
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S,1
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S,1


In [12]:
# Columns fed to the tree; "Sex10" is the numeric stand-in for the text "Sex" column.
titanic_features = ["Pclass", "Sex10", "Age", "SibSp", "Parch"]

# Target is the "Survived" column; no hold-out split is made in this cell.
yt_train = titanic["Survived"]
Xt_train = titanic[titanic_features]
Xt_train.head()

Unnamed: 0,Pclass,Sex10,Age,SibSp,Parch
1,1,1,38,1,0
3,1,1,35,1,0
6,1,0,54,0,0
10,3,1,4,1,1
11,1,1,58,0,0


In [13]:
# Fit a default-parameter tree on the selected Titanic feature columns.
# fit() returns the estimator, so `titanictree` holds the fitted model.
titanictree = DecisionTreeClassifier().fit(Xt_train, yt_train)
# Bare expression so the cell still displays the estimator's repr.
titanictree

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [14]:
# Dump the fitted Titanic tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
tree.export_graphviz(titanictree, out_file=dot_data)

# Parse the DOT text with pydot and render it to a PDF in the working directory.
dot_source = dot_data.getvalue()
graph = pydot.graph_from_dot_data(dot_source)
graph.write_pdf("titanic.pdf")

True