In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
import pandas as pd

In [9]:
#Import data
X = pd.read_csv('train.csv')
Y = X.pop('Survived')

In [10]:
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
#Impute Age with mean
X["Age"].fillna(X.Age.mean(), inplace=True)
#Confirm code is correct
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,22.0,0.0,0.0,7.9104
50%,446.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,3.0,35.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
#Get numberic variables by selecting variables that are not object data types
numeric_variables = list(X.dtypes[X.dtypes != "object"].index)
X[numeric_variables].head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.0,1,0,7.25
1,2,1,38.0,1,0,71.2833
2,3,3,26.0,0,0,7.925
3,4,1,35.0,1,0,53.1
4,5,3,35.0,0,0,8.05


In [14]:
'''Lets build our first model. Set the out of bounce score to True. Its a good idea to increase n estimators to a 
number higher than the default. In this case the oob predictions will be based on a forest of 33 trees.
I set random state to 42 so you all replicate the model excatly'''
model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)

#I only use numeric variables because I have yet to dummy out the categorical variables 
model.fit(X[numeric_variables], Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=42,
           verbose=0, warm_start=False)

In [15]:
'''For regression, the oob score attribute gives the R^2 based on the oob predictions.
We want to use c-stat, but I mentioned its for awareness. By the way attributes in sklearn that have a trailing 
underscore are only available after the the model has been'''
model.oob_score_

0.1361695005913669

In [41]:
y_oob = model.oob_prediction_
print "c-stat: '', roc_auc_score(y, y_oob)

SyntaxError: EOL while scanning string literal (<ipython-input-41-6c218288cd24>, line 2)

In [45]:
'Here is a simple function to show descriptive stats on the categorical variables'
def describe_categorical(X):
    """
    Just like .describe(), but returns the results for
    categorical variables only
    """
    from IPython.display import display, HTML
    display(HTML(X[X.columns[X.dytpes == 'object']].describe().to_html()))
    

In [46]:
describe_categorical(X)

AttributeError: 'DataFrame' object has no attribute 'dytpes'

In [47]:
#Drop the variables I don't feel like dealing with for this tutorial
X.drop(["Name", "Ticket", "PassengerId"], axis=1, inplace=True)

In [48]:
#Change the Cablin variable to be only the first letter or None
def clean_cabin(x):
    try:
        return x[0]
    except TypeError:
        return "None"
    
X["Cabin"] = X.Cabin.apply(clean_cabin)

In [None]:
categorical_variables = ['Sex', 'Cabin', 'Embarked']

for variable in categorical_variables:
    #Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    #Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    #Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

In [50]:
#Look at all the colums in the dataset
def printall(X, max_rows=10):
    from IPython.display import display, HTML
    display(HTML(X.to_html(max_rows=max_rows)))
    
printall(X)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.000000,1,0,7.2500,,S
1,1,female,38.000000,1,0,71.2833,C,C
2,3,female,26.000000,0,0,7.9250,,S
3,1,female,35.000000,1,0,53.1000,C,S
4,3,male,35.000000,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.000000,0,0,13.0000,,S
887,1,female,19.000000,0,0,30.0000,B,S
888,3,female,29.699118,1,2,23.4500,,S
889,1,male,26.000000,0,0,30.0000,C,C


In [52]:
model = RandomForestRegressor(100, oob_score=True, n_jobs=-1, random_state=42)
model.fit(X, y)
print "C-state:", roc_auc_score(y, model.oob_prediction_)

SyntaxError: invalid syntax (<ipython-input-52-e246022a6a0f>, line 3)

In [53]:
model.feature_importances_

array([ 0.31538584,  0.08292152,  0.233442  ,  0.04874595,  0.03235287,
        0.28715181])

In [55]:
# Simple version that shows all of the variables
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort()
feature_importances.plot(kind="barh", figsize=(7,6));

ValueError: Wrong number of items passed 6, placement implies 8