# Quiz: Evaluate the accuracy

In [1]:
# In this and the following exercises, you'll be adding train test splits to the data
# to see how it changes the performance of each classifier
#
# The code provided will load the Titanic dataset like you did in project 0, then train
# a decision tree (the method you used in your project) and a Bayesian classifier (as
# discussed in the introduction videos). You don't need to worry about how these work for
# now. 
#
# What you do need to do is import a train/test split, train the classifiers on the
# training data, and store the resulting accuracy scores in the dictionary provided.

import numpy as np
import pandas as pd

In [2]:
# Read the Titanic passenger records from disk
X = pd.read_csv('titanic_data.csv')
# Keep only the numeric columns
X = X._get_numeric_data()
# Pull the labels out of the inputs (pop returns the column and deletes it from X)
y = X.pop('Survived')
# Age is dropped as well, due to missing data
del X['Age']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [6]:
print '> Data'
print X[:5]
print '> Target'
print y[:5]

> Data
   PassengerId  Pclass  SibSp  Parch     Fare
0            1       3      1      0   7.2500
1            2       1      1      0  71.2833
2            3       3      0      0   7.9250
3            4       1      1      0  53.1000
4            5       3      0      0   8.0500
> Target
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [9]:
from sklearn import cross_validation as cv

In [22]:
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.

x_train, x_test, y_train, y_test = cv.train_test_split(X, y)
print x_train.shape, y_train.shape

# The decision tree classifier
clf1 = DecisionTreeClassifier()
clf1.fit(x_train, y_train)

dt_score = accuracy_score(y_test, clf1.predict(x_test))
print "Decision Tree has accuracy: ", dt_score

# The naive Bayes classifier
clf2 = GaussianNB()
clf2.fit(x_train, y_train)

nb_score = accuracy_score(y_test, clf2.predict(x_test))
print "GaussianNB has accuracy: ", nb_score

answer = { 
 "Naive Bayes Score": dt_score, 
 "Decision Tree Score": nb_score
}

(668, 5) (668L,)
Decision Tree has accuracy:  0.62331838565
GaussianNB has accuracy:  0.641255605381


# Quiz: Build a confusion matrix

In [None]:
# In this exercise, we'll use the Titanic dataset as before, train two classifiers and
# look at their confusion matrices. Your job is to create a train/test split in the data
# and report the results in the dictionary at the bottom.

import numpy as np
import pandas as pd

# Load the dataset
#from sklearn import datasets

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

In [24]:
# Load the Titanic data, keeping only its numeric columns
X = pd.read_csv('titanic_data.csv')._get_numeric_data()

# Labels come from 'Survived'; remove that column and 'Age' from the inputs
y = X['Survived']
X = X.drop(['Age', 'Survived'], axis=1)

In [28]:
# TODO: split the data into training and testing sets,
# using the default settings for train_test_split (or test_size = 0.25 if specified).
# Then, train and test the classifiers with your newly split data instead of X and y.

x_train, x_test, y_train, y_test = cv.train_test_split(X, y)
print x_train.shape, y_train.shape

clf1 = DecisionTreeClassifier()
clf1.fit(x_train,y_train)
print "Confusion matrix for this Decision Tree:\n",confusion_matrix(y_test,clf1.predict(x_test))

clf2 = GaussianNB()
clf2.fit(x_train,y_train)
print "GaussianNB confusion matrix:\n",confusion_matrix(y_test,clf2.predict(x_test))

#TODO: store the confusion matrices on the test sets below

confusions = {
 "Naive Bayes": confusion_matrix(y_test,clf2.predict(x_test)),
 "Decision Tree": confusion_matrix(y_test,clf1.predict(x_test))
}

(668, 5) (668L,)
Confusion matrix for this Decision Tree:
[[91 39]
 [46 47]]
GaussianNB confusion matrix:
[[110  20]
 [ 53  40]]


# Interpreting Confusion Matrix
Were the classifiers more likely to make mistakes that were false positives (expecting people to survive who did not survive) or false negatives (expecting people to die who did not die)?
***
False negatives were more common.
> While the difference is fairly small for decision trees, naive Bayes seems to produce far more false negatives than false positives!

# Recall vs Precision

In [31]:
# As with the previous exercises, let's look at the performance of a couple of classifiers
# on the familiar Titanic dataset. Add a train/test split, then store the results in the
# dictionary provided.

import numpy as np
import pandas as pd

# Load the dataset, restricted to its numeric columns
X = pd.read_csv('titanic_data.csv')._get_numeric_data()

# Separate the labels (pop removes 'Survived' from X), then drop 'Age' too
y = X.pop('Survived')
del X['Age']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

In [35]:
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.

x_train, x_test, y_train, y_test = cv.train_test_split(X, y)
print x_train.shape, y_train.shape

clf1 = DecisionTreeClassifier()
clf1.fit(x_train, y_train)
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall(y_test,clf1.predict(x_test)),
                                                                  precision(y_test,clf1.predict(x_test)))

clf2 = GaussianNB()
clf2.fit(x_train, y_train)
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall(y_test,clf2.predict(x_test)),
                                                               precision(y_test,clf2.predict(x_test)))

results = {
  "Naive Bayes Recall": recall(y_test,clf2.predict(x_test)),
  "Naive Bayes Precision": precision(y_test,clf2.predict(x_test)),
  "Decision Tree Recall": recall(y_test,clf1.predict(x_test)),
  "Decision Tree Precision": precision(y_test,clf1.predict(x_test))
}

(668, 5) (668L,)
Decision Tree recall: 0.54 and precision: 0.59
GaussianNB recall: 0.41 and precision: 0.73


# Compare Algorithms
In which ways is the decision tree outperforming Gaussian Naive Bayes?
***
The **Decision Tree** has a better recall, but worse precision.
> **Naive Bayes** seems to do pretty well on precision, but rather poorly on recall!

# Quiz: Compute F1 Score

In [36]:
# As usual, use a train/test split to get a reliable F1 score from two classifiers, and
# save the scores in the provided dictionary.

import numpy as np
import pandas as pd

# Load the dataset
# Read the Titanic records and keep the numeric columns only
numeric_only = pd.read_csv('titanic_data.csv')._get_numeric_data()

# 'Survived' is the label; the inputs are everything else except 'Age'
y = numeric_only['Survived']
X = numeric_only.drop(['Age', 'Survived'], axis=1)

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB

In [47]:
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.

x_train, x_test, y_train, y_test = cv.train_test_split(X, y)
print x_train.shape, y_train.shape

clf1 = DecisionTreeClassifier()
clf1.fit(x_train, y_train)
print "Decision Tree F1 score: {:.2f}".format(f1_score(y_test, clf1.predict(x_test)))

clf2 = GaussianNB()
clf2.fit(x_train, y_train)
print "GaussianNB F1 score: {:.2f}".format(f1_score(y_test, clf2.predict(x_test)))

F1_scores = {
 "Naive Bayes": f1_score(y_test, clf2.predict(x_test)),
 "Decision Tree": f1_score(y_test, clf1.predict(x_test))
}

(668, 5) (668L,)
Decision Tree F1 score: 0.51
GaussianNB F1 score: 0.45


# Quiz: Compute Mean Absolute Error

In [48]:
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

In [49]:
# Fetch the Linnerud dataset and unpack its inputs and targets
linnerud_data = load_linnerud()
X, y = linnerud_data.data, linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

In [50]:
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the regressors with your newly split data instead of X and y.

from sklearn import cross_validation as cv

x_train, x_test, y_train, y_test = cv.train_test_split(X, y)

reg1 = DecisionTreeRegressor()
reg1.fit(x_train, y_train)
print "Decision Tree mean absolute error: {:.2f}".format(mae(y_test,reg1.predict(x_test)))

reg2 = LinearRegression()
reg2.fit(x_train, y_train)
print "Linear regression mean absolute error: {:.2f}".format(mae(y_test,reg2.predict(x_test)))

results = {
 "Linear Regression": mae(y_test,reg2.predict(x_test)),
 "Decision Tree": mae(y_test,reg1.predict(x_test))
}

Decision Tree mean absolute error: 14.27
Linear regression mean absolute error: 10.19


# Quiz: Compute Mean Square Error

In [51]:
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

# Load Linnerud; inputs come from .data and the regression targets from .target
linnerud_data = load_linnerud()
X, y = linnerud_data.data, linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression

In [52]:
# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the regressors with your newly split data instead of X and y.

from sklearn import cross_validation as cv

x_train, x_test, y_train, y_test = cv.train_test_split(X, y)

reg1 = DecisionTreeRegressor()
reg1.fit(x_train, y_train)
print "Decision Tree mean absolute error: {:.2f}".format(mse(y_test, reg1.predict(x_test)))

reg2 = LinearRegression()
reg2.fit(x_train, y_train)
print "Linear regression mean absolute error: {:.2f}".format(mse(y_test, reg2.predict(x_test)))

results = {
 "Linear Regression": mse(y_test, reg2.predict(x_test)),
 "Decision Tree": mse(y_test, reg1.predict(x_test))
}

Decision Tree mean absolute error: 81.27
Linear regression mean absolute error: 611.53
