# SLU17 - Data Sufficiency and Selection

In this notebook you will cover the following:

- Feature Importance
- Single Factor Analysis
- Learning Curves

In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
from base64 import b64encode # just for grading purposes

%matplotlib inline

In [4]:
# load up a classification dataset

X = pd.read_csv('data/exercise_X.csv')
y = pd.read_csv('data/exercise_y.csv')['label']
# give X a quick look
X.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,id
0,1.588078,0.078159,-3.352761,-0.456998,-1.53705,-1.54099,0.697395,-2.263426,Meerkat,0
1,0.600816,0.862691,-0.282597,2.212205,-0.167844,-0.530334,0.34419,0.116734,Meerkat,1
2,1.322777,1.686777,-2.026925,2.669834,-0.977835,-2.636982,-1.325553,0.698066,Meerkat,2
3,-0.617705,0.211531,1.131231,0.441285,-0.237977,0.147819,0.901299,1.14688,Meerkat,3
4,1.368048,0.847212,-2.562404,1.410269,0.123509,-1.814067,-0.561118,-1.243117,Meerkat,4


In [5]:
# looks like a balanced binary target
y.value_counts()

0    502
1    498
Name: label, dtype: int64

# Find the first obviously useless feature

Can you determine which of the features contains all uniques and therefore cannot have any predictive power?

In [6]:
# Use this cell to determine which of the features serves as a categorical
# feature and contains all uniques


In [7]:
# set the variable feature_all_unique to the name of the feature
# that contains all uniques
feature_all_unique = None

# YOUR CODE HERE
feature_all_unique = 'id'

In [8]:
### BEGIN TESTS
assert b64encode(feature_all_unique.encode()) == b'aWQ='
### END TESTS

# Find the second obviously useless feature

This one doesn't contain all uniques but based upon some Single Factor Analysis you should be able to determine which feature isn't worth
bothering with.

In [None]:
# use this cell to do some more SFA on other features to determine
# which of them is useless


In [9]:
# Use this cell to determine the other obviously useless feature
other_useless_feature = None

# YOUR CODE HERE
other_useless_feature = 'feature_8'

In [10]:
### BEGIN TESTS
assert b64encode(other_useless_feature.encode()) == b'ZmVhdHVyZV84'
### END TESTS

In [11]:
# now drop the features that you determined to be useless

# YOUR CODE HERE
X = X.drop(['id', 'feature_8'], axis=1)

In [12]:
### BEGIN TESTS
assert b64encode(str(sorted(X.columns)).encode()) == b'WydmZWF0dXJlXzAnLCAnZmVhdHVyZV8xJywgJ2ZlYXR1cmVfMicsICdmZWF0dXJlXzMnLCAnZmVhdHVyZV80JywgJ2ZlYXR1cmVfNScsICdmZWF0dXJlXzYnLCAnZmVhdHVyZV83J10='
### END TESTS

# Find the rest of the useless features

Single Factor Analysis isn't likely to do much in helping us to determine
which of the rest of the features are useless. We'll need to use feature_importances in order to find the rest of these bad boys

In [13]:
# in order to use feature importances you will need to import a tree
# based algorithm. Let's use a decision tree classifier

# import your decision tree classifier here the exact same way that
# it is done in the learning notebooks

# YOUR CODE HERE
from sklearn.tree import DecisionTreeClassifier

In [14]:
### BEGIN TESTS
assert b64encode(str(DecisionTreeClassifier).encode()) == b'PGNsYXNzICdza2xlYXJuLnRyZWUudHJlZS5EZWNpc2lvblRyZWVDbGFzc2lmaWVyJz4='
### END TESTS

In [17]:
# Now let's train the classifier and get the feature importances

# Create your classifier, assign it to the clf variable and then
# train it on X and y
clf = None

# once the classifier is trained, set the feature importances here
# make it a pandas series with the index being the column names
# so that we can visualize the results
feature_importances = None

# set the random_state=1 and max_depth=5 or the tests won't pass!
# YOUR CODE HERE
clf = DecisionTreeClassifier(random_state=1, max_depth=5)
clf.fit(X, y)
feature_importances = pd.Series(clf.feature_importances_)

In [18]:
### BEGIN TESTS
assert isinstance(clf, sklearn.tree.DecisionTreeClassifier), 'The classifier must be a DecisionTreeClassifier'
assert clf.random_state == 1, 'random_state must be 1'
assert clf.max_depth == 5, 'max_depth must be 5'
assert feature_importances.equals(pd.Series(clf.feature_importances_, index=X.columns)), 'feature_importances must be a pandas series with the X columns as the index'
### END TESTS

AttributeError: 'numpy.ndarray' object has no attribute 'equals'

In [None]:
feature_importances.plot.barh();

In [None]:
# now remove 3 additional useless features

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert b64encode(str(sorted(X.columns)).encode()) == b'WydmZWF0dXJlXzInLCAnZmVhdHVyZV8zJywgJ2ZlYXR1cmVfNCcsICdmZWF0dXJlXzYnLCAnZmVhdHVyZV83J10='
### END TESTS

# The learning curve

Okay now that we have gotten rid of all those useless features, let's focus on getting a sense for how much data we need in order to have
reasonable performance.

In [None]:
# Now create a dataframe that has a single feature that is the
# cross validation score in order to help us understand
# how increasing amounts of data affect the performance

# HINT: just use the snippet from the Learning Notebook
cross_val_scores = None

# instantiate a classifier that you will inspect the learning rate of
clf = DecisionTreeClassifier(max_depth=5, random_state=1)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# round in order to compensate for implementation details
expected = [round(x, 3) for x in [
    0.7139696510593271,
    0.7659680638722555,
    0.7669600557920303,
    0.7719741240410745,
    0.7729961763220548
]]

assert cross_val_scores is not None, 'Did you forget to set the cross_val_scores variable?'
assert 'Cross-validation Score' in cross_val_scores.columns, 'Did you forget'
actual = [round(x, 3) for x in cross_val_scores['Cross-validation Score']]
assert bool(actual == expected), 'Are your numbers off?'
### END TESTS

In [None]:
cross_val_scores.plot.line(
    title='Decision Tree Learning Curve',
    ylim=(0.7, 0.8)
);

In [None]:
# Now select the minimum training set size that this particular classifier
# seems to need in order to maximize it's learning from this dataset.

min_train_set_size = None

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert min_train_set_size <= int(b64decode(b'MzAw'))
### END TESTS