# SLU17 - Data Sufficiency and Selection

In this notebook you will cover the following:

- Feature Importance
- Single Factor Analysis
- Learning Curves

In [None]:
import math
import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
from base64 import b64encode, b64decode # just for grading purposes

%matplotlib inline

In [None]:
# Read the exercise classification dataset from disk: X receives the
# feature columns, y the 'label' column of the separate target file.
X = pd.read_csv('data/exercise_X.csv')
y = pd.read_csv('data/exercise_y.csv').loc[:, 'label']

# Preview the first rows of the feature frame.
X.head()

In [None]:
# Count how many rows carry each target value; judging by the counts this
# looks like a balanced binary target
y.value_counts()

# Find the first obviously useless feature

Can you determine which feature contains only unique values and therefore cannot have any predictive power?

In [None]:
# Use this cell to determine which of the features behaves like a categorical
# feature but contains only unique values


In [None]:
# set the variable feature_all_unique to the name of the feature
# that contains all uniques
# (the name must be a string: the test cell that follows calls .encode() on it)
feature_all_unique = None

# YOUR CODE HERE
# Replace the raise below with your answer; as long as it is there the cell
# stops with NotImplementedError.
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# The expected feature name is stored base64-encoded, so the answer is
# encoded the same way before the two byte strings are compared.
assert b64encode(feature_all_unique.encode()) == b'aWQ='
### END TESTS

# Find the second obviously useless feature

This one doesn't contain only unique values, but with some Single Factor Analysis you should be able to determine which feature isn't worth
bothering with.

In [None]:
# use this cell to do some more SFA on other features to determine
# which of them is useless


In [None]:
# Set the variable other_useless_feature to the name of the other obviously
# useless feature
# (the name must be a string: the test cell that follows calls .encode() on it)
other_useless_feature = None

# YOUR CODE HERE
# Replace the raise below with your answer; as long as it is there the cell
# stops with NotImplementedError.
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# The expected feature name is stored base64-encoded, so the answer is
# encoded the same way before the two byte strings are compared.
assert b64encode(other_useless_feature.encode()) == b'ZmVhdHVyZV84'
### END TESTS

In [None]:
# now drop the features that you determined to be useless
# (the test cell that follows inspects X.columns, so the reduced set of
# columns has to end up in X itself)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# The sorted column names of X are rendered as a string and base64-encoded,
# then compared with the encoded expected column list.
assert b64encode(str(sorted(X.columns)).encode()) == b'WydmZWF0dXJlXzAnLCAnZmVhdHVyZV8xJywgJ2ZlYXR1cmVfMicsICdmZWF0dXJlXzMnLCAnZmVhdHVyZV80JywgJ2ZlYXR1cmVfNScsICdmZWF0dXJlXzYnLCAnZmVhdHVyZV83J10='
### END TESTS

# Find the rest of the useless features

Single Factor Analysis isn't likely to help us much in determining
which of the remaining features are useless. We'll need to use feature importances in order to find the rest of these bad boys.

In [None]:
# in order to use feature importances you will need to import a tree
# based algorithm. Let's use a decision tree classifier

# import your decision tree classifier here the exact same way that
# it is done in the learning notebooks
# (later cells refer to it by the name DecisionTreeClassifier, so that exact
# name has to be available after this cell runs)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# Look the name up through globals() instead of referencing it directly: a
# bare reference raises NameError when the import is missing, which would
# hide the hint in the assertion message in exactly the case it is meant for.
assert globals().get('DecisionTreeClassifier'), 'Import that bad boy from the sklearn.tree module'
### END TESTS

In [None]:
# Now let's train the classifier and get the feature importances

# Create your classifier, assign it to the clf variable and then
# train it on X and y
clf = None

# once the classifier is trained, set the feature importances here
# make it a pandas series with the index being the column names
# so that we can visualize the results
# (the test cell that follows compares it with
# pd.Series(clf.feature_importances_, index=X.columns))
feature_importances = None

# set the random_state=1 and max_depth=5 or the tests won't pass!
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
assert isinstance(clf, sklearn.tree.DecisionTreeClassifier), 'The classifier must be a DecisionTreeClassifier'
assert clf.random_state == 1, 'random_state must be 1'
assert clf.max_depth == 5, 'max_depth must be 5'
# Check the type before calling .equals(): None or a raw numpy array has no
# .equals method, so without the guard the cell would stop with an
# AttributeError instead of showing the explanatory message below.
assert isinstance(feature_importances, pd.Series) and feature_importances.equals(pd.Series(clf.feature_importances_, index=X.columns)), 'feature_importances must be a pandas series with the X columns as the index'
### END TESTS

In [None]:
# Horizontal bar chart with one bar per entry of feature_importances; the
# trailing semicolon keeps the Axes repr out of the cell output.
feature_importances.plot(kind='barh');

In [None]:
# now remove 3 additional useless features
# (the test cell that follows inspects X.columns, so the reduced set of
# columns has to end up in X itself)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# The sorted column names of X are rendered as a string and base64-encoded,
# then compared with the encoded expected column list.
assert b64encode(str(sorted(X.columns)).encode()) == b'WydmZWF0dXJlXzInLCAnZmVhdHVyZV8zJywgJ2ZlYXR1cmVfNCcsICdmZWF0dXJlXzYnLCAnZmVhdHVyZV83J10='
### END TESTS

# The learning curve

Okay, now that we have gotten rid of all those useless features, let's focus on getting a sense of how much data we need in order to have
reasonable performance.

In [None]:
# Now compute the learning curve of the classifier below in order to help
# us understand how increasing amounts of data affect the performance.
# The following cells sum train_scores_mean and test_scores_mean and plot
# them against train_sizes, so all three variables must be set here.

# HINT: just use the snippet from the Learning Notebook
# train_sizes is listed with the other placeholders because the plotting
# cell further down uses it as the index of the learning curve dataframe
train_sizes = None
train_scores_mean = None
test_scores_mean = None

# instantiate the classifier whose learning curve you will inspect
clf = DecisionTreeClassifier(max_depth=5, random_state=1)

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Display the sum of train_scores_mean; the test cell further down asserts
# on this same sum.
sum(train_scores_mean)

In [None]:
# Display the sum of test_scores_mean; the test cell further down asserts
# on this same sum.
sum(test_scores_mean)

In [None]:
### BEGIN TESTS
# compare with a 1% relative tolerance (rel_tol=1e-2) in order to
# compensate for implementation details

assert math.isclose(sum(train_scores_mean), 4.5, rel_tol=1e-2)
assert math.isclose(sum(test_scores_mean), 3.8, rel_tol=1e-2)
### END TESTS

In [None]:
# Put both score curves into one frame indexed by train_sizes, one column
# per curve, and draw them as lines on a shared plot.
learning_curve_df = pd.DataFrame(
    data={'Training Scores': train_scores_mean, 'Test Set scores': test_scores_mean},
    index=train_sizes,
)

# The trailing semicolon keeps the Axes repr out of the cell output.
learning_curve_df.plot.line(title='Decision Tree Learning Curve');

In [None]:
# Now select the minimum training set size that this particular classifier
# seems to need before its learning rate stabilizes
# (the test cell that follows compares the value numerically against an
# upper bound, so it has to be a number)

min_train_set_size = None

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
### BEGIN TESTS
# The upper bound is stored base64-encoded: b64decode returns the bytes of a
# decimal literal, which int() parses directly into the number to compare to.
assert min_train_set_size <= int(b64decode(b'MzAw'))
### END TESTS