# 1: Overview Of The Data Set

In [50]:
import pandas as pd
income = pd.read_csv("income - income.csv", index_col=False)
print(income.head(5))

   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country high_income  
0          2174             0              40  United-States       <=50K  
1             0             0   

In [51]:
income.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# 2: Converting Categorical Variables

Instructions
Convert the rest of the categorical columns in income (education, marital_status, occupation, relationship, race, sex, native_country, and high_income) to numeric columns.
Hint
You can use a for loop to iterate through the columns and convert the values.

In [52]:
# convert a single column from text categories to numbers
col = pd.Categorical.from_array(income["workclass"])
income["workclass"] = col.codes
print(income["workclass"].head(5))

0    7
1    6
2    4
3    4
4    4
Name: workclass, dtype: int8


In [53]:
income["workclass"].head()

0    7
1    6
2    4
3    4
4    4
Name: workclass, dtype: int8

In [54]:
for name in["education", "marital_status","occupation", "relationship", "race", "sex","native_country","high_income"]:
    col = pd.Categorical.from_array(income[name])
    income[name] = col.codes
print(col)

[<=50K, <=50K, <=50K, <=50K, <=50K, ..., <=50K, >50K, <=50K, <=50K, >50K]
Length: 32561
Categories (2, object): [<=50K, >50K]


# 3: Creating Splits

In [55]:
private_incomes = income[income["workclass"] == 4]
public_incomes = income[income["workclass"] != 4]

# 4: Overview Of Data Set Entropy

In [56]:
import math
entropy = -(2/5 * math.log(2/5, 2) + 3/5 * math.log(3/5, 2))
print("The entropy is", entropy)

The entropy is 0.9709505944546686


In [57]:
income["high_income"].value_counts()

0    24720
1     7841
Name: high_income, dtype: int64

In [58]:
income_entropy = -(24720/32561 * math.log(24720/32561, 2) + 7841/32561 * math.log(7841/32561, 2))
print("The income entropy is", income_entropy)

The income entropy is 0.7963839552022132


In [59]:
prob_0 = income[income["high_income"] == 0].shape[0] / income.shape[0]
prob_1 = income[income["high_income"] == 1].shape[0] / income.shape[0]

income_entropy = -(prob_0 * math.log(prob_0, 2) + prob_1 * math.log(prob_1, 2))
print("The income entropy is", income_entropy)

The income entropy is 0.7963839552022132


# 5: Information Gain

Instructions
Compute the information gain for splitting on the age column of income.
First, compute the median of age.
Then, assign anything less than or equal to the median to the left branch, and anything greater than the median to the right branch.
Compute the information gain and assign it to age_information_gain.
Hint
The calc_entropy() function accepts either a list or a series.

In [60]:
import numpy as np

def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
    """
    counts = np.bincount(column)
    probabilities = counts / len(column)
    entropy = 0 
    
    for prob in probabilities:
        if prob > 0:
            entropy += prob * math.log(prob, 2)
            
    return -entropy

entropy = calc_entropy([1,1,0,0,1])
print(entropy)

information_gain = entropy - ((.8 * calc_entropy([1,1,0,0]))+ (.2 * calc_entropy([1])))
print("The information gain", information_gain)

0.970950594455
The information gain 0.170950594455


In [61]:
income_entropy = calc_entropy(income["high_income"])

median_age = income["age"].median()

left_split = income[income["age"] <= median_age]
right_split = income[income["age"] > median_age]

age_information_gain = income_entropy - ((left_split.shape[0] / income.shape[0]) * calc_entropy(left_split["high_income"]) + ((right_split.shape[0] / income.shape[0]) * calc_entropy(right_split["high_income"])))
print("The age information gain is", age_information_gain)

The age information gain is 0.0470286613047


# 6: Finding The Best Split

Instructions
Create a list called information_gains.
It should contain, in order, the information gain from splitting on these columns: age, workclass, education_num, marital_status, occupation, relationship, race, sex, hours_per_week, native_country.
Find the highest value in the information_gains list, and assign the corresponding column name to highest_gain.
Hint
Follow the steps we completed on the last screen to calculate information gain.

In [62]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    orginal_entropy = calc_entropy(data[target_name])
    column = data[split_name]
    median = column.median()
    left_split = data[column <= median]
    right_split = data[column > median]
    
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0])
        to_subtract += prob * calc_entropy(subset[target_name])
        return orginal_entropy - to_subtract
    
print(calc_information_gain(income, "age", "high_income"))

columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

information_gains = []

for col in columns:
    information_gain = calc_information_gain(income, col, "high_income")
    information_gains.append(information_gain)

highest_gain_index = information_gains.index(max(information_gains))
highest_gain = columns[highest_gain_index]

0.503201263845


# Building a Decision Tree

# 1: Determining The Column To Split On

Instructions
Write a function named find_best_column() that returns the name of a column to split the data on. We've started to define this function for you.
Use find_best_column() to find the best column on which to split income.
The target is the high_income column, and the potential columns to split with are in the list columns below.
Assign the result to income_split.
Hint
You can loop through each column in columns, compute information gain, and then return the name of the highest-scoring column.

In [63]:
def find_best_column(data, target_name, columns):
    information_gains = []
    for col in columns:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)
        highest_gain_index = information_gains.index(max(information_gains))
        highest_gain = columns[highest_gain_index]
        return highest_gain

income_split = find_best_column(income, "highest_income", columns)
print("The best column to split the data on is", income_split)

The best column to split the data on is age


# 2: Creating A Simple Recursive Algorithm

In [64]:
label_1s = []
label_0s = []

def id3(data, target, columns):
    unique_targets = pd.unique(data[target])
    if len(unique_targets) == 1:
        if 0 in unique_targets:
            label_0s.append(0)
        
        elif 1 in unique_targets:
            label_1s.append(1)
        
        return 
    
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    
    for split in [left_split, right_split]:
        id3(split, target, columns)
        
data = pd.DataFrame([
    [0,20,0],
    [0,60,2],
    [0,40,1],
    [1,25,1],
    [1,35,2],
    [1,55,1]
    ])

data.columns = ["high_income", "age", "marital_status"]
id3(data, "high_income", ["age", "marital_status"])
print(label_1s)
print(label_0s)
print(data)
print(columns)

[1, 1, 1]
[0, 0, 0]
   high_income  age  marital_status
0            0   20               0
1            0   60               2
2            0   40               1
3            1   25               1
4            1   35               2
5            1   55               1
['age', 'workclass', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'native_country']


# 3: Storing The Tree

In [65]:
tree = {}

nodes = []

def id3(data, target, columns, tree):
    unique_targets = pd.unique(data[target])
    
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]
    
    if len(unique_targets) == 1:
        tree["number"] = nodes[-1]
        if len(unique_targets) == 1:
            if 0 in unique_targets:
                tree["label"] = 0
            elif 1 in unique_targets:
                tree["label"] = 1
        return 
                        
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    tree["column"] = best_column
    tree["median"] = column_median
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    
    split_dict = [["left", left_split], ["right", right_split]]
    
    for name, split in split_dict:
            tree[name] = {}
            id3(split, target, columns, tree[name])
            
id3(data, "high_income", ["age", "marital_status"], tree)
print(tree)
print(nodes)
print(columns)
print(data)
print(name)

{'left': {'left': {'left': {'number': 4, 'label': 0}, 'column': 'age', 'number': 3, 'right': {'number': 5, 'label': 1}, 'median': 22.5}, 'column': 'age', 'number': 2, 'right': {'number': 6, 'label': 1}, 'median': 25.0}, 'column': 'age', 'number': 1, 'right': {'left': {'left': {'number': 9, 'label': 0}, 'column': 'age', 'number': 8, 'right': {'number': 10, 'label': 1}, 'median': 47.5}, 'column': 'age', 'number': 7, 'right': {'number': 11, 'label': 0}, 'median': 55.0}, 'median': 37.5}
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
['age', 'workclass', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'native_country']
   high_income  age  marital_status
0            0   20               0
1            0   60               2
2            0   40               1
3            1   25               1
4            1   35               2
5            1   55               1
high_income


# 4: Printing Labels For A More Attractive Tree

Instructions
Fill in the gaps in the print_node() function that say "Insert code here...".
Your code should iterate through both branches of the branches list (in order), and recursively call print_node().
Don't forget to increment depth when you call print_node.
Call print_node(), and pass in tree and depth 0.
Hint
You can create a for loop to iterate through the branches list, and call print_node() for each item in the list.

In [66]:
def print_with_depth(string, depth):
    prefix = "   " * depth
    print("{0}{1}".format(prefix, string))
    
def print_node(tree, depth):
    if "label" in tree:
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)
        return
    
    print_with_depth("{0} > {1}".format(tree["column"],tree["median"]), depth)
    branches = [tree["left"], tree["right"]]
    
    for b in branches:
        print_node(b, depth+1)
        
print_node(tree, 0)        

age > 37.5
   age > 25.0
      age > 22.5
         Leaf: Label 0
         Leaf: Label 1
      Leaf: Label 1
   age > 55.0
      age > 47.5
         Leaf: Label 0
         Leaf: Label 1
      Leaf: Label 0


# 5: Making Predictions Automatically

Instructions
Fill in the gaps in the predict() function that say "Insert code here...".
The code should check whether row[column] is less than or equal to median, and return the appropriate result for each side of the tree.
Print the result of predicting the first row of the data with predict(tree, data.iloc[0]).
Hint
Remember to use the return statement to return a result!

In [67]:
def predict(tree, row):
    if "label" in tree:
        return tree["label"]
    
    column = tree["column"]
    median = tree["median"]
    
    if row[column] <= median:
        return predict(tree["left"], row)
    else:
        return predict(tree["right"], row)
    
print(predict(tree, data.iloc[0]))
        
   
    
print(predict(tree, data.iloc[0]))
print(tree)
print(data)

0
0
{'left': {'left': {'left': {'number': 4, 'label': 0}, 'column': 'age', 'number': 3, 'right': {'number': 5, 'label': 1}, 'median': 22.5}, 'column': 'age', 'number': 2, 'right': {'number': 6, 'label': 1}, 'median': 25.0}, 'column': 'age', 'number': 1, 'right': {'left': {'left': {'number': 9, 'label': 0}, 'column': 'age', 'number': 8, 'right': {'number': 10, 'label': 1}, 'median': 47.5}, 'column': 'age', 'number': 7, 'right': {'number': 11, 'label': 0}, 'median': 55.0}, 'median': 37.5}
   high_income  age  marital_status
0            0   20               0
1            0   60               2
2            0   40               1
3            1   25               1
4            1   35               2
5            1   55               1


# 6: Making Multiple Predictions

Instructions
Create a function named batch_predict() that takes two parameters, tree and df.
It should use the apply() method to apply the predict() function across each row of df.
You can use lambda functions to pass tree and row into predict.
Call batch_predict() with new_data as the parameter df, and assign the result to predictions.
Hint
You can achieve the result you want with df.apply(lambda x: predict(tree, x), axis=1).

In [68]:
new_data = pd.DataFrame([
    [40,0],
    [20,2],
    [80,1],
    [15,1],
    [27,2],
    [38,1]
    ])

new_data.columns = ["age", "marital_status"]

def batch_predict(tree, df):
    return new_data.apply(lambda x: predict(tree, x), axis=1)
    
predictions = batch_predict(tree, new_data)
print(predictions)

0    0
1    0
2    0
3    0
4    1
5    0
dtype: int64


# Applying Decision Trees

# 1: Using Decision Trees With Scikit-Learn

Instructions
Fit clf to the income data.
Pass in income[columns] so that we only use the named columns as predictors.
The target is the high_income column.
Hint
You can use clf.fit to fit the model to the data. The first parameter is the predictors, and the second is the target.

In [69]:
from sklearn.tree import DecisionTreeClassifier
column = ["age", "workclass", "education_num", "maritial_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]
clf = DecisionTreeClassifier(random_state=1)
clf.fit(income[columns], income["high_income"])
print(columns)

['age', 'workclass', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'native_country']


# 2: Splitting The Data Into Train And Test Sets

Instructions
All of the rows in income with a position up to train_max_row (but not including it) will be part of the training set.
Make a new dataframe called train containing all of these rows.
Make a dataframe called test containing all of the rows with a position greater than or equal to train_max_row.
Hint
To select elements by position, use the .iloc[] method on dataframes. income.iloc[0:10] will select the first 10 rows of the dataframe (from position 0 up to, but not including, position 10).

In [70]:
import numpy as np
import math
np.random.seed(1)
income = income.reindex(np.random.permutation(income.index))
train_max_row = math.floor(income.shape[0] * .8)

train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

# 3: Evaluating Error With AUC

In [71]:
from sklearn.metrics import roc_auc_score
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
error = roc_auc_score(test["high_income"], predictions)
print(error)
print(predictions)
print(columns)
print(clf)

0.693465632475
[0 0 0 ..., 0 1 1]
['age', 'workclass', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'native_country']
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')


# 4: Computing Error On The Training Set

Instructions
Print out the AUC score between predictions and the high_income column of train.
Hint
Pass in the series object representing the high_income column from predictions as the first parameter, and predictions as the second parameter.

In [72]:
predictions = clf.predict(train[columns])
AUC_train = roc_auc_score(train["high_income"], predictions)
print(AUC_train)
print(predictions)
print(columns)
print(clf)

0.947124450144
[0 0 1 ..., 0 1 0]
['age', 'workclass', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'native_country']
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')


# 5: Reducing Overfitting With A Shallower Tree

In [73]:
clf = DecisionTreeClassifier(min_samples_split=13, random_state=1)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)
print(test_auc)
print(train_auc)

0.699561714515
0.842143184928


# 6: Tweaking Parameters To Adjust AUC

Instructions
Set max_depth to 7 and min_samples_split to 13 when creating the DecisionTreeClassifier.
Make predictions on the training set, compute the AUC, and assign it to train_auc.
Make predictions on the test set, compute the AUC, and assign it to test_auc.
Hint
Remember to create the classifier, then fit it to the data.

In [74]:
clf = DecisionTreeClassifier(random_state=1, max_depth=7, min_samples_split=13)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)
print(test_auc)
print(train_auc)

0.743634499673
0.748037708309


# 7: Tweaking Tree Depth To Adjust AUC

Instructions
Set max_depth to 2 and min_samples_split to 100 when creating the DecisionTreeClassifier.
Make predictions on the training set, compute the AUC, and assign it to train_auc.
Make predictions on the test set, compute the AUC, and assign it to test_auc.
Hint
Remember to create the classifier, then fit it to the data.

In [75]:
clf = DecisionTreeClassifier(random_state=1, max_depth=2, min_samples_split=100)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)
print(test_auc)
print(train_auc)

0.655313848188
0.662450804216


# 8: Exploring Decision Tree Variance

In [76]:
np.random.seed(1)
income["noise"] = np.random.randint(4, size=income.shape[0])
columns = ["noise", "age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]
train_max_row = math.floor(income.shape[0] * .8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)

print(test_auc)
print(train_auc)

0.691406001394
0.975076161435


# Introduction to Random Forests

# 1: Combining Model Predictions With Ensembles

Instructions
Fit both clf and clf2 to the data.
Use train[columns] as the predictors, and train["high_income"] as the target.
Make predictions on the test set predictors (test[columns]) using both clf and clf2.
For both sets of predictions, compute the AUC between the predictions and the actual values (test["high_income"]) using the roc_auc_score function.
Use the print() function to display the AUC values for both.
Hint
Use the .fit() method to fit a classifier.
Use the .predict() method to make predictions.

In [77]:
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])

clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf2.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

predictions = clf2.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.683783737734
0.675985390651


# 2: Combining Our Predictions

# Instructions
Add predictions and predictions2, then divide by 2 to get the mean.
Use numpy.round() to round all of the resulting predictions.
Print the resulting AUC score between the actual values and the predictions.
Hint
Use roc_auc_score to get the AUC.

In [78]:
import numpy as np

predictions = clf.predict_proba(test[columns])[:,1]
predictions2 = clf.predict_proba(test[columns])[:,1]

combined = (predictions + predictions2) / 2
print(combined)
rounded = np.round(combined)
print(rounded)

print(roc_auc_score(test["high_income"], rounded))

[ 0.66666667  0.          0.         ...,  0.33333333  1.          0.5       ]
[ 1.  0.  0. ...,  0.  1.  0.]
0.683783737734


# 3: Introducing Variation With Bagging

In [79]:
tree_count = 10
bag_proportion = .6

predictions = []
for i in range(tree_count):
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
    clf.fit(bag[columns], bag["high_income"])
    predictions.append(clf.predict_proba(test[columns])[:,1])
    combined = np.sum(predictions, axis=0) / 10
    rounded = np.round(combined)
    print(roc_auc_score(test["high_income"], rounded))

0.5
0.5
0.5
0.5
0.5
0.577421394506
0.645978545259
0.687432441845
0.711286703152
0.731580458661


# 4: Selecting Random Features

Instructions
Modify find_best_column to select a random sample from columns before computing information gain.
Look where it says Insert code here.
Each subset will have 2 items in it.
You can use numpy.random.choice() to select a random sample.
The first input is the list you're picking from, and the second is the number of items you want to pick.
Be careful not to overwrite columns when you do the selection.
The random sample should go in a different variable, and you'll have to modify some of the surrounding code to match.
Use the print() function to display tree.
Hint
Use numpy.random.choice(columns, 2) to select two columns randomly.

In [95]:
data = pd.DataFrame([
    [0,4,20,0],
    [0,4,60,2],
    [0,5,40,1],
    [1,4,25,1],
    [1,5,35,2],
    [1,5,55,1]
    ])

data.columns = ["high_income", "employment", "age", "marital_status"]

np.random.seed(1)

tree = {}
nodes = []

def find_best_column(data, target_name, columns):
    information_gains = []
    cols = np.random.choice(columns, 2)
    
    for col in cols:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)
        
        highest_gain_index = information_gains.index(max(information_gains))
        highest_gain = cols[highest_gain_index]
        return highest_gain
    
def id3(data, target, columns, tree):
    unique_targets = pd.unique(data[target])
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]
    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree["label"] = 0
        elif 1 in unique_targets:
            tree["label"] = 1
        return
        
        best_column = find_best_column(data, target, columns)
        column_median = data[best_column].median()
        
        tree["column"] = best_column
        tree["median"] = column_median
        
        left_split = data[data[best_column] <= column_median]
        right_split = data[data[best_column] > column_median]
        split_dict = [["left", left_split], ["right", right_split]]
        
        for name, split in split_dict:
            tree[name] = {}
            id3(split, target, columns, tree[name])
    
id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)

{'number': 1}


In [98]:
data = pd.DataFrame([
    [0,4,20,0],
    [0,4,60,2],
    [0,5,40,1],
    [1,4,25,1],
    [1,5,35,2],
    [1,5,55,1]
    ])
print(data)

data.columns = ["high_income", "employment", "age", "marital_status"]

np.random.seed(1)

tree = {}
nodes = []

def find_best_column(data, target_name, columns):
    information_gains = []
    
    for col in columns:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)
        
        highest_gain_index = information_gains.index(max(information_gains))
        highest_gain = columns[highest_gain_index]
        return highest_gain
    
def id3(data, target, columns, tree):
    unique_targets = pd.unique(data[target])
    nodes.append(len(nodes) + 1)
    
    tree["number"] = nodes[-1]
    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree["label"] = 0
        elif 1 in unique_targets:
            tree["label"] = 1
        return
    
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    tree["column"] = best_column
    tree["median"] = column_median
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    split_dict = [["left", left_split], ["right", right_split]]
    
    for name, split in split_dict:
        tree[name] = {}
        id3(split, target, columns, tree[name])
        
id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)

def find_best_column(data, target_name, columns):
    information_gains = []
    
    cols = np.random.choice(columns, 2)
    for col in cols:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)
        
        highest_gain_index = information_gains.index(max(information_gains))
        highest_gain = cols[highest_gain_index]
        
        return highest_gain
    
id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)

   0  1   2  3
0  0  4  20  0
1  0  4  60  2
2  0  5  40  1
3  1  4  25  1
4  1  5  35  2
5  1  5  55  1


RecursionError: maximum recursion depth exceeded in comparison

# 5: Random Subsets In Scikit-Learn

Instructions
Modify the instantiation of the DecisionTreeClassifier object.
Set splitter to "random", and max_features to "auto".
Print the resulting AUC score.
Hint
Refer to the documentation for DecistionTreeClassifier if you need to.

In [102]:
tree_count = 10

bag_proportion = .6

predictions = []
for i in range(tree_count):
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    clf = DecisionTreeClassifier(splitter="random",max_features="auto",random_state=1, min_samples_leaf=2)
    clf.fit(bag[columns], bag["high_income"])
    
    predictions.append(clf.predict_proba(test[columns])[:,1])
    
combined = np.sum(predictions, axis=0) / 10
rounded = np.round(combined)

print(roc_auc_score(test["high_income"], rounded))

0.73165500127


# 6: Practice Putting It All Together

In [104]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=5, random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.734724214897


# 7: Tweaking Parameters To Increase Accuracy

Instructions
Increase n_estimators to 150.
Hint
Just change the number when instantiating DecisionTreeClassifier.



In [105]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.742327748943


# 8: Reducing Overfitting

Instructions
Fit clf to the training set and use it to make predictions on the training set.
Then, use it to make predictions on the testing set.
Print both AUC scores.
Hint
Refer to the documentation for RandomForestClassifier if you need to.

In [107]:
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=5)
clf.fit(train[columns], train["high_income"])

predictions = clf.predict(train[columns])
print(roc_auc_score(train["high_income"], predictions))

predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predictions))

0.827539024247
0.718819578322
0.741439940206
