In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
import math
from pprint import pprint

data = pd.read_csv("adult.data", names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
                                        "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", 
                                        "native-country", "income"], delimiter = ",", skipinitialspace=True)

#Check the data info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [2]:
# handle missing values
data = data.dropna()
#Drop for duplicate values]
data.drop_duplicates(inplace=True)
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


### Test out train_test_split to see if proportions and randomization is correct

In [3]:
def train_test_split(dataset, test_size):
    if isinstance(test_size,float):
        test_size = round(test_size * len(dataset))
    
    indices = dataset.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)
    
    test = dataset.loc[test_indices]
    train = dataset.drop(test_indices)
    
    return train, test

In [4]:
random.seed(42)
train_data, test_data =  train_test_split(data, test_size=0.4)

In [5]:
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K


In [6]:
test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
20961,29,Private,663394,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,Black,Male,0,0,40,United-States,<=50K
3648,26,Federal-gov,48099,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States,<=50K
819,47,Private,191277,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,>50K
24314,65,Private,398001,HS-grad,9,Widowed,Adm-clerical,Not-in-family,White,Female,0,0,20,United-States,<=50K
9014,62,Private,134768,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,>50K


In [7]:
#number of test data
print(len(test_data))

#number of train data
print(len(train_data))

13015
19522


In [8]:
#check the income attribute has the correct proportion size
np.unique(test_data.income, return_counts=True)

(array(['<=50K', '>50K'], dtype=object), array([9868, 3147], dtype=int64))

In [9]:
len(data)

32537

### Function for calculating information gain (entropy)

In [10]:
### Function for calculating information gain (entropy)# calculate entropy for the dataset (For testing/verification)
label_col  = train_data.iloc[:, -1]
_, n_counts = np.unique(label_col, return_counts=True)

prob = n_counts / n_counts.sum()
entropy_of_dataset = sum(prob * -np.log2(prob))

print("Entropy of dataset: ", entropy_of_dataset)

Entropy of dataset:  0.7956122484302881


In [11]:
def entropy(target_col):
    
    elements,counts = np.unique(target_col,return_counts = True)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy

In [12]:
def InfoGain(data, split_attribute_name, target_name="income"):
    
    total_entropy = entropy(data[target_name])
    
    # check for missing values
    if data[split_attribute_name].isnull().sum() > 0:
        return -1
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    
    # check for uniqueness
    if len(counts) == 1:
        return -1
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    
    return Information_Gain

In [13]:
ig = InfoGain(data, 'age')
print(ig)

0.09916806704564907


### Function for calculating gain ratio

In [14]:
def GainRatio(data,split_attribute_name,target_name="income"):
    
    # Calculate the InfoGain for the given data and split attribute.
    info_gain = InfoGain(data,split_attribute_name,target_name)
    
    # Calculate the Intrinsic Information for the given data and split attribute.
    intrinsic_info = -(data[split_attribute_name].value_counts()/data.shape[0])*np.log2(data[split_attribute_name].value_counts()/data.shape[0])
    intrinsic_info = intrinsic_info.sum()
    
    # If the Intrinsic Information is 0, set the Gain Ratio to 0
    if intrinsic_info == 0:
        gain_ratio = 0
    else:
        gain_ratio = info_gain/intrinsic_info
    
    return gain_ratio

In [15]:
gr = GainRatio(data, 'age')
print(gr)

0.017448744667473145


### Function for calculating gini index

In [16]:
def GiniIndex(data,split_attribute_name,target_name="income"):
    
    gini = 1-((data[target_name].value_counts()/data.shape[0])**2).sum()
    
    return gini

In [17]:
gi = GiniIndex(data, 'age')
print(gi)

0.36576103005525873


### Function for calculating best split based on three models

The function checks the value of split_criteria and determines the best feature to split on based on that value. If the value is "info_gain", the best feature is determined by finding the maximum value of the information gain using the InfoGain function and the max function along with a lambda function that calculates the information gain for each feature. If the value is "Gain_ratio", the best feature is determined in the same way, but using the GainRatio function instead. If the value is "gini_index", the best feature is determined using the GiniIndex function in the same way.

In [18]:
### Function for calculating gini index# Function to decide the best feature to split on
def best_feature_to_split(data,split_criteria,target_name="income"):
    
    # Get a list of all feature names
    feature_names = list(data.columns)
    
    # Remove the target name from the list of feature names
    feature_names.remove(target_name)
    
    # Check the split criteria and determine the best feature accordingly
    if split_criteria == "info_gain":
        best_feature = max(feature_names, key=lambda x: InfoGain(data, x, target_name))
    elif split_criteria == "gain_ratio":
        best_feature = max(feature_names, key=lambda x: GainRatio(data, x, target_name))
    elif split_criteria == "gini_index":
        best_feature = max(feature_names, key=lambda x: GiniIndex(data, x, target_name))
    
    return best_feature

In [19]:
#calculate split info for one feature
split_info = 0
    
unique_vals, n_counts = np.unique(train_data['income'], return_counts=True)
prob = n_counts / n_counts.sum()
    
print(unique_vals)
print(n_counts)
print(n_counts.sum())
    
for idx, values in enumerate(unique_vals):
    
    occurences = n_counts[idx]
    prob = occurences/n_counts.sum()
    temp = (prob* - np.log2(prob))
    split_info = split_info + temp
    
print(split_info)

['<=50K' '>50K']
[14830  4692]
19522
0.7956122484302881


### Function for calculating tree

If all the observations have the same label, the function returns that label.
If there are no more features to split on, the function returns the most common label in the data.
If neither of the base cases are met, the function determines the best feature to split on based on the input split criteria (info_gain, Gain_ratio, or gini_index

In [20]:
# Function to build the decision tree
def build_tree(data, split_criteria, target_name="income"):
    
    # Base case: if all the observations have the same label, return that label
    if len(np.unique(data[target_name])) <= 1:
        return np.unique(data[target_name])[0]
    
    # If there are no more features to split on, return the most common label
    elif len(data.columns) == 1:
        #return data[target_name].value_counts().idxmax()
        return data[target_name].mode()[0]
    else:
        # Choose the feature to split on
        best_feature = best_feature_to_split(data, split_criteria, target_name)
        tree = {best_feature:{}}
        
        # Recursively build the tree
        for value in np.unique(data[best_feature]):
            sub_data = data[data[best_feature] == value].drop(best_feature, axis=1)
            subtree = build_tree(sub_data, split_criteria, target_name)
            tree[best_feature][value] = subtree
    return tree

### Function for checking if we have reached a leaf node

In [21]:
def check_leaf(data):
    ##store last column as label_col
    label_col = data[:,-1]
    
    #use np.unique() to extract out unique values from label_col
    classes = np.unique(label_col)

    if len(classes) == 1:
        return True
    else:
        return False;

In [22]:
np.unique(test_data.income, return_counts=True)

(array(['<=50K', '>50K'], dtype=object), array([9868, 3147], dtype=int64))

In [23]:
check_leaf(train_data.values)

False

In [24]:
data_copy = train_data.values
data_copy[:5]

array([[39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married',
        'Adm-clerical', 'Not-in-family', 'White', 'Male', 2174, 0, 40,
        'United-States', '<=50K'],
       [50, 'Self-emp-not-inc', 83311, 'Bachelors', 13,
        'Married-civ-spouse', 'Exec-managerial', 'Husband', 'White',
        'Male', 0, 0, 13, 'United-States', '<=50K'],
       [28, 'Private', 338409, 'Bachelors', 13, 'Married-civ-spouse',
        'Prof-specialty', 'Wife', 'Black', 'Female', 0, 0, 40, 'Cuba',
        '<=50K'],
       [52, 'Self-emp-not-inc', 209642, 'HS-grad', 9,
        'Married-civ-spouse', 'Exec-managerial', 'Husband', 'White',
        'Male', 0, 0, 45, 'United-States', '>50K'],
       [31, 'Private', 45781, 'Masters', 14, 'Never-married',
        'Prof-specialty', 'Not-in-family', 'White', 'Female', 14084, 0,
        50, 'United-States', '>50K']], dtype=object)

In [25]:
check_leaf(train_data[train_data.income=='priority'].values)

False

### Function for classification

In [26]:
def classify_data(data):
    
    ##since our labels are always in the last column
    label_col = data[:,-1]
    
    #find each unique classes in the label column and how many times it appears
    classes, n_classes = np.unique(label_col, return_counts=True)

    #find index of the most frequent class
    index = n_classes.argmax()
    
    #extract out the name
    classification = classes[index]
    
    return classification

In [27]:
classify_data(train_data.values)

'<=50K'

### Test out all the three models

In [28]:
#Build decision tree using information gain
info_gain_tree = build_tree(train_data, "info_gain")
print("Decision Tree using Information Gain:",info_gain_tree)

Decision Tree using Information Gain: {'fnlwgt': {12285: '<=50K', 13769: '<=50K', 14878: '>50K', 19302: {'age': {30: '<=50K', 31: '>50K', 32: '>50K'}}, 19847: '<=50K', 19899: '>50K', 19914: '<=50K', 20057: '<=50K', 20101: '<=50K', 20296: '>50K', 20308: '<=50K', 20333: '>50K', 20438: '<=50K', 20469: '<=50K', 20511: '<=50K', 20534: '>50K', 20795: {'age': {50: '>50K', 51: '<=50K'}}, 20809: '>50K', 20953: '>50K', 20956: {'age': {46: '>50K', 47: '<=50K'}}, 21095: '<=50K', 21154: '<=50K', 21472: '<=50K', 21626: '<=50K', 21792: '<=50K', 21856: '<=50K', 21876: '<=50K', 22055: '<=50K', 22186: '<=50K', 22201: {'age': {24: '<=50K', 31: '>50K', 37: '>50K', 41: '>50K'}}, 22211: '>50K', 22245: {'education': {'Assoc-voc': '<=50K', 'Bachelors': '<=50K', 'Doctorate': '>50K', 'HS-grad': '>50K', 'Some-college': '<=50K'}}, 22313: '<=50K', 22328: '>50K', 22418: '<=50K', 22422: '<=50K', 22428: {'age': {50: '<=50K', 52: '>50K'}}, 22463: {'age': {35: '<=50K', 37: '>50K'}}, 22494: '<=50K', 22546: '<=50K', 2264

In [29]:
#Build decision tree using gain ratio
gain_ratio_tree = build_tree(train_data, "gain_ratio")
print("Decision Tree using Gain Ratio:",gain_ratio_tree)

Decision Tree using Gain Ratio: {'capital-gain': {0: {'capital-loss': {0: {'marital-status': {'Divorced': {'fnlwgt': {19847: '<=50K', 20296: '>50K', 20809: '>50K', 21095: '<=50K', 21626: '<=50K', 21876: '<=50K', 22245: '<=50K', 22418: '<=50K', 23580: '<=50K', 23789: '<=50K', 23871: '<=50K', 23892: '<=50K', 24215: '<=50K', 24264: '<=50K', 24562: '<=50K', 24896: '<=50K', 26254: '<=50K', 26401: '<=50K', 26669: '<=50K', 26950: '<=50K', 26987: '<=50K', 26994: '<=50K', 27067: '<=50K', 27494: '>50K', 27886: '<=50K', 28151: '>50K', 28291: '<=50K', 28375: '<=50K', 28568: '>50K', 28572: '<=50K', 28735: '<=50K', 28738: '<=50K', 28791: '<=50K', 29320: '>50K', 29430: '<=50K', 29557: '<=50K', 29617: '<=50K', 29762: '<=50K', 29810: '<=50K', 29865: '<=50K', 29887: '<=50K', 30063: '<=50K', 30126: '<=50K', 30226: '<=50K', 30447: '<=50K', 30499: '<=50K', 30509: '<=50K', 30673: '>50K', 30751: '<=50K', 30840: '>50K', 31053: '<=50K', 31195: '<=50K', 31352: '>50K', 31438: '<=50K', 31449: '<=50K', 31460: '<=5

In [30]:
#Build decision tree using Gini Index
gini_index_tree = build_tree(train_data, "gini_index")
print("Decision Tree using Gini Index:",gini_index_tree)

Decision Tree using Gini Index: {'age': {17: '<=50K', 18: '<=50K', 19: {'workclass': {'?': {'fnlwgt': {20469: '<=50K', 26620: '<=50K', 32477: '<=50K', 33487: '<=50K', 35507: '<=50K', 37332: '<=50K', 43739: '<=50K', 47713: '<=50K', 52114: '<=50K', 60688: '<=50K', 62534: '<=50K', 71592: '<=50K', 80710: '<=50K', 109938: '<=50K', 112780: '<=50K', 113915: '<=50K', 117201: '<=50K', 128453: '<=50K', 129586: '<=50K', 131982: '<=50K', 133983: '<=50K', 134974: '<=50K', 137578: '<=50K', 140399: '<=50K', 140590: '<=50K', 141418: '<=50K', 143867: '<=50K', 158603: '<=50K', 169324: '<=50K', 169758: '<=50K', 170653: '<=50K', 174233: '<=50K', 174871: '<=50K', 182590: '<=50K', 185619: '<=50K', 192773: '<=50K', 194095: '<=50K', 195282: '<=50K', 199495: '<=50K', 199609: '<=50K', 200790: '>50K', 204441: '<=50K', 204868: '<=50K', 208874: '<=50K', 217194: '<=50K', 220517: '<=50K', 225775: '<=50K', 230874: '<=50K', 233779: '<=50K', 242001: '<=50K', 249147: '<=50K', 252752: '<=50K', 257343: '<=50K', 278220: '<

### Function for predicting values and calculating accuracy

In [31]:
def predict(tree, observation):
    
    # If the current node of the tree is a string, return it
    if type(tree) is str:
        return tree
    else:
        # Get the value of the feature in the observation
        feature = next(iter(tree))
        # Recursively call the function with the whole tree and the observation
        if feature not in observation:
            return predict(tree[feature], observation)
        else:
            feature_value = observation[feature]
            if feature_value not in tree[feature]:
                return predict(tree[feature], observation)
            else:
                # Get the subtree corresponding to the feature value
                subtree = tree[feature][feature_value]
                return predict(subtree, observation)

In [32]:
def accuracy(tree, test_data, target_name="income"):
    
    count = 0
    for index, row in test_data.iterrows():
        if predict(tree, row.to_dict()) == ensemble_predict(dt1, dt2, dt3, row.to_dict()):
            count += 1
    
    acc = count/len(test_data)
    return acc

The ensemble_predict function takes as input the three trained decision tree models, and an observation. It predicts the target label for the observation by applying the predict function on each of the decision trees and passing the observation as input.

In [33]:
# Train the three decision tree models
dt1 = build_tree(train_data, "info_gain")
dt2 = build_tree(train_data, "gain_ratio")
dt3 = build_tree(train_data, "gini_index")

# Define the ensemble_predict function
def ensemble_predict(dt1, dt2, dt3, observation):
    prediction1 = predict(dt1, observation)
    prediction2 = predict(dt2, observation)
    prediction3 = predict(dt3, observation)
    
    # Create a list of predictions
    predictions = [prediction1, prediction2, prediction3]
    
    # Use a majority voting function to determine the final prediction
    return max(set(predictions), key = predictions.count)

In [34]:
# Make a prediction for a new observation
new_observation = {"age": 20, "workclass": "Private", "fnlwgt": 266015, "education": "Some-college", "education-num": 10, "marital-status": "Never-married", "occupation": "Sales", "relationship": "Own-child", "race": "Black", "sex": "Male", "capital-gain": 0, "capital-loss": 0, "hours-per-week": 44, "native-country": "United-States"}
ensemble_prediction = ensemble_predict(dt1, dt2, dt3, new_observation)
print("Ensemble Prediction: ", ensemble_prediction)

Ensemble Prediction:  <=50K


In [35]:
ensemble_acc1 = accuracy(dt1, test_data)
ensemble_acc2 = accuracy(dt2, test_data)
ensemble_acc3 = accuracy(dt3, test_data)

print("Ensemble Accuracy for info_gain: ", ensemble_acc1)
print("Ensemble Accuracy for gain_ratio: ", ensemble_acc2)
print("Ensemble Accuracy for gini_index: ", ensemble_acc3)

Ensemble Accuracy for info_gain:  0.9446023818670765
Ensemble Accuracy for gain_ratio:  0.9293891663465232
Ensemble Accuracy for gini_index:  0.7466769112562428


In general, the accuracy for info_gain and gain_ratio has a relative high accuracy. However, the accuracy for gini_index is low with 0.747.
To improve the accuracy, collect more training data: having more training data can help the models to learn more complex relationships between the features and the target. Use a different model: decision trees are good models, but they are not the best models for all datasets. For example, support vector machines, neural networks, and random forests are often better models.