In [121]:
import numpy as np
import pandas as pd
import json
from anytree.importer import JsonImporter
from anytree import RenderTree
import copy
import time
import datetime
Unseen =[]

listOfUnseen=[]

In [138]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

def convert(d):
    for k, v in d.items():
        return { "name":k, "children": convert_helper(v)}

def convert_helper(d):
    if isinstance(d, dict):
        return [{"name":k, "children":convert_helper(v)} for k, v in d.items()]
    else:
        if d==0.0:
            d = "<=50K"
        else:
            d = ">50K"
        return [{"name": d}]

def DecodedDict():
    Education = {
        0: "Primary",
        1: "Some High School",
        2: "Grad High School",
        3: "Some College",
        4: "Bachelors",
        5: "Masters",
        6: "Doctorate",
        7: "Professor - School",
        8: "Associate - Academia",
        9: "Associate - Voc"
    }

    Gender = {
        0: "Male",
        1: "Female"
    }

    Race = {
        0: "Black",
        1: "Asian-Pacific-Islander",
        2: "Other",
        3: "White",
        4: "American-Indian-Eskimo",
    }

    Occupation = {
        1: "Farm Fishing",
        2: "Tech Support",
        3: "Admin Clerical",
        4: "Handlers Cleaners",
        5: "Professor Speciality",
        6: "Machine-Op Inspector",
        7: "Exec Managerial",
        8: "Private House Server",
        9: "Craft - Repair",
        10: "Sales",
        11: "Transport-Moving",
        12: "Armed Forces",
        13: "Other Services",
        14: "Protective Services"
    }

    Hours = {
        0: "1-9",
        1: "10-19",
        2: "20-29",
        3: "30-39",
        4: "40-49",
        5: "50-59",
        6: "60-69",
        7: "70-79",
        8: "80-89",
        9: "90-99"
    }

    Age = {
        0: "17-19",
        1: "20-29",
        2: "30-39",
        3: "40-49",
        4: "50-59",
        5: "60-69",
        6: "70-79",
        7: "80-90",
    }



    decodes = {
        "Education": Education,
        "Gender": Gender,
        "Race": Race,
        "Occupation": Occupation,
        "Hours": Hours,
        "Age": Age
    }
    return decodes

def SplitData(dataFrame):
    # Shuffles Dataset
    # 60% Training
    # 20% Testing
    # 20% Validation

    trainD, validateD, testD = np.split(dataFrame.sample(frac=1), [int(.6 * len(dataFrame)), int(.8 * len(dataFrame))])
    trainD = trainD.reset_index(drop=True)
    validateD = validateD.reset_index(drop=True)
    testD = testD.reset_index(drop=True)
    return trainD, validateD, testD

def calcEntropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = []
    for i in range(len(elements)):
        c = counts[i]
        total = np.sum(counts)
        frac = c/total
        log = np.log2(frac)
        ent = -frac*log
        entropy.append(ent)

    return np.sum(entropy)

def InfoGain(data, attribute_name, class_name="class"):
    total_entropy = calcEntropy(data[class_name])
    totalSize = data.shape[0]

    feature = data[attribute_name]
    vals, counts = np.unique(feature, return_counts=True)

    Weighted_Entropy = []
    for i in range(len(vals)):
        Class_Col = data.where(feature == vals[i]).dropna()[class_name]
        ent = calcEntropy(Class_Col)
        frac = (counts[i] / np.sum(counts))
        prod = frac * ent
        Weighted_Entropy.append(prod)

    Weighted_Entropy = np.sum(Weighted_Entropy)

    Information_Gain = total_entropy - (1/totalSize) * Weighted_Entropy
    return Information_Gain

def BuildTree(data, originalData, features, target_attribute_name="class", parent_node_class=None, depth=0, max_depth = None):
    Class_Col = data[target_attribute_name]
    Orig_Class_Col = originalData[target_attribute_name]
    depth +=1
    # Stopping Conditions:
    # if there is only one class in the table
    if len(np.unique(Class_Col)) <= 1:
        return np.unique(data[target_attribute_name])[0]

    elif max_depth is not None and depth == max_depth:
        return parent_node_class
        #return np.unique(Orig_Class_Col)[np.argmax(np.unique(Orig_Class_Col, return_counts=True)[1])]

    #if there is no data in the table, return the most common class
    elif len(data) == 0:
        return np.unique(Orig_Class_Col)[np.argmax(np.unique(Orig_Class_Col, return_counts=True)[1])]

    # if all the features are used.
    elif len(features) == 0:
        return parent_node_class

    # Build the tree.
    else:
        # the parent of the feature.
        parent_node_class = np.unique(Class_Col)[np.argmax(np.unique(Class_Col, return_counts=True)[1])]

        # Calculate the info gain for all the features.
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]

        # Pick the items with the highest info gain (will be used to split the data)
        best_feature_index = np.argmax(item_values)
        root = features[best_feature_index]

        tree = {root: {}}

        # Remove the best feature from the data.
        features = [i for i in features if i != root]

        # Recursively build the tree.
        for v in np.unique(data[root]):
            v = v
            sub_data = data.where(data[root] == v).dropna()
            subtree = BuildTree(sub_data, originalData, features, target_attribute_name, parent_node_class,depth, max_depth)
            tree[root][v] = subtree

        return tree

def TrainingDepth(train_data,feature_list):
    depth_trees = {}
    for i in range(2,8):
        tree = BuildTree(train_data, train_data, feature_list,depth=-1, max_depth=i)
        depth_trees[i] = tree
    return depth_trees

def TrainingFeatures(train_data,feature_list):
    feat_size = len(feature_list)
    feature_set = [feature_list]
    for i in range(feat_size):
        new_feat_set = copy.deepcopy(feature_list)
        new_feat_set.pop(i)
        feature_set.append(new_feat_set)

    feat_dict = {}
    for feat in feature_set:
        feat_dict[tuple(feat)] =  BuildTree(train_data, train_data, feat)

    return feat_dict

def Training(data,feature_list):
    tree = BuildTree(data, data, feature_list)
    return tree

def TreeErrors(depth_trees, valid_data):
    depth_errors ={}
    for k,v in depth_trees.items():
        curr_err =calcErrors(valid_data, v)[0]["Error"]
        depth_errors[k] = curr_err

    return min(depth_errors, key=lambda k1: depth_errors[k1])

def predict(query,tree,default = 0):
    for k in list(query.keys()):
        if k in list(tree.keys()):
            try:
                result = tree[k][query[k]]
                if isinstance(result,dict):
                    return predict(query,result)

                else:
                    return result
            except KeyError:
                Unseen.append(query)
                return default

def testing(data,tree):
    Unseen.clear()
    labels = data.copy().pop(data.columns[-1])
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    predictions = pd.DataFrame(columns=["predicted"])

    for i in range(len(data)):
        predictions.loc[i,"predicted"] = predict(queries[i],tree,0.0)
    predictions = predictions.join(labels)
    predictions = predictions.astype(int)
    return predictions

def tree2JSON(tree, fileName):
    with open(fileName+".json", "w") as f:
        json.dump(convert(tree), fp=f, indent=2, cls=NpEncoder)

def treeViewer(fileName, Print=False, ToFile=False):
    importer = JsonImporter()
    with open(fileName+'.json',"r") as js:
        data = js.read()
    root = importer.import_(data)
    if Print:
        if ToFile:
            file = open(fileName+".txt","w+")
            for pre, fill, node in RenderTree(root):
                print("%s%s" % (pre, node.name), file=file)

        else:
             for pre, fill, node in RenderTree(root):
                print("%s%s" % (pre, node.name))
    return root

def confusionMatrix(predicted, actual):
    df_confusion = pd.crosstab(actual, predicted, rownames=['Actual'], colnames=['Predicted'], dropna=False)
    return df_confusion

def calcEvalMetrics(cm):
    TP = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TN = cm[1][1]
    Total  = TP+FP+FN+TN
    error = (FP +FN)/Total
    accuracy = (TP+TN)/Total
    falseAlarm = FP/(FP+TN)
    miss = FN/(TP+FN)
    recall = TP/(TP+FN)
    precision = TP/(TP+FP)
    metrics =  {"Error":error,
                "Accuracy": accuracy,
                "False Alarm": falseAlarm,
                "Miss": miss,
                "Recall": recall,
                "Precision":precision
                }
    return metrics

def calcErrors(data,tree):
    df = testing(data, tree)
    predicted= df[df.columns[0]].to_numpy()
    actual = df[df.columns[1]].to_numpy()
    cm = confusionMatrix(predicted, actual)
    metrics = calcEvalMetrics(cm)
    return metrics, cm

def PrintMetrics(unseen, metrics, cm, title,File=None, ToFile=False,hyperParams =None):
    mostIssues(unseen)
    if ToFile:
        f = File
        print(title, file=f)
        print("Note: O:  <=50K and 1: >50K \n",file=f)

        if hyperParams is not None:
            print("HyperParameters:",file=f)
            print("Features List:", hyperParams[0],file=f)
            print("Max Depth:", hyperParams[1],file=f)
            print("\n",file=f)

        print("No. of Unseen Data Points: ",len(unseen), "(Default is 0)",file=f)

        print("\n",file=f)
        print("Confusion Matrix:",file=f)
        print(cm,file=f)

        print("\n",file=f)
        print("Metrics:",file=f)
        for k,v in metrics.items():
            print("{0:15} {1}".format(k,v),file=f)
        print("\n", file=f)
    else:
        print(title)
        print("Note: O:  <=50K and 1: >50K \n")

        if hyperParams is not None:
            print("HyperParameters:")
            print("Features List:", hyperParams[0])
            print("Max Depth:", hyperParams[1])
            print("\n")

        print("No. of Unseen Data Points: ",len(unseen), "(Default is 0)")

        print("\n")
        print("Confusion Matrix:")
        print(cm)

        print("\n")
        print("Metrics:")
        for k,v in metrics.items():
            print("{0:15} {1}".format(k,v))
        print("\n")

def PrintResults(test_data, old_tree, new_tree, hyperParams, FileName ="Results", ToFile=False ):
    listOfUnseen.clear()


    if ToFile:
        File = open(FileName+".txt", "w+")
        title1 = "Without HyperParameters Tuning:"
        old_Metrics, old_CM = calcErrors(test_data, old_tree)
        PrintMetrics(Unseen, old_Metrics, old_CM, title1, File, ToFile,hyperParams=(feats, "None"))
        title2 = "With HyperParameters Tuning:"
        new_Metrics, new_CM = calcErrors(test_data, new_tree)
        PrintMetrics(Unseen, new_Metrics, new_CM, title2,File, ToFile, hyperParams=hyperParams)
        File.close()
    else:
        title1 = "Without HypeParameters Tuning:"
        old_Metrics, old_CM = calcErrors(test_data, old_tree)
        PrintMetrics(Unseen, old_Metrics,  old_CM,title1, hyperParams=(feats, "None"))

        title2 = "With HyperParameters Tuning:"
        new_Metrics, new_CM = calcErrors(test_data, new_tree)
        PrintMetrics(Unseen, new_Metrics, new_CM, title2,hyperParams=hyperParams)

def printUnseen(uS):
    stats={}
    for k,v in uS[0].items():
        stats[k]={}


    for l in uS:
        for k,v in l.items():
            key_dict = stats[k]
            if v not in key_dict.keys():
                key_dict[v] = 0
            else:
                key_dict[v] += 1
    return stats

def statsView(stats):
    vals = {}
    for k,v in stats.items():
        #print(k)
        max_key = max(v, key=lambda k2: v[k2])
        for k1,v1 in v.items():
            #print(k1, v1)
            pass
        print(max_key, v[max_key])
        vals[k] = max_key
    print("\n")
    return vals
def mostIssues(unseen):
    stats1 = printUnseen(unseen)
    struggle1 = statsView(stats1)
    print(len(unseen),struggle1)

In [17]:
DataSet = pd.read_csv("income_dataset.csv")
DataSet.rename(columns={"education":"Education","age":"Age","race":"Race","gender":"Gender","occupation":"Occupation","hours per week": "Hours", "income": "class"}, inplace=True)
values = DecodedDict()

for key, value in values.items():
    DataSet[key] = DataSet[key].map(value)

train, validate, test = SplitData(DataSet.copy())
feats = list(train.columns[:-1])


In [18]:
tuningStart_time = time.time()
# Hyperparameter Tuning:
# Find tree that will give lowest error when you remove a single feature from list
feat_trees = TrainingFeatures(train, feats)
best_feat_set = list(TreeErrors(feat_trees, validate))

# Find the best tree that will give the lowest error when changing the max depth the tree can grow to.
depth_Trees = TrainingDepth(train, best_feat_set)
best_depth_tree = TreeErrors(depth_Trees, validate)

# Best Tree
Tree = depth_Trees[best_depth_tree]
tree2JSON(Tree, "Tree(Tuning)")
TreeStruct = treeViewer("Tree(Tuning)",Print=True,ToFile=True)
hyperParamsSet = (best_feat_set, best_depth_tree)
print("Time Taken With Tuning: ","--- %s seconds ---" % (time.time() - tuningStart_time))
print("Time Taken With Tuning: ",str(datetime.timedelta(seconds=(time.time() - tuningStart_time))))


Time Taken With Tuning:  --- 438.8210184574127 seconds ---
Time Taken With Tuning:  0:07:18.821247


In [22]:
print("\n")
Start_time = time.time()
# Tree Without HyperParameter Tuning
oldTree = Training(train, feats)
tree2JSON(oldTree, "Tree")
OldTreeStruct = treeViewer("Tree",Print=True,ToFile=True)
print("Time Taken Without Tuning: ","--- %s seconds ---" % (time.time() - Start_time))
print("Time Taken Without Tuning: ",str(datetime.timedelta(seconds=(time.time() - Start_time))))



Time Taken Without Tuning:  --- 68.07447648048401 seconds ---
Time Taken Without Tuning:  0:01:08.074698


In [139]:
PrintResults(test, oldTree, Tree, hyperParamsSet, ToFile=True)
#PrintResults(test, oldTree, Tree, hyperParamsSet, ToFile=False)





40-49 90
Grad High School 74
Professor Speciality 51
White 233
Male 238
40-49 113


361 {'Age': '40-49', 'Education': 'Grad High School', 'Occupation': 'Professor Speciality', 'Race': 'White', 'Gender': 'Male', 'Hours': '40-49'}
60-69 18
Grad High School 20
Farm Fishing 10
White 33
Male 47
60-69 9


67 {'Age': '60-69', 'Education': 'Grad High School', 'Occupation': 'Farm Fishing', 'Race': 'White', 'Gender': 'Male', 'Hours': '60-69'}


In [133]:
print(train[(train['Occupation']=="Professor Speciality")].count())


Age           2314
Education     2314
Occupation    2314
Race          2314
Gender        2314
Hours         2314
class         2314
dtype: int64
