In [6]:
# Unit tests for kNN and decision trees (random forest tests are in random_forest.ipynb):

import numpy as np
import scipy.stats as stats
import random
import math

from mysklearn.myclassifiers import MyDecisionTreeClassifier
import mysklearn.myutils as myutils
    
def test_decision_tree_classifier_fit():
    """Check that fit() builds the expected tree for two known datasets.

    For each dataset a fresh MyDecisionTreeClassifier is fitted and its
    ``tree`` attribute is compared, via ``myutils.equivalent``, against a
    hand-built nested-list tree.
    """

    def _fit_on(table, header, label_name):
        # Pull the label column out first, then drop it from the table,
        # and fit a brand-new classifier on what is left.
        clf = MyDecisionTreeClassifier()
        labels = myutils.get_column(table, header, label_name)
        features = myutils.drop_column(table, header, label_name)
        clf.fit(features, labels)
        return clf

    # --- interview dataset: four attributes, class label in the last column
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [
        ["Senior", "Java", "no", "no", "False"],
        ["Senior", "Java", "no", "yes", "False"],
        ["Mid", "Python", "no", "no", "True"],
        ["Junior", "Python", "no", "no", "True"],
        ["Junior", "R", "yes", "no", "True"],
        ["Junior", "R", "yes", "yes", "False"],
        ["Mid", "R", "yes", "yes", "True"],
        ["Senior", "Python", "no", "no", "False"],
        ["Senior", "R", "yes", "no", "True"],
        ["Junior", "Python", "yes", "no", "True"],
        ["Senior", "Python", "yes", "yes", "True"],
        ["Mid", "Python", "no", "yes", "True"],
        ["Mid", "Java", "yes", "no", "True"],
        ["Junior", "Python", "no", "yes", "False"],
    ]

    # Expected tree. "attN" names the N-th feature column; the two numbers
    # closing each leaf match the count of training rows reaching that leaf
    # and the size of the partition that was split to produce it.
    expected_interview_tree = [
        "Attribute", "att0",
        ["Value", "Junior",
            ["Attribute", "att3",
                ["Value", "no", ["Leaf", "True", 3, 5]],
                ["Value", "yes", ["Leaf", "False", 2, 5]],
            ],
        ],
        ["Value", "Mid", ["Leaf", "True", 4, 14]],
        ["Value", "Senior",
            ["Attribute", "att2",
                ["Value", "no", ["Leaf", "False", 3, 5]],
                ["Value", "yes", ["Leaf", "True", 2, 5]],
            ],
        ],
    ]

    interview_clf = _fit_on(interview_table, interview_header, "interviewed_well")
    assert myutils.equivalent(interview_clf.tree, expected_interview_tree)

    # --- Bramer degrees dataset: five attributes, class label in the last column
    degrees_header = ["SoftEng", "ARIN", "HCI", "CSA", "Project", "Class"]
    degrees_table = [
        ["A", "B", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "A", "FIRST"],
        ["A", "A", "A", "B", "B", "SECOND"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "A", "B", "B", "A", "FIRST"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
        ["A", "A", "A", "A", "A", "FIRST"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "A", "B", "SECOND"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["A", "A", "B", "A", "B", "FIRST"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["A", "A", "B", "B", "B", "SECOND"],
        ["B", "B", "B", "B", "B", "SECOND"],
        ["A", "A", "B", "A", "A", "FIRST"],
        ["B", "B", "B", "A", "A", "SECOND"],
        ["B", "B", "A", "A", "B", "SECOND"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["B", "A", "B", "A", "B", "SECOND"],
        ["A", "B", "B", "B", "A", "FIRST"],
        ["A", "B", "A", "B", "B", "SECOND"],
        ["B", "A", "B", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
    ]

    # Worked out by hand using entropy (per the original author's note), so
    # this comparison only passes once entropy-based selection is implemented.
    expected_degrees_tree = [
        "Attribute", "att0",
        ["Value", "A",
            ["Attribute", "att4",
                ["Value", "A", ["Leaf", "FIRST", 5, 14]],
                ["Value", "B",
                    ["Attribute", "att3",
                        ["Value", "A",
                            ["Attribute", "att1",
                                ["Value", "A", ["Leaf", "FIRST", 1, 2]],
                                ["Value", "B", ["Leaf", "SECOND", 1, 2]],
                            ],
                        ],
                        ["Value", "B", ["Leaf", "SECOND", 7, 9]],
                    ],
                ],
            ],
        ],
        ["Value", "B", ["Leaf", "SECOND", 12, 26]],
    ]

    degrees_clf = _fit_on(degrees_table, degrees_header, "Class")
    assert myutils.equivalent(degrees_clf.tree, expected_degrees_tree)

def test_decision_tree_classifier_predict():
    """Check predict() on unseen rows for the interview and degrees datasets.

    A fresh MyDecisionTreeClassifier is fitted per dataset; its predictions
    for a few hand-picked rows are compared to the expected class labels
    with ``myutils.equivalent``.
    """

    def _fit_on(table, header, label_name):
        # Label column is extracted before it is dropped from the table.
        clf = MyDecisionTreeClassifier()
        labels = myutils.get_column(table, header, label_name)
        features = myutils.drop_column(table, header, label_name)
        clf.fit(features, labels)
        return clf

    # --- interview dataset
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [
        ["Senior", "Java", "no", "no", "False"],
        ["Senior", "Java", "no", "yes", "False"],
        ["Mid", "Python", "no", "no", "True"],
        ["Junior", "Python", "no", "no", "True"],
        ["Junior", "R", "yes", "no", "True"],
        ["Junior", "R", "yes", "yes", "False"],
        ["Mid", "R", "yes", "yes", "True"],
        ["Senior", "Python", "no", "no", "False"],
        ["Senior", "R", "yes", "no", "True"],
        ["Junior", "Python", "yes", "no", "True"],
        ["Senior", "Python", "yes", "yes", "True"],
        ["Mid", "Python", "no", "yes", "True"],
        ["Mid", "Java", "yes", "no", "True"],
        ["Junior", "Python", "no", "yes", "False"],
    ]
    interview_clf = _fit_on(interview_table, interview_header, "interviewed_well")

    # Two Junior candidates that differ only in the phd column.
    interview_unseen = [
        ["Junior", "Java", "yes", "no"],
        ["Junior", "Java", "yes", "yes"],
    ]
    interview_expected = ["True", "False"]
    assert myutils.equivalent(interview_clf.predict(interview_unseen), interview_expected)

    # --- Bramer degrees dataset
    degrees_header = ["SoftEng", "ARIN", "HCI", "CSA", "Project", "Class"]
    degrees_table = [
        ["A", "B", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "A", "FIRST"],
        ["A", "A", "A", "B", "B", "SECOND"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "A", "B", "B", "A", "FIRST"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
        ["A", "A", "A", "A", "A", "FIRST"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "A", "B", "SECOND"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["A", "A", "B", "A", "B", "FIRST"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["A", "A", "B", "B", "B", "SECOND"],
        ["B", "B", "B", "B", "B", "SECOND"],
        ["A", "A", "B", "A", "A", "FIRST"],
        ["B", "B", "B", "A", "A", "SECOND"],
        ["B", "B", "A", "A", "B", "SECOND"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["B", "A", "B", "A", "B", "SECOND"],
        ["A", "B", "B", "B", "A", "FIRST"],
        ["A", "B", "A", "B", "B", "SECOND"],
        ["B", "A", "B", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
    ]
    degrees_clf = _fit_on(degrees_table, degrees_header, "Class")

    degrees_unseen = [
        ["B", "B", "B", "B", "B"],
        ["A", "A", "A", "A", "A"],
        ["A", "A", "A", "A", "B"],
    ]
    degrees_expected = ["SECOND", "FIRST", "FIRST"]
    assert myutils.equivalent(degrees_clf.predict(degrees_unseen), degrees_expected)

    # Passing both fit and predict gives good confidence in the implementation.
    # Because it was tricky, a few more tests are run on the back end.
    
def test_decision_tree_classifier_print_rules():
    """Print the decision rules of trees fitted on two known datasets.

    Each dataset (interview, Bramer degrees) is fitted twice with a fresh
    MyDecisionTreeClassifier: once printing rules with the real column
    names and once calling ``print_decision_rules()`` with no arguments.
    Nothing is asserted; the printed rules are meant to be read by eye.
    """
    # Each fixture is defined once; the class label is the last column.
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_rows = [
        ["Senior", "Java", "no", "no", "False"],
        ["Senior", "Java", "no", "yes", "False"],
        ["Mid", "Python", "no", "no", "True"],
        ["Junior", "Python", "no", "no", "True"],
        ["Junior", "R", "yes", "no", "True"],
        ["Junior", "R", "yes", "yes", "False"],
        ["Mid", "R", "yes", "yes", "True"],
        ["Senior", "Python", "no", "no", "False"],
        ["Senior", "R", "yes", "no", "True"],
        ["Junior", "Python", "yes", "no", "True"],
        ["Senior", "Python", "yes", "yes", "True"],
        ["Mid", "Python", "no", "yes", "True"],
        ["Mid", "Java", "yes", "no", "True"],
        ["Junior", "Python", "no", "yes", "False"],
    ]

    # Bramer degrees dataset
    degrees_header = ["SoftEng", "ARIN", "HCI", "CSA", "Project", "Class"]
    degrees_rows = [
        ["A", "B", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "A", "FIRST"],
        ["A", "A", "A", "B", "B", "SECOND"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "A", "B", "B", "A", "FIRST"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
        ["A", "A", "A", "A", "A", "FIRST"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["B", "A", "A", "B", "B", "SECOND"],
        ["A", "B", "B", "A", "B", "SECOND"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["A", "A", "B", "A", "B", "FIRST"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["A", "A", "B", "B", "B", "SECOND"],
        ["B", "B", "B", "B", "B", "SECOND"],
        ["A", "A", "B", "A", "A", "FIRST"],
        ["B", "B", "B", "A", "A", "SECOND"],
        ["B", "B", "A", "A", "B", "SECOND"],
        ["B", "B", "B", "B", "A", "SECOND"],
        ["B", "A", "B", "A", "B", "SECOND"],
        ["A", "B", "B", "B", "A", "FIRST"],
        ["A", "B", "A", "B", "B", "SECOND"],
        ["B", "A", "B", "B", "B", "SECOND"],
        ["A", "B", "B", "B", "B", "SECOND"],
    ]

    def _fit_fresh(rows, header):
        # Work on a row-by-row copy so that every fit starts from a pristine
        # table, whether or not myutils.drop_column edits its input in place.
        table = [list(row) for row in rows]
        label_name = header[-1]
        clf = MyDecisionTreeClassifier()
        labels = myutils.get_column(table, header, label_name)
        features = myutils.drop_column(table, header, label_name)
        clf.fit(features, labels)
        return clf

    # (title, rows, header, pass the real column names to the rule printer?)
    scenarios = [
        ("Interview Tree Rules:", interview_rows, interview_header, True),
        ("Degrees Tree Rules:", degrees_rows, degrees_header, True),
        ("Interview Tree Rules with generic names:", interview_rows, interview_header, False),
        ("Degrees Tree Rules with generic names:", degrees_rows, degrees_header, False),
    ]

    for title, rows, header, use_real_names in scenarios:
        clf = _fit_fresh(rows, header)
        print(title)
        if use_real_names:
            # Attribute names are every column but the last; the last is the class name.
            clf.print_decision_rules(header[:-1], header[-1])
        else:
            clf.print_decision_rules()
        print()
    
# Run every decision tree test defined in this cell, in definition order.
for _run_test in (
    test_decision_tree_classifier_fit,
    test_decision_tree_classifier_predict,
    test_decision_tree_classifier_print_rules,
):
    _run_test()

Interview Tree Rules:
IF level == Junior AND phd == no THEN interviewed_well = True
IF level == Junior AND phd == yes THEN interviewed_well = False
IF level == Mid THEN interviewed_well = True
IF level == Senior AND tweets == no THEN interviewed_well = False
IF level == Senior AND tweets == yes THEN interviewed_well = True

Degrees Tree Rules:
IF SoftEng == A AND Project == A THEN Class = FIRST
IF SoftEng == A AND Project == B AND CSA == A AND ARIN == A THEN Class = FIRST
IF SoftEng == A AND Project == B AND CSA == A AND ARIN == B THEN Class = SECOND
IF SoftEng == A AND Project == B AND CSA == B THEN Class = SECOND
IF SoftEng == B THEN Class = SECOND

Interview Tree Rules with generic names:
IF att0 == Junior AND att3 == no THEN class = True
IF att0 == Junior AND att3 == yes THEN class = False
IF att0 == Mid THEN class = True
IF att0 == Senior AND att2 == no THEN class = False
IF att0 == Senior AND att2 == yes THEN class = True

Degrees Tree Rules with generic names:
IF att0 == A AND a