In [250]:
from random import choice, randint
import pandas as pd
import numpy as np

In [252]:
data = pd.read_csv("personality_dataset.csv")

In [254]:
print(data.head())

   Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0               4.0         No                      4.0            6.0   
1               9.0        Yes                      0.0            0.0   
2               9.0        Yes                      1.0            2.0   
3               0.0         No                      6.0            7.0   
4               3.0         No                      9.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                        No                 13.0             5.0   Extrovert  
1                       Yes                  0.0             3.0   Introvert  
2                       Yes                  5.0             2.0   Introvert  
3                        No                 14.0             8.0   Extrovert  
4                        No                  8.0             5.0   Extrovert  


In [256]:
data = data.replace({"Yes":1,"No":0,"Extrovert":1,"Introvert":0})

  data = data.replace({"Yes":1,"No":0,"Extrovert":1,"Introvert":0})


In [258]:
binary = ["Stage_fear","Drained_after_socializing"]
cont = ['Time_spent_Alone', 'Social_event_attendance',
       'Going_outside', 'Friends_circle_size',
       'Post_frequency', 'Personality']

In [260]:
data[cont] = data[cont].fillna(data.mean())
data[binary] = data[binary].fillna(0)

In [262]:
data2 = {
    "age": [21, 25, 32, 34, 47, 50, 51, 54, 57, 60, 62, 65, 67, 71, 74],
    "owns_house": [0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1],
    "married": [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1],
    "purchased": [0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1],
}

test2 = pd.DataFrame.from_dict(data2)

In [264]:
class Tree:
    def __init__(self, split: dict = None, data_left = None, data_right = None):
        self.node  = split
        self.left  = data_left  
        self.right = data_right 
    def __repr__(self):
        def print_tree(tree,ident = "\t"):
            if isinstance(tree,Tree):
                return f'''
                Node : {tree.node}\n{ident}
                Leaf 1 :\n{print_tree(tree.left,2*ident)}\n{ident}
                Leaf 2 :\n{print_tree(tree.right,2*ident)}'''
                
            if isinstance(tree, pd.DataFrame):
                if tree.empty: return ""
                return tree.to_string()
            if tree == None: return ""
            else: return str(tree)
                
        return f'''
            Node : {repr(self.node)}\n
            Leaf 1 :\n\t{print_tree(self.left)}\n
            Leaf 2 :\n\t{print_tree(self.right)}'''

In [544]:
class DecisionTree:
    def __init__(self, dataset : pd.DataFrame, columns : list[int] = None)-> None:
        self.data       = dataset if columns == None else dataset.iloc[:,columns+[-1]]
        self.pred_tree  = Tree()
        self.raw_tree   = Tree()
        self.split_info = {i:self.continuous_value(i) for i in self.data.columns[:-1]}
        self.acc        = 0
        
    def __repr__(self)-> str:
        return pd.DataFrame.to_string(self.data)
    def __str__(self)-> str :
        return pd.DataFrame.to_string(self.data)

    def _pred(self, _data : pd.DataFrame):
        pass
    
    def continuous_value(self, split,cont_cap : int = 5)-> bool:
        '''return whether data is has continuous value given continuity cap. using >'''
        values = self.data[split].unique()
        return len(values) > cont_cap
        
    def _raw_to_pred(self,tree : Tree) -> Tree:
        '''turns a tree with datasets in leafs into a tree with predictions in leafs.'''
        if isinstance(tree,Tree):
            return Tree(tree.node,self._raw_to_pred(tree.left),self._raw_to_pred(tree.right))
        elif isinstance(tree, pd.DataFrame):
            return self._pred(tree)

    def _metric_for_data(self, data):
        pass
    
    def choose_split(self,data : pd.DataFrame,used_splits = [],continuous : int = 5):
        '''
        chooses the best split for data.
        returns split, value to split by, left split, right split
        '''
        if data.empty:
            return None
        splits = data.columns[:-1]
        choose_from = []
        for split in splits:
            values = data[split].unique()
            split_by = self._choose_category(data,split,values,used_splits, continuous)
            if split_by == None:
                continue
            choose_from.append(split_by)
        if choose_from == []: return None
        best_split = min(choose_from, key= lambda x : x["metric"])
        return best_split

        
    def _choose_category(self,data : pd.DataFrame, split, values, used_splits = [],continuous : int = 5)-> dict:
        '''
        given split and list of distinct values in that split we choose which value 
        returns the best(smallest) gini index and we return that split with the value with its corresponding metric( MSE or gini index). Additionally 
        we return both splits(data = split, data != split), with continuous data splits are (data <= split, data > split).
        '''
        if self.continuous_value(split, continuous):
            sorted_values = np.sort(values)
            average_values = [(sorted_values[i]+sorted_values[i+1])/2 for i in range(len(values)-1)]
            splits = [(i,data[data[split] <= i],data[data[split] > i]) for i in average_values if (split,i) not in used_splits]
        else:
            splits = [(i,data[data[split] == i],data[data[split] != i]) for i in values if (split,i) not in used_splits]
            
        if splits == []: return None
        info = []
        for i,data_left,data_right in splits:
            metric_data_l = self._metric_for_data(data_left)
            metric_data_r = self._metric_for_data(data_right)
            total = metric_data_l[1] + metric_data_r[1]
            #gini for the split is weighted average of gini's for subtrees.
            metric_total = metric_data_l[0] * (metric_data_l[1]/total) + metric_data_r[0] * (metric_data_r[1]/total)
            info.append({"split" : split, "value" : i,"metric" : metric_total,"left": data_left,"right" :data_right})
        return min(info,key= lambda x : x["metric"])
    
    def create_tree(self, limit : int = 1,depth_limit : int = 5,continuous_cap : int = 2)-> None:
        '''created tree from the data, which is avalible under self.pred_tree and self.raw_tree.'''
        def create_rec(data : pd.DataFrame, depth : int = 0 ,used_splits : list = [] ):
            if data.empty: return None
            if depth > depth_limit or data.empty:
                return data
            elif len(data) <= limit:
                return data
            else:
                split_dict = self.choose_split(data,used_splits, continuous_cap)
                if split_dict == None: return None
                split = {"split" : split_dict["split"], "value" : split_dict["value"]}
                used_splits.append((split["split"],split["value"]))
                left = create_rec(split_dict["left"], depth + 1, used_splits)
                right = create_rec(split_dict["right"], depth + 1, used_splits)
                return Tree(split, left, right)
            
        self.raw_tree = create_rec(self.data)
        self.pred_tree = self._raw_to_pred(self.raw_tree)

    def pred_new(self,to_pred):
        def go_down_tree(tree : Tree):
            if isinstance(tree,Tree):
                split,value = tree.node["split"],tree.node["value"]
                cont = self.split_info[split]
                
                if cont:
                    if (to_pred[split] < value).iloc[0]:
                        return go_down_tree(tree.left)
                    else:
                        return go_down_tree(tree.right)
                else:
                    if (to_pred[split] == value).iloc[0]:
                        return go_down_tree(tree.left)
                    else:
                        return go_down_tree(tree.right)
            else:
                return tree
        return go_down_tree(self.pred_tree)

    def tree_acc(self,metric)-> float:
        def helper(tree_preds : Tree, tree_raw : Tree):
            if isinstance(tree_preds,Tree):
                return helper(tree_preds.left,tree_raw.left) + helper(tree_preds.right,tree_raw.right)
            else:
                if isinstance(tree_raw, pd.DataFrame):
                    if tree_raw.empty: return 0
                    return metric(tree_raw.iloc[:,-1:],tree_preds).sum()
                return 0
        return helper(self.pred_tree,self.raw_tree)/len(self.data)

In [546]:
class ClassificationTree(DecisionTree):
    def __init__(self, dataset : pd.DataFrame, columns : list[int] = None)-> None:
        super().__init__(dataset, columns)
        
    def _pred(self,data : pd.DataFrame):
        '''chooses which value appears the most in a given leaf.'''
        return data.iloc[:,-1:].value_counts().idxmax()[0]
    
    def _gini_index(self, proportions : list[int]) -> int:
        '''returns gini index of the proportions.'''
        return 1 - sum(i**2 for i in proportions)
        
    def _metric_for_data(self, data : pd.DataFrame)->tuple:
        '''returns gini index for the split, with the amount of elements in data for the weighted average. '''
        target = data.iloc[:,-1]
        dist_value_counts = target.value_counts()
        split_count = dist_value_counts.sum()
        proportions = [i/split_count for i in dist_value_counts]
        gini = self._gini_index(proportions)
        return gini,split_count
    
    def tree_acc(self):
        def metric(data,preds):
            return data == preds 
        return super().tree_acc(metric)

In [548]:
class RegressionTree(DecisionTree):
    def __init__(self, dataset : pd.DataFrame, columns : list[int] = None)-> None:
        super().__init__(dataset, columns)
        
    def _pred(self,data : pd.DataFrame):
        '''return the average for the given leaf.'''
        return data.iloc[:,-1:].sum()/len(data)
    
    def _mse(self, values : list[int]) -> int:
        '''returns mean squared error .'''
        return sum(i**2 for i in values)/len(values)
        
    def _metric_for_data(self, data : pd.DataFrame)->tuple:
        '''returns mse for the split, with the amount of elements in data for the weighted average. '''
        target = data.iloc[:,-1]
        mean = target.mean()
        mse = self._mse(target - mean)
        return mse,len(data)
    
    def tree_acc(self):
        def metric(data,pred):
            return self._mse(data - pred)
        return super().tree_acc(metric)

In [550]:
personality_tree = ClassificationTree(data)

In [612]:
personality_tree.create_tree(depth_limit = 3)

In [618]:
personality_tree.pred_tree


            Node : {'split': 'Stage_fear', 'value': 0.0}

            Leaf 1 :
	
                Node : {'split': 'Post_frequency', 'value': 2.5}
	
                Leaf 1 :

                Node : {'split': 'Time_spent_Alone', 'value': 4.25290800140994}
		
                Leaf 1 :

                Node : {'split': 'Social_event_attendance', 'value': 1.0}
				
                Leaf 1 :
0
				
                Leaf 2 :
0
		
                Leaf 2 :

                Node : {'split': 'Time_spent_Alone', 'value': 4.75290800140994}
				
                Leaf 1 :
0
				
                Leaf 2 :
0
	
                Leaf 2 :

                Node : {'split': 'Friends_circle_size', 'value': 7.5}
		
                Leaf 1 :

                Node : {'split': 'Social_event_attendance', 'value': 4.5}
				
                Leaf 1 :
1
				
                Leaf 2 :
1
		
                Leaf 2 :

                Node : {'split': 'Going_outside', 'value': 4.5}
				
                Leaf 1 :
1
				
          

In [632]:
from graphviz import Digraph
def visualize_tree(tree, graph=None, parent=None, edge_label=""):
    if graph is None:
        graph = Digraph()
    
    node_id = str(id(tree))

    if isinstance(tree, dict):
        # Jest to node (słownik): np. {"X > 5": [lewo, prawo]}
        condition = next(iter(tree))
        graph.node(node_id, label=condition)

        left, right = tree[condition]

        # Rekurencja w lewo
        left_id = visualize_tree(left, graph, node_id, "Yes")
        graph.edge(node_id, left_id, label="Yes")

        # Rekurencja w prawo
        right_id = visualize_tree(right, graph, node_id, "No")
        graph.edge(node_id, right_id, label="No")
        
    else:
        # Liść drzewa (wartość 0 lub 1)
        graph.node(node_id, label=str(tree), shape="box")

    return node_id if parent is not None else graph

In [634]:
tree = {
    "X > 5": [
        {"Y < 3": [
            0,
            1
        ]},
        1
    ]
}
visualize_tree(tree)

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.graphs.Digraph at 0x2965dfc58e0>

In [616]:
personality_tree.tree_acc()

Personality    0.934483
dtype: float64

In [394]:
# tree2 = ClassificationTree(test2)

In [396]:
# tree2.create_tree(depth_limit= 2,limit = 1)

In [398]:
# print(tree2.raw_tree)
# print(tree2.pred_tree)

In [400]:
new_val = {"age" : [32], 
           "owns_house" : [1],
          "married" : [0],
          }
to_guess = pd.DataFrame.from_dict(new_val)
# print(to_guess)
# print(test2)

In [402]:
# tree2.pred_new(to_guess)

In [404]:
# to_guess["purchased"] = tree2.pred_new(to_guess)

In [406]:
# print(tree2.raw_tree)

In [408]:
# print(tree2.tree_acc())