### Customized Decision Tree

In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv('love_example.csv', delimiter=',')
df = df.iloc[:, 1:]
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
print(X.shape, y.shape)

(14, 3) (14,)


In [7]:
X_train = X.iloc[:-1, :]
X_test = X.iloc[:5, :]
y_train = y.iloc[:-1]
y_test = y.iloc[:5]

print('train_set:', X_train.shape, y_train.shape)
print('test_set:', X_test.shape,  y_test)

train_set: (13, 3) (13,)
test_set: (5, 3) 0     No
1     No
2    Yes
3    Yes
4    Yes
Name: LoveAI, dtype: object


In [8]:
def compute_gini_label(data):
    data = np.asarray(data)
    # xử lý data rỗng
    if len(data) == 0:
        return 0.0
    # đếm số lần xuất hiện của các phần tử trong 1 dãy
    count = np.unique(data, return_counts=True)
    prob = count[1]/len(data)
    gini = 1 - np.sum(prob**2)
    return gini

def compute_gini_feature(X_train, y_train):
    gini_feature = []
    for name in X_train.columns:
        total_count = len(X_train[name])
        total_gini = 0
        count = []
        gini = []
        # dạng string
        if X_train[name].dtype == 'object' or X_train[name].dtype.name == 'category':
            label = set(X_train[name])
            for element in label:
                # Điều kiện lọc
                condition = (X_train[name] == element)
                # lọc output theo điều kiện của feature
                data = y_train.loc[condition]
                count.append(len(data))
                gini_data = compute_gini_label(data)
                gini.append(gini_data)

            for i in range(len(count)):
                total_gini += (count[i]/total_count)*gini[i]
            
            gini_feature.append(total_gini)
        
        else:
            gini_history = []
            # lấy index history sau khi sort. mục đích để sắp xếp lại các nhãn theo index này 
            ascending_i_list =  X_train[name].values.argsort()
            # tính giá trị chia nếu được chọn làm cha
            avg_threshold = []

            for threshold in range(1, len(X_train[name])):
                count = []
                gini = []
                total_gini = 0
                # sort cột dự đoán theo label (reset lại index)
                sorted_label_cols = y_train.loc[ascending_i_list].reset_index(drop=True)

                # chia theo ngưỡng thành data1 và data2
                data1 = sorted_label_cols[:threshold]
                count.append(len(data1))
                gini.append(compute_gini_label(data1))
                data2 = sorted_label_cols[threshold:]
                count.append(len(data2))
                gini.append(compute_gini_label(data2))

                val1 = X_train[name].iloc[ascending_i_list[threshold - 1]]
                val2 = X_train[name].iloc[ascending_i_list[threshold]]
                avg_thres = (val1 + val2) / 2
                avg_threshold.append(avg_thres)
                for i in range(len(count)):
                    total_gini += (count[i]/total_count)*gini[i]
                gini_history.append(total_gini)

            min_gini = min(gini_history)
            print(gini_history)
            best_threshold = gini_history.index(min_gini)
            best_avg_threshold = avg_threshold[best_threshold]
            gini_feature.append((min_gini,best_threshold, best_avg_threshold))

    return gini_feature


class Node:
    def __init__(self, is_leaf=False, prediction=None,feature=None, threshold=None, 
                 children=None, left=None, right=None):
        self.is_leaf = is_leaf
        self.prediction = prediction
        self.feature = feature
        self.threshold = threshold
        self.children = children  # dict for categorical splits
        self.left = left          # TreeNode for <= threshold
        self.right = right        # TreeNode for > threshold


class HuyIGW04_DecisionTreeClassifier:
    def __init__(self):
        self.root = None

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        self.root = self._build_tree(X_train, y_train)
        return self

    def _build_tree(self, X, y):
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        if len(y.unique()) == 1 or X.shape[1] == 0:
            return Node(is_leaf=True, prediction=y.mode()[0])

        gini_list = compute_gini_feature(X, y)

        best_idx, best_res = min(
            enumerate(gini_list),
            key=lambda item: item[1] if not isinstance(item[1], tuple) else item[1][0]
        )
        feat_name = X.columns[best_idx]
        res = best_res
        # Phân giải kết quả
        if isinstance(res, tuple):
            best_gini, thr_idx, best_thr = res
        else:
            best_gini, thr_idx, best_thr = res, None, None

        # Gini hiện tại của node
        current_gini = compute_gini_label(y)
        if best_gini is None or best_gini >= current_gini:
            return Node(is_leaf=True, prediction=y.mode()[0])

        node = Node(feature=feat_name, threshold=best_thr)

        if best_thr is None:
            node.children = {}
            for val in X[feat_name].unique():
                mask = X[feat_name] == val
                X_sub = X[mask].drop(columns=[feat_name]).reset_index(drop=True)
                y_sub = y[mask].reset_index(drop=True)
                node.children[val] = self._build_tree(X_sub, y_sub)
        else:
            asc_idx = X[feat_name].values.argsort()
            split_pt = thr_idx + 1
            left_idx = asc_idx[:split_pt]
            right_idx = asc_idx[split_pt:]

            X_left = X.iloc[left_idx].reset_index(drop=True)
            y_left = y.iloc[left_idx].reset_index(drop=True)
            X_right = X.iloc[right_idx].reset_index(drop=True)
            y_right = y.iloc[right_idx].reset_index(drop=True)

            node.left = self._build_tree(X_left, y_left)
            node.right = self._build_tree(X_right, y_right)

        return node

    def predict(self, X_test):
        if isinstance(X_test, pd.Series):
            return self._predict_row(self.root, X_test)
        preds = []
        for _, row in X_test.iterrows():
            preds.append(self._predict_row(self.root, row))
        return np.array(preds)

    def _predict_row(self, node, row):
        if node.is_leaf:
            return node.prediction
        val = row[node.feature]
        if node.threshold is None:
            child = node.children.get(val)
            if child is None:
                return max(node.children.values(), key=lambda n: n.prediction).prediction
            return self._predict_row(child, row.drop(labels=[node.feature]))
        else:
            if val <= node.threshold:
                return self._predict_row(node.left, row)
            else:
                return self._predict_row(node.right, row)



In [9]:
model = HuyIGW04_DecisionTreeClassifier()

In [10]:
model.fit(X_train, y_train)

[0.46153846153846156, 0.4965034965034966, 0.4871794871794872, 0.4572649572649572, 0.49230769230769234, 0.4688644688644689, 0.4945054945054946, 0.49230769230769234, 0.4957264957264957, 0.4871794871794872, 0.4195804195804197, 0.46153846153846156]
[0.38095238095238093, 0.37142857142857133, 0.40476190476190477, 0.40476190476190477, 0.34285714285714286, 0.38095238095238093]
[0.3333333333333333, 0.5, 0.3333333333333333]
[0.3333333333333333, 0.3333333333333333]
[0.0]
[0.26666666666666655, 0.25, 0.2222222222222222, 0.25, 0.26666666666666655]
[0.0]


<__main__.HuyIGW04_DecisionTreeClassifier at 0x1b03a90f8f0>

In [11]:
model.predict(pd.DataFrame([{'LoveMath': 'Yes', 'LoveArt': 'No', 'Age': 22}]))

array(['No'], dtype='<U2')

In [12]:
model.predict(pd.DataFrame([{'LoveMath': 'No', 'LoveArt': 'Yes', 'Age': 22}]))

array(['Yes'], dtype='<U3')

In [13]:
df

Unnamed: 0,LoveMath,LoveArt,Age,LoveAI
0,Yes,Yes,20,No
1,Yes,No,9,No
2,No,Yes,18,Yes
3,No,Yes,35,Yes
4,Yes,Yes,30,Yes
5,Yes,No,50,No
6,No,No,46,No
7,Yes,Yes,35,No
8,Yes,No,21,No
9,No,Yes,45,Yes


In [21]:
df.groupby('LoveMath')['LoveAI'].count()

LoveMath
No     5
Yes    9
Name: LoveAI, dtype: int64

In [None]:
df[df['LoveArt']== 'Yes'][['LoveArt', 'LoveAI']]

Unnamed: 0,LoveArt,LoveAI
0,Yes,No
1,Yes,Yes
2,Yes,Yes
3,Yes,Yes
4,Yes,No
5,Yes,Yes
6,Yes,Yes
