In [10]:
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

class TreeNode():
    
    def __init__(self, model=None, C=None, left=None, right=None):
        self.model = model
        self.C = C
        self.left = left
        self.right = right

def trainLinear(linear, x, y):
    #使用sklearn库的最小二乘估计训练一个线性模型
    linear.fit(x, y)
    return linear

def binaryTrainSet(linear, x, y):
    #根据线性回归模型二分数据集
    #对样本x[i],其线性模型预测值若小于等于0,分到x0集合;若大于0,分到x1集合;相应的标签也划分的y0,y1集合
    x0 = []
    x1 = []
    y0 = []
    y1 = []
    p = linear.predict(x)
    for i in range(p.shape[0]):
        if p[i] <= 0:
            x0.append(x[i])
            y0.append(y[i])
        else:
            x1.append(x[i])
            y1.append(y[i])
    return np.array(x0), np.array(x1), np.array(y0), np.array(y1)

def score(linear, x, y):
    #计算线性模型linear的精度
    right = 0
    p = linear.predict(x)
    for i in range(p.shape[0]):
        if p[i]<=0 and y[i]==-1 or p[i]>0 and y[i]==1:
            right += 1
    return right / x.shape[0]
    
def treeGenerate(root, x, y, precision):
    #递归建造决策树
    root.model = LinearRegression()
    root.model = trainLinear(root.model, x, y)
    x0, x1, y0, y1 = binaryTrainSet(root.model, x, y)
    
    #构建当前结点左分支
    if len(x0)==0 or score(root.model, x0, y0)>= precision:
        #左分支训练集为空或当前结点的线性模型对左分支的训练样本精度达到了阈值要求(precision),将左分支构建为叶子节点
        root.left = TreeNode(C=-1)
    else:
        #左分支结点精度不够要求,还需进行划分
        root.left = TreeNode()
        treeGenerate(root.left, x0, y0, precision)
    
    #构建当前结点右分支
    if len(x1)==0 or score(root.model, x1, y1) >= precision:
        root.right = TreeNode(C=1)
    else:
        root.right = TreeNode()
        treeGenerate(root.right, x1, y1, precision)

def predict(root, xs):
    #使用以root为根结点的决策树预测样本s
    if root.C is not None:
        #root为叶子结点
        return root.C
    else:
        if root.model.predict(np.expand_dims(xs, axis=0)) <= 0:
            return predict(root.left, xs)
        else:
            return predict(root.right, xs)

def evaluate(root, x, y):
    #计算以root为根结点的决策树在数据集x上的精度
    right = 0
    for i in range(x.shape[0]):
        if predict(root, x[i]) == y[i]:
            right += 1
    return right / x.shape[0]

if __name__ == '__main__':
    #加载乳腺癌数据集
    cancer = load_breast_cancer()

    #参数random_state是指随机生成器,测试集占全部数据的33%
    X_train, X_test, y_train, y_test = train_test_split(cancer['data'],cancer['target'], test_size=0.33, random_state=42)
    
    #将y_train与y_test标签中的0全部改为-1
    y_train[y_train == 0] = -1
    y_test[y_test == 0] = -1

    #数据标准化
    X_train = preprocessing.scale(X_train)
    X_test = preprocessing.scale(X_test)
    
    #构建决策树
    root = TreeNode()
    #此处的阈值不能设的太大,由于数据本身就有一定客观存在的误差,无法做到100%精度,阈值设的太大容易爆栈
    treeGenerate(root, X_train, y_train, 0.96)
    
    #计算训练好的决策树在测试集上的精度
    scoreTest = evaluate(root, X_test, y_test)
    print('测试集精度为:', round(scoreTest, 4))


测试集精度为: 0.9628


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


class MultivariateDecisionTree:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y, depth=0)

    def _grow_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # 停止条件
        if depth == self.max_depth or n_labels == 1:
            return np.bincount(y).argmax()

        best_gain = -1
        best_split = None
        for _ in range(10):  # 随机尝试一些线性组合
            weights = np.random.randn(n_features)
            thresholds = np.linspace(np.min(np.dot(X, weights)), np.max(np.dot(X, weights)), 10)
            for threshold in thresholds:
                left_indices = np.dot(X, weights) < threshold
                right_indices = ~left_indices
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue
                gain = self._information_gain(y, y[left_indices], y[right_indices])
                if gain > best_gain:
                    best_gain = gain
                    best_split = (weights, threshold)

        if best_gain == -1:
            return np.bincount(y).argmax()

        weights, threshold = best_split
        left_indices = np.dot(X, weights) < threshold
        right_indices = ~left_indices
        left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return (weights, threshold, left_subtree, right_subtree)

    def _information_gain(self, parent, left, right):
        p = len(left) / len(parent)
        return self._gini_impurity(parent) - p * self._gini_impurity(left) - (1 - p) * self._gini_impurity(right)

    def _gini_impurity(self, y):
        classes, counts = np.unique(y, return_counts=True)
        impurity = 1
        for count in counts:
            probability = count / len(y)
            impurity -= probability ** 2
        return impurity

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if isinstance(node, (int, np.integer)):
            return node
        weights, threshold, left_subtree, right_subtree = node
        if np.dot(x, weights) < threshold:
            return self._traverse_tree(x, left_subtree)
        else:
            return self._traverse_tree(x, right_subtree)
    
    def print_tree(self):
        print(self.tree)




In [None]:
# 生成一个具有特征交互的数据集
def generate_complex_dataset(n_samples=1000, n_features=20):
    X = np.random.randn(n_samples, n_features)
    # 定义更复杂的规则，涉及多个特征的非线性组合
    y = ((X[:, 0] * X[:, 1] + X[:, 2] * X[:, 3]) * np.cos(X[:, 4]) + np.sin(X[:, 5]) * X[:, 6]) > 0
    y = y.astype(int)
    return X, y


# 生成数据集
X, y = generate_complex_dataset()

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 传统决策树模型
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train, y_train)
single_tree_pred = single_tree.predict(X_test)
single_tree_accuracy = accuracy_score(y_test, single_tree_pred)

# 多变量决策树模型
multi_tree = MultivariateDecisionTree(max_depth=5)
multi_tree.fit(X_train, y_train)
multi_tree_pred = multi_tree.predict(X_test)
multi_tree_accuracy = accuracy_score(y_test, multi_tree_pred)

# 输出结果
print(f"传统决策树的准确率: {single_tree_accuracy:.4f}")
print(f"多变量决策树的准确率: {multi_tree_accuracy:.4f}")

## 运行结果：
'''
传统决策树的准确率: 0.5000
多变量决策树的准确率: 0.5950
'''


(array([-0.37911062, -0.77495183, -1.07286549,  0.01130297,  0.14936678,
       -0.65324754,  0.7258939 ,  1.13466387, -0.02266037, -0.52108589,
        0.25562822, -0.60208735,  0.35258349, -1.01532521,  0.2290356 ,
        0.89588802, -0.27231618,  0.67432161,  0.58818625,  0.16331979]), 1.6275244034753538, (array([-0.50819058, -0.3447874 , -1.53204579, -0.77332478,  0.31573106,
       -1.55046424, -2.10466126,  0.22040419,  0.4611451 ,  0.70578684,
        0.39784242,  0.19074872, -0.50738606,  0.69407804,  1.8388488 ,
       -1.6335679 , -0.28812144,  0.30737951,  0.91730457,  0.16929404]), -8.60940958304003, (array([-8.96482643e-01, -4.41627874e-01, -5.92344829e-01,  8.48202614e-01,
       -1.89799724e+00, -1.92212562e-02, -9.82183406e-01, -1.06721240e+00,
        1.07252818e-02, -1.76915798e+00,  3.42032425e+00,  7.47168782e-01,
        7.44664302e-02, -5.59277617e-02,  1.55860235e+00,  4.94320703e-04,
        4.93930890e-01, -1.70877543e+00, -9.51122466e-01,  8.73370291e-01]), 5

'\n传统决策树的准确率: 0.5000\n多变量决策树的准确率: 0.5950\n'