In [None]:
import math
import multiprocessing
import time as tm

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
'''
Decision tree
'''
class decision_tree_regressor:
    """Regression tree grown by greedy reduction of the summed squared error.

    A fitted tree is kept in ``self.tree`` as nested dicts with the keys
    ``"best_fec"`` (positional column index), ``"best_val"`` (threshold),
    ``"left"`` (rows whose feature is <= threshold) and ``"right"`` (rows
    whose feature is > threshold). A leaf is a plain float: the mean label of
    the training rows that reached it.
    """

    def __init__(self, max_fec_num, max_depth):
        """Store the hyper-parameters.

        Args:
            max_fec_num: number of randomly chosen feature columns examined
                when searching for a split.
            max_depth: depth at which growth stops; a node at this depth
                becomes a leaf.
        """
        self.max_fec_num = max_fec_num
        self.max_depth = max_depth
        # Populated by build(); None means "not fitted yet".
        self.tree = None

    def cal_variance(self, label):
        """Return variance(label) * number of rows, i.e. the summed squared error.

        Accepts a DataFrame, Series or array. An empty input yields 0.0
        instead of NaN.
        """
        # Convert first: np.var on a DataFrame returns a Series or a scalar
        # depending on the pandas version.
        values = np.asarray(label, dtype=float)
        if values.size == 0:
            return 0.0
        return float(np.var(values) * values.shape[0])

    def cal_mean(self, label):
        """Return the mean of all label values as a Python float."""
        return float(np.mean(np.asarray(label, dtype=float)))

    def split_dataset(self, data, label, index, value):
        """Partition the rows on the feature at column position ``index``.

        Rows with ``feature <= value`` go left and rows with
        ``feature > value`` go right, so rows equal to the threshold are kept.

        Returns:
            (left_data, left_label, right_data, right_label)
        """
        column = data.iloc[:, index].to_numpy()
        left = np.nonzero(column <= value)[0]
        right = np.nonzero(column > value)[0]
        return data.iloc[left], label.iloc[left], data.iloc[right], label.iloc[right]

    def select_best_fec(self, data, label):
        """Search a random subset of features for the best split.

        At most ``max_fec_num`` distinct columns are sampled. Every distinct
        value of each sampled column is tried as a threshold.

        Returns:
            ``(column_index, threshold)`` for the split with the lowest summed
            squared error, or ``(None, mean_label)`` when no split separates
            the rows or the improvement is below 1e-7.
        """
        fec_num = data.shape[1]
        targets = np.asarray(label, dtype=float)
        total_error = self.cal_variance(targets)

        best_index, best_value = None, None
        best_error = float('inf')

        # Sample without replacement so no column is evaluated twice.
        n_candidates = max(0, min(self.max_fec_num, fec_num))
        if n_candidates:
            candidates = np.random.choice(fec_num, size=n_candidates, replace=False)
        else:
            candidates = []

        for index in candidates:
            column = data.iloc[:, int(index)].to_numpy()
            for value in np.unique(column):
                goes_left = column <= value
                goes_right = column > value
                # A split that leaves one side empty separates nothing.
                if not goes_left.any() or not goes_right.any():
                    continue
                error = (self.cal_variance(targets[goes_left])
                         + self.cal_variance(targets[goes_right]))
                if error < best_error:
                    best_error = error
                    best_index = int(index)
                    best_value = value

        if best_index is None or total_error - best_error < 0.0000001:
            return None, self.cal_mean(targets)
        return best_index, best_value

    def build(self, data, label):
        """Fit the tree on ``data`` / ``label`` and return it.

        Args:
            data: pandas DataFrame of features.
            label: pandas DataFrame or Series of targets, row-aligned with
                ``data``.

        Raises:
            ValueError: if ``data`` has no rows.
        """
        if data.shape[0] == 0:
            raise ValueError("cannot build a decision tree from an empty dataset")
        self.tree = self.__build_tree(data, label, 0)
        return self.tree

    def __build_tree(self, data, label, depth):
        """Recursively grow the subtree for the given rows."""
        # Check the depth limit first so no split search is wasted on a node
        # that must become a leaf anyway.
        if depth >= self.max_depth:
            return self.cal_mean(label)
        best_fec_index, best_fec_value = self.select_best_fec(data, label)
        if best_fec_index is None:
            # No useful split: best_fec_value holds the mean label.
            return best_fec_value
        l_x, l_y, r_x, r_y = self.split_dataset(
            data, label, best_fec_index, best_fec_value)
        return {
            "best_fec": best_fec_index,
            "best_val": best_fec_value,
            "left": self.__build_tree(l_x, l_y, depth + 1),
            "right": self.__build_tree(r_x, r_y, depth + 1),
        }

    def predict(self, data):
        """Predict one value per row of ``data``.

        Args:
            data: DataFrame or 2-D array of samples; a 1-D input is treated
                as a single sample.

        Returns:
            A list of floats, or ``None`` if the tree has not been built.
        """
        if self.tree is None:
            return None
        # Iterate rows explicitly: iterating a DataFrame yields column labels.
        rows = np.atleast_2d(np.asarray(data))
        return [self.__predict(self.tree, row) for row in rows]

    def __predict(self, tree, x):
        """Walk from ``tree`` down to a leaf for sample ``x`` and return its value."""
        node = tree
        # Anything that is not a dict is a leaf value; this also covers a
        # tree whose root is a leaf.
        while isinstance(node, dict):
            # Same convention as split_dataset: <= threshold goes left.
            if x[node['best_fec']] <= node['best_val']:
                node = node['left']
            else:
                node = node['right']
        return node

In [None]:
class random_forest_regressor:
    """Ensemble of ``decision_tree_regressor`` trees whose predictions are averaged.

    Every tree is trained on the full dataset passed to the fit method; the
    trees differ only through the random choices made inside
    ``decision_tree_regressor``. Three equivalent training/prediction paths
    are offered: sequential (``fit`` / ``predict``), one process per tree
    (``mul_fit`` / ``mul_predict``) and a process pool (``pool_fit`` /
    ``pool_predict``).
    """

    def __init__(self, n_estimators=10, max_fec_num=10, max_depth=10):
        """Store the hyper-parameters.

        Args:
            n_estimators: number of trees in the forest.
            max_fec_num: forwarded to each ``decision_tree_regressor``.
            max_depth: forwarded to each ``decision_tree_regressor``.
        """
        self.n_estimators = n_estimators
        self.max_fec_num = max_fec_num
        self.max_depth = max_depth
        # Fitted decision_tree_regressor objects; empty until a fit method runs.
        self.trees = []

    # ------------------------------------------------------------------
    # Sequential implementation
    # ------------------------------------------------------------------
    def fit(self, data, label):
        """Train ``n_estimators`` trees one after another in this process."""
        self.trees = [self.fit_worker(data, label)
                      for _ in range(self.n_estimators)]

    def predict(self, data):
        """Return the mean prediction of all trees as a 1-D float array.

        Returns ``None`` when the forest has not been fitted.
        """
        if not self.trees:
            return None
        result = np.zeros(data.shape[0], dtype=float)
        for tree in self.trees:
            result += np.asarray(tree.predict(data), dtype=float)
        result /= len(self.trees)
        return result

    # ------------------------------------------------------------------
    # One process per tree
    # ------------------------------------------------------------------
    def fit_worker(self, data, label, q=None, seed=None):
        """Build a single tree and return the fitted regressor.

        Args:
            data, label: training data passed on to ``build``.
            q: optional queue; when given, the fitted regressor is also put
                on it (used by ``mul_fit``).
            seed: optional seed applied to NumPy's global RNG before
                building, so that parallel workers do not repeat each other.
        """
        if seed is not None:
            np.random.seed(seed)
        dec_tree = decision_tree_regressor(self.max_fec_num, self.max_depth)
        dec_tree.build(data, label)
        # Hand back the regressor object, not the raw dict from build():
        # prediction calls dec_tree.predict().
        if q is not None:
            q.put(dec_tree)
        return dec_tree

    def _worker_seeds(self):
        """Draw one distinct-looking seed per tree from the parent's RNG."""
        return [int(s) for s in
                np.random.randint(0, 2 ** 31 - 1, size=self.n_estimators)]

    def mul_fit(self, data, label):
        """Train every tree in its own process."""
        q = multiprocessing.Queue()
        jobs = [
            multiprocessing.Process(
                target=self.fit_worker, args=(data, label, q, seed))
            for seed in self._worker_seeds()
        ]
        for p in jobs:
            p.start()
        # Drain the queue before joining: a child does not exit until the
        # data it queued has been consumed, so join-then-get can deadlock.
        self.trees = [q.get() for _ in jobs]
        for p in jobs:
            p.join()

    def predict_worker(self, tree, data, q=None):
        """Predict with one tree; return the result and optionally queue it."""
        res = tree.predict(data)
        if q is not None:
            q.put(res)
        return res

    def mul_predict(self, data):
        """Predict with one process per tree and average the results.

        Returns ``None`` when the forest has not been fitted.
        """
        if not self.trees:
            return None
        q = multiprocessing.Queue()
        jobs = [
            multiprocessing.Process(
                target=self.predict_worker, args=(tree, data, q))
            for tree in self.trees
        ]
        for p in jobs:
            p.start()
        # Arrival order does not matter because the results are averaged.
        results = [np.asarray(q.get(), dtype=float) for _ in jobs]
        for p in jobs:
            p.join()
        return np.mean(results, axis=0)

    # ------------------------------------------------------------------
    # Process pool
    # ------------------------------------------------------------------
    def pool_fit(self, data, label, processes=4):
        """Train the trees on a pool of ``processes`` worker processes."""
        with multiprocessing.Pool(processes=processes) as pool:
            jobs = [
                pool.apply_async(self.fit_worker, (data, label, None, seed))
                for seed in self._worker_seeds()
            ]
            # Collect inside the with-block: leaving it terminates the pool.
            self.trees = [j.get() for j in jobs]

    def pool_predict(self, data, processes=4):
        """Predict on a pool of ``processes`` workers and average the results.

        Returns ``None`` when the forest has not been fitted.
        """
        if not self.trees:
            return None
        with multiprocessing.Pool(processes=processes) as pool:
            jobs = [
                pool.apply_async(self.predict_worker, (tree, data))
                for tree in self.trees
            ]
            results = [np.asarray(j.get(), dtype=float) for j in jobs]
        return np.mean(results, axis=0)

In [None]:
# Training feature tables (the CSV files have no header row).
data_train_1, data_train_2, data_train_3, data_train_4, data_train_5 = (
    pd.read_csv(path, header=None)
    for path in (
        "./data/train1.csv",
        "./data/train2.csv",
        "./data/train3.csv",
        "./data/train4.csv",
        "./data/train5.csv",
    )
)

# Test feature tables.
(data_test_1, data_test_2, data_test_3,
 data_test_4, data_test_5, data_test_6) = (
    pd.read_csv(path, header=None)
    for path in (
        "./data/test1.csv",
        "./data/test2.csv",
        "./data/test3.csv",
        "./data/test4.csv",
        "./data/test5.csv",
        "./data/test6.csv",
    )
)

# Label tables, one per training file.
label_1, label_2, label_3, label_4, label_5 = (
    pd.read_csv(path, header=None)
    for path in (
        "./data/label1.csv",
        "./data/label2.csv",
        "./data/label3.csv",
        "./data/label4.csv",
        "./data/label5.csv",
    )
)

In [None]:
# Time the whole pipeline: data assembly, train/test split and training.
time_begin = tm.time()

# Stack the per-file tables into one feature frame and one label frame.
x = pd.concat([data_train_1, data_train_2, data_train_3, data_train_4, data_train_5], ignore_index=True)
y = pd.concat([label_1, label_2, label_3, label_4, label_5], ignore_index=True)

# Hold out 25% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Small smoke-test forest: 2 trees, 2 candidate features per split, depth 2.
random_forest = random_forest_regressor(2, 2, 2)
random_forest.fit(x_train, y_train)

# Stop the clock only after fitting so that "total time" includes training.
time_end = tm.time()

print('total time: ', time_end - time_begin, 's')

In [None]:
# random_forest = random_forest_regressor(8, 5, 8)
# random_forest.fit(x_train, y_train)

In [None]:
# result = random_forest.predict(x_test)

In [None]:
# from sklearn.metrics import r2_score

# score = r2_score(y_test, result)
# print("R2 Score: %.2f%%" % (score * 100.0))