In [5]:
import numpy as np
from numba import jit
import time

In [33]:
class Tree:
    """Regression tree fitted on residuals and current probabilities.

    Internal nodes are dicts with the keys ``'split_feature_idx'``,
    ``'split_value'``, ``'left_child'`` and ``'right_child'``; leaves are plain
    numbers equal to ``sum(residual) / (sum(p * (1 - p)) + min_impurity)``.

    Parameters
    ----------
    max_depth : int
        Recursion stops once ``depth >= max_depth``.
    min_samples : int
        A node holding ``min_samples`` rows or fewer becomes a leaf, and a
        candidate split is rejected when either child would receive fewer
        than ``min_samples`` rows.
    min_child_weight : float
        Minimum cover ``sum(p * (1 - p))`` required in each child.
    min_impurity : float
        Constant added to the denominator of the similarity score and of the
        leaf output. With ``0`` a node whose cover is zero divides by zero.
    gamma : float
        The best split is kept only when ``gain - gamma >= 0``.

    Attributes
    ----------
    tree : dict or number
        Root of the fitted tree (``{}`` before ``fit``).
    fbs_time : float
        Seconds accumulated inside ``find_best_split`` while building.
    """

    def __init__(self, max_depth = 3, min_samples = 1, min_child_weight = 1, min_impurity = 0, gamma = 0):
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.min_child_weight = min_child_weight
        self.min_impurity = min_impurity
        self.gamma = gamma
        self.tree = {}
        self.fbs_time = 0

    def split_data(self, X, feature_idx, split_value):
        """Return boolean row masks ``(left, right)`` for ``X[:, feature_idx] <= split_value``.

        Both masks have one entry per row of ``X``. Rows whose feature value
        compares False both ways (NaN) end up in neither mask.
        """
        left_idx = X[:, feature_idx] <= split_value
        right_idx = X[:, feature_idx] > split_value
        return left_idx, right_idx

    def similarity(self, residual, probs):
        """Similarity score ``sum(residual)**2 / (sum(p * (1 - p)) + min_impurity)``."""
        numerator = np.sum(residual) ** 2
        denominator = np.sum(probs * (1 - probs)) + self.min_impurity
        return numerator / denominator

    def cover(self, probs):
        """Cover of a node: ``sum(p * (1 - p))``."""
        return np.sum(probs * (1 - probs))

    def find_best_split(self, X, residual, probs):
        """Search every feature for the threshold with the highest gain.

        Candidate thresholds are the midpoints between consecutive distinct
        values of each feature. A candidate is skipped when a child would
        have fewer than ``min_samples`` rows or a cover below
        ``min_child_weight``.

        Returns
        -------
        tuple
            ``(feature_idx, split_value)`` of the best split, or
            ``(None, None)`` when no candidate is admissible or the best gain
            is smaller than ``gamma``.
        """
        best_gain = -np.inf
        best_split_feature_idx = None
        best_split_value = None

        # The parent's score is the same for every candidate, so compute it once.
        parent_similarity = self.similarity(residual, probs)

        for feature_idx in range(X.shape[1]):
            unique = np.unique(X[:, feature_idx])
            if len(unique) < 2:
                # A constant (or empty) feature offers no threshold to try.
                continue
            split_values = (unique[:-1] + unique[1:]) / 2

            for value in split_values:
                left_idx, right_idx = self.split_data(X, feature_idx, value)

                # The masks always have len(X) entries; the child sizes are the
                # number of True entries, not the mask length.
                if (np.count_nonzero(left_idx) < self.min_samples
                        or np.count_nonzero(right_idx) < self.min_samples):
                    continue

                p_left = probs[left_idx]
                p_right = probs[right_idx]
                if (self.cover(p_left) < self.min_child_weight
                        or self.cover(p_right) < self.min_child_weight):
                    continue

                gain = (self.similarity(residual[left_idx], p_left)
                        + self.similarity(residual[right_idx], p_right)
                        - parent_similarity)

                if gain > best_gain:
                    best_gain = gain
                    best_split_feature_idx = feature_idx
                    best_split_value = value

        # Pruning: a split has to pay for itself by at least gamma.
        if best_gain - self.gamma < 0:
            best_split_feature_idx = None
            best_split_value = None

        return best_split_feature_idx, best_split_value

    def compute_output(self, residual, probs):
        """Leaf value ``sum(residual) / (sum(p * (1 - p)) + min_impurity)``."""
        numerator = np.sum(residual)
        denominator = np.sum(probs * (1 - probs)) + self.min_impurity
        return numerator / denominator

    def build_tree(self, X, residual, probs, depth):
        """Recursively build the subtree for the rows in ``X``.

        Returns a leaf value when the depth or size limit is reached or no
        admissible split exists, otherwise a node dict. Time spent in
        ``find_best_split`` is added to ``self.fbs_time``.
        """
        if depth >= self.max_depth or len(X) <= self.min_samples:
            return self.compute_output(residual, probs)

        start = time.perf_counter()
        split_feature_idx, split_value = self.find_best_split(X, residual, probs)
        self.fbs_time += time.perf_counter() - start

        if split_feature_idx is None:
            return self.compute_output(residual, probs)

        left_idx, right_idx = self.split_data(X, split_feature_idx, split_value)
        left_child = self.build_tree(X[left_idx], residual[left_idx], probs[left_idx], depth + 1)
        right_child = self.build_tree(X[right_idx], residual[right_idx], probs[right_idx], depth + 1)

        # Return a local node; only fit() decides what becomes self.tree.
        return {
            'split_feature_idx': split_feature_idx,
            'split_value': split_value,
            'left_child': left_child,
            'right_child': right_child
        }

    def get_output(self, x, tree):
        """Walk ``tree`` for the single row ``x`` and return the leaf value reached."""
        if isinstance(tree, dict):
            split_feature_idx = tree['split_feature_idx']
            split_value = tree['split_value']
            if x[split_feature_idx] <= split_value:
                return self.get_output(x, tree['left_child'])
            else:
                return self.get_output(x, tree['right_child'])
        else:
            return tree

    def fit(self, X, residual, probs):
        """Build the tree from scratch and store its root in ``self.tree``."""
        depth = 0
        self.tree = self.build_tree(X, residual, probs, depth)

    def predict(self, X):
        """Return an array holding the leaf value reached by every row of ``X``."""
        return np.array([self.get_output(x, self.tree) for x in X])

In [34]:
class XGBoost:
    """Gradient-boosted binary classifier built from ``Tree`` weak learners.

    Every boosting round fits a ``Tree`` to the residuals ``y - p`` and moves
    the log-odds of each sample by ``learning_rate * tree.predict(X)``.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (trees).
    learning_rate : float
        Factor applied to each tree's output before it is added to the log-odds.
    min_impurity, gamma, min_child_weight, max_depth
        Forwarded unchanged to every ``Tree``.

    Attributes
    ----------
    models : list
        Fitted trees in boosting order.
    fbs_time, logodds_time, residual_time, logodds_predict_time, pred_time : float
        Seconds accumulated in the corresponding training stage during ``fit``.
    """

    # Probabilities are rounded to 14 decimals after each round, so they can hit
    # exactly 0 or 1; clipping keeps the logit finite without touching any value
    # that is still strictly inside (0, 1) after that rounding.
    _PROB_EPS = 1e-15
    # np.exp overflows float64 slightly above 709; beyond +/-700 the sigmoid is
    # already exactly 0 or 1 in float64.
    _MAX_LOGIT = 700.0

    def __init__(self, n_estimators, learning_rate, min_impurity = 1e-7, gamma = 0, min_child_weight = 1, max_depth = 3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.initial_prediction = 0.5
        self.min_impurity = min_impurity
        self.min_child_weight = min_child_weight
        self.max_depth = max_depth
        self.gamma = gamma
        self.models = []
        self.fbs_time = 0
        self.logodds_time = 0
        self.residual_time = 0
        self.logodds_predict_time = 0
        self.pred_time = 0

    def compute_logodds(self, p):
        """Return ``log(p / (1 - p))`` as float64.

        ``p`` is clipped to ``[_PROB_EPS, 1 - _PROB_EPS]`` first so that a
        probability of exactly 0 or 1 maps to a large finite value, not +/-inf.
        """
        p = np.clip(p, self._PROB_EPS, 1 - self._PROB_EPS)
        return np.log(p / (1 - p)).astype(np.float64)

    def residual(self, y_true, y_pred):
        """Return ``y_true - y_pred`` as float64."""
        return (y_true - y_pred).astype(np.float64)

    def _sigmoid(self, log_odds):
        """Map log-odds to probabilities rounded to 14 decimals.

        The argument is clipped before ``np.exp`` so the ratio can never
        become inf / inf (NaN).
        """
        z = np.exp(np.clip(log_odds, -self._MAX_LOGIT, self._MAX_LOGIT))
        return np.around(z / (1 + z), decimals = 14)

    def fit(self, X, y):
        """Train ``n_estimators`` trees on features ``X`` and 0/1 labels ``y``.

        Any previously fitted trees and accumulated timings are discarded so
        that calling ``fit`` again starts from a clean ensemble.
        """
        self.models = []
        self.fbs_time = 0
        self.logodds_time = 0
        self.residual_time = 0
        self.logodds_predict_time = 0
        self.pred_time = 0

        data = X
        predictions = np.full(len(y), self.initial_prediction, dtype = np.float64)

        for _ in range(self.n_estimators):
            start = time.perf_counter()
            residual = self.residual(y, predictions)
            self.residual_time += time.perf_counter() - start

            model = Tree(min_impurity = self.min_impurity, gamma = self.gamma,
                         max_depth = self.max_depth, min_child_weight = self.min_child_weight)
            model.fit(data, residual, predictions)
            self.fbs_time += model.fbs_time

            start = time.perf_counter()
            log_odds = self.compute_logodds(predictions)
            self.logodds_time += time.perf_counter() - start

            start = time.perf_counter()
            temp = log_odds + self.learning_rate * model.predict(data)
            self.logodds_predict_time += time.perf_counter() - start

            start = time.perf_counter()
            predictions = self._sigmoid(temp)
            self.pred_time += time.perf_counter() - start

            self.models.append(model)

    def predict_proba(self, X):
        """Return the positive-class probability for every row of ``X``.

        Replays the boosting updates performed in ``fit``: start from
        ``initial_prediction`` and apply each stored tree in order.
        """
        predictions = np.full(len(X), self.initial_prediction)
        for model in self.models:
            temp = self.compute_logodds(predictions) + self.learning_rate * model.predict(X)
            predictions = self._sigmoid(temp)

        return predictions

In [8]:
# Load the train/test split from the .npz archive. The context manager closes
# the underlying file once the arrays have been read into memory.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; only use it on archives from a trusted source, and
# drop it if the stored arrays are plain numeric.
with np.load('test_2label.npz', allow_pickle=True) as data:
    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']

In [30]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
# Baseline: fit the library's XGBClassifier on the training split and report
# its accuracy on the test split (printed label: "built-in library").
model = xgb.XGBClassifier(n_estimators = 100, learning_rate = 0.3, gamma = 0, max_depth = 3, min_child_weight = 1)
model.fit(X_train, y_train)
y_pred2 = model.predict(X_test)
print("Thư viện có sẵn : " + str(accuracy_score(y_test, y_pred2)))

Thư viện có sẵn : 0.78


In [36]:
# Fit the from-scratch XGBoost class and report its test accuracy
# (printed label: "self-built"). `start` / `end` bracket only the fit call
# and are read again by the timing-report cell.
xgb_model = XGBoost(n_estimators = 100, learning_rate = 0.3, min_impurity = 0, gamma = 0)
start = time.time()
xgb_model.fit(X_train, y_train)
end = time.time()
pred = xgb_model.predict_proba(X_test)
# Turn probabilities into hard 0/1 labels with a 0.5 threshold.
y_pred1 = (pred > 0.5).astype(int)
print("Tự build : " + str(accuracy_score(y_test, y_pred1)))

Tự build : 0.7875


In [31]:
# Timing summary. Reads `start`, `end` and `xgb_model` left behind by the
# training cell, so that cell must have been run first in the same kernel.
print(f'Total training time: {end - start} seconds')
print(f'Find best split: {xgb_model.fbs_time} seconds')
# "Tính residual" = "compute residual".
print(f'Tính residual: {xgb_model.residual_time} seconds')

Total training time: 113.34811019897461 seconds
Find best split: 112.6448016166687 seconds
Tính residual: 0.0018894672393798828 seconds


In [None]:
''' Đọc data
train = np.load('<tên file>.npz',allow_pickle=True)
X_train = train['data']
y_train = train['label']

test = np.load('<tên file>.npz',allow_pickle=True)
X_test = test['data']
y_test = test['label']
'''

In [None]:
''' Mã giả của MultiClassifier
class Multi:
    __init__():
        self.models = []
        self.time = 0

    fit():
        for y_i in labels:
            y_2labels = (y == y_i).astype(int)
            model = XGBoost(...)
            model.fit(X, y_2labels)
            self.models.append(model)
            Tính thời gian + lưu thời gian

    predict(X):
        preds = []
        for model in self.models:
            preds.append(model.predict_proba(X))
        y_pred = np.argmax(preds, axis = 0)

        return y_pred

    show_time():  -> để show ra hàm nào mất thời gian nhất & cần song song -> tùy ý 
        ....
'''