In [1]:
# Mount Google Drive inside the Colab VM; the data-loading cell reads its
# .npy files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np

In [3]:
# All example arrays live in the same Drive folder; keep that location in a
# single place so it only has to be changed once.
DATA_DIR = '/content/drive/My Drive/Hard ML/Uplift/Lection_2'

X = np.load(f'{DATA_DIR}/example_X.npy')                # features passed to fit/predict
TT = np.load(f'{DATA_DIR}/example_treatment.npy')       # treatment flags passed to fit
y = np.load(f'{DATA_DIR}/example_y.npy')                # target passed to fit
pred_right = np.load(f'{DATA_DIR}/example_preds.npy')   # reference predictions to compare against

# Fail fast if the files are misaligned: every later cell indexes them row by row.
assert len(X) == len(TT) == len(y) == len(pred_right), \
    "example arrays must contain one entry per object"

In [4]:
class Node:
    """One vertex of the uplift tree.

    A freshly created node is a leaf: both children are ``None`` and the
    split description keeps its placeholder values until a split is assigned.
    """

    def __init__(self):
        # Split rule: objects whose feature value is below the threshold go left.
        self.feature_index, self.threshold = 0, 0
        # Child nodes; they stay None for leaves.
        self.left = self.right = None
        # Uplift estimate stored for the objects that reached this node.
        self.ATE = None

In [9]:
class UpliftTreeRegressor:
    """Decision tree that predicts uplift (treatment effect) for each object.

    Every node stores the difference between the mean target of the treated
    objects (flag 1) and the mean target of the control objects (flag 0) that
    reached it. Splits are chosen greedily to maximise the absolute difference
    between the uplift of the two children, subject to minimum leaf sizes.

    Attributes set by ``fit``:
        n_features_: number of columns of the training matrix.
        n_treatment: number of distinct values in the treatment array.
        tree_: root ``Node`` of the fitted tree.
    """

    # Percentile grid used when a feature has more than _FEW_UNIQUE distinct values.
    _PERCENTILES_MANY = (3, 5, 10, 20, 30, 50, 70, 80, 90, 95, 97)
    # Percentile grid (taken over the distinct values) used otherwise.
    _PERCENTILES_FEW = (10, 50, 90)
    _FEW_UNIQUE = 10

    def __init__(self,
                 max_depth: int = 3,  # maximum depth of the tree.
                 min_samples_leaf: int = 1000,  # minimum number of training objects in a leaf.
                 min_samples_leaf_treated: int = 300,  # minimum number of training objects with T=1 in a leaf.
                 min_samples_leaf_control: int = 300,  # minimum number of training objects with T=0 in a leaf.
                 ):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_leaf_treated = min_samples_leaf_treated
        self.min_samples_leaf_control = min_samples_leaf_control

    def fit(self,
            X: np.ndarray,  # (n, k) array of features.
            treatment: np.ndarray,  # (n,) array with the treatment flag.
            y: np.ndarray  # (n,) array with the target.
            ) -> None:
        """Build the tree from the training data.

        Inputs are converted with ``np.asarray`` so plain Python sequences are
        accepted as well as arrays.
        """
        X = np.asarray(X)
        treatment = np.asarray(treatment)
        y = np.asarray(y)
        self.n_features_ = X.shape[1]
        self.n_treatment = int(np.unique(treatment).size)
        self.tree_ = self._build_tree(X, treatment, y)

    def _build_tree(self, X, T, y, depth=0):
        """Recursively grow the subtree for the given objects and return its root."""
        node = Node()
        node.ATE = self._calc_uplift(T, y)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, T, y)
            if idx is not None:
                goes_left = X[:, idx] < thr
                goes_right = ~goes_left
                node.feature_index = idx
                node.threshold = thr
                node.left = self._build_tree(
                    X[goes_left], T[goes_left], y[goes_left], depth + 1)
                node.right = self._build_tree(
                    X[goes_right], T[goes_right], y[goes_right], depth + 1)
        return node

    def _calc_uplift(self, T, y):
        """Return mean(y | treated) - mean(y | control), or 0.0 if either group is empty.

        ``T`` is expected to hold 0/1 flags; ``1 - T`` is used as the control mask.
        Sums use NumPy reductions, so the result can differ from a sequential
        Python ``sum`` by floating-point rounding only.
        """
        control = 1 - T
        n_treated = np.sum(T)
        n_control = np.sum(control)
        if n_treated == 0 or n_control == 0:
            return 0.0
        return np.sum(y * T) / n_treated - np.sum(y * control) / n_control

    def _calc_thresholds(self, column_values):
        """Return the sorted, de-duplicated candidate thresholds for one feature column."""
        unique_values = np.unique(column_values)
        if unique_values.size == 0:
            # Nothing to split on; also avoids asking for percentiles of an empty array.
            return unique_values
        if unique_values.size > self._FEW_UNIQUE:
            percentiles = np.percentile(column_values, self._PERCENTILES_MANY)
        else:
            percentiles = np.percentile(unique_values, self._PERCENTILES_FEW)
        return np.unique(percentiles)

    def _check_threshold(self, X_leaf, T_leaf):
        """Return 1 if a would-be leaf satisfies all minimum-size constraints, else 0.

        Only ``len(X_leaf)`` is used from ``X_leaf``; ``T_leaf`` holds the 0/1
        treatment flags of the same objects.
        """
        if len(X_leaf) < self.min_samples_leaf:
            return 0
        if np.sum(T_leaf) < self.min_samples_leaf_treated:
            return 0
        if np.sum(1 - T_leaf) < self.min_samples_leaf_control:
            return 0
        return 1

    def _best_split(self, X, T, y):
        """Find the (feature index, threshold) pair with the largest |uplift_left - uplift_right|.

        Returns ``(None, None)`` when no candidate satisfies the leaf-size
        constraints or none yields a strictly positive uplift difference.
        """
        # Starting at 0 with a strict '>' below means splits that do not
        # change the uplift at all are never accepted.
        best_delta = 0
        best_idx, best_thr = None, None

        for idx in range(self.n_features_):
            column = X[:, idx]
            for threshold in self._calc_thresholds(column):
                goes_left = column < threshold
                goes_right = ~goes_left
                T_l, T_r = T[goes_left], T[goes_right]

                # The size check only needs the number of objects, so the
                # treatment slices stand in for the feature rows instead of
                # copying the whole feature matrix for every candidate.
                if not (self._check_threshold(T_l, T_l) and self._check_threshold(T_r, T_r)):
                    continue

                uplift_left = self._calc_uplift(T_l, y[goes_left])
                uplift_right = self._calc_uplift(T_r, y[goes_right])
                delta = abs(uplift_left - uplift_right)

                if delta > best_delta:
                    best_delta = delta
                    best_idx = idx
                    best_thr = threshold
        return best_idx, best_thr

    def predict(self, X):
        """Return a list with the predicted uplift for every row of ``X``."""
        return [self._predict(inputs) for inputs in np.asarray(X)]

    def _predict(self, inputs):
        """Walk one feature vector down the tree and return the reached node's uplift."""
        node = self.tree_
        # A node is a leaf exactly when it has no left child.
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.ATE

In [10]:
# Fit the tree on the example data and measure the largest absolute
# deviation of its predictions from the reference predictions.
model = UpliftTreeRegressor(
    max_depth=3,
    min_samples_leaf=6000,
    min_samples_leaf_treated=2500,
    min_samples_leaf_control=2500,
)
model.fit(X, TT, y)
pred = np.array(model.predict(X)).reshape(len(X))
passed = np.abs(pred - pred_right).max()

In [11]:
# Despite the name this is not a boolean: it is the maximum absolute
# difference between `pred` and `pred_right` (0.0 means an exact match).
passed

0.0

In [15]:
# The treatment flags are loaded as `TT`; no visible cell defines `T`, so the
# original line failed on a fresh kernel.
_n_treatment = int(np.unique(TT).size)
# Number of objects per treatment value, indexed by the value itself (0, 1, ...).
_num_parent = [np.sum(TT == t) for t in range(_n_treatment)]

In [16]:
# Object counts per treatment value: position t holds the number of objects
# whose flag equals t.
_num_parent

[24803, 25197]

In [57]:
# NOTE(review): scratch arithmetic (a plain difference can be negative);
# it is not used by any other cell and can be removed.
5-7

-2

In [58]:
# NOTE(review): scratch check that abs() removes the sign of a difference;
# it is not used by any other cell and can be removed.
abs(5-7)

2

In [22]:
def calc_uplift(T, y):
    """Return mean(y | treated) - mean(y | control) for 0/1 treatment flags.

    Args:
        T: sequence or array of treatment flags (1 = treated, 0 = control).
        y: sequence or array of target values, aligned with ``T``.

    Returns:
        The difference of the two group means, or 0.0 when either group is
        empty (which includes empty input).
    """
    # Accept plain Python sequences too; list arithmetic like `[1] - T`
    # raises TypeError when T is a list.
    T = np.asarray(T)
    y = np.asarray(y)
    control = 1 - T
    n_treated = np.sum(T)
    n_control = np.sum(control)
    if n_treated == 0 or n_control == 0:
        return 0.0
    return np.sum(y * T) / n_treated - np.sum(y * control) / n_control

In [23]:
# Uplift over the whole data set. The treatment flags are loaded as `TT`;
# no visible cell defines `T`.
calc_uplift(TT, y)

0.8133621067075059

In [33]:
def calc_thresholds(column_values):
    """Return sorted, de-duplicated candidate split thresholds for one column.

    With more than 10 distinct values the thresholds are fixed percentiles of
    the raw column; otherwise they are the 10th/50th/90th percentiles of the
    distinct values.
    """
    distinct = np.unique(column_values)
    if len(distinct) > 10:
        sample, grid = column_values, [3, 5, 10, 20, 30, 50, 70, 80, 90, 95, 97]
    else:
        sample, grid = distinct, [10, 50, 90]
    return np.unique(np.percentile(sample, grid))

In [27]:
# Inspect the feature matrix; NumPy abbreviates the repr of large arrays.
X

array([[ 0.40072881,  0.8084128 ,  0.94338544,  0.19877038, -2.89635463],
       [ 1.11588933, -2.1070723 , -0.5123218 , -0.20680012,  0.70560314],
       [-0.15370419, -0.01068205, -0.38601986,  0.27470753, -0.27732682],
       ...,
       [-0.01556159,  2.64910009, -1.34283527,  1.08404747,  0.05824139],
       [-1.14064404,  0.65152198,  0.74822655, -1.59057866,  1.07153253],
       [ 1.60054183,  0.39258118, -0.26121784, -0.808533  , -0.3083781 ]])

In [28]:
# First feature column, i.e. the values feature 0 takes for every object.
X[:,0]

array([ 0.40072881,  1.11588933, -0.15370419, ..., -0.01556159,
       -1.14064404,  1.60054183])

In [35]:
# Candidate split thresholds for feature 0.
tr = calc_thresholds(X[:,0])

In [39]:
# Empty containers for a split experiment.
left_els = []
# np.ndarray([]) treats [] as a *shape* and allocates an uninitialised 0-d
# array; np.array([]) is the actual empty 1-d array.
right_els = np.array([])

In [43]:
# Split the data on feature 0 at every candidate threshold. After the loop
# the *_l / *_r variables hold the split for the last threshold.
for t in tr:
    print(t)
    indices_l = X[:, 0] < t
    # Index with the mask computed just above. The original used a different
    # name (`indices_left`) that this cell never assigns, so the split ignored
    # `t`. The treatment flags are loaded as `TT`; no visible cell defines `T`.
    X_l, T_l, y_l = X[indices_l], TT[indices_l], y[indices_l]
    X_r, T_r, y_r = X[~indices_l], TT[~indices_l], y[~indices_l]

-1.8819435415899943
-1.633900676779844
-1.272555905121884
-0.8343029365043768
-0.521641226833238
-0.0040555152405001
0.5253745251188942
0.8428329389786856
1.281227574747424
1.6398533844184269
1.8698002502893631


In [44]:
# Left-hand subset produced by the last iteration of the threshold loop.
# NOTE(review): the recorded output shows the same first and last rows as X
# itself, which suggests the mask did not filter as intended -- re-check the
# loop that builds X_l.
X_l

array([[ 0.40072881,  0.8084128 ,  0.94338544,  0.19877038, -2.89635463],
       [ 1.11588933, -2.1070723 , -0.5123218 , -0.20680012,  0.70560314],
       [-0.15370419, -0.01068205, -0.38601986,  0.27470753, -0.27732682],
       ...,
       [-0.01556159,  2.64910009, -1.34283527,  1.08404747,  0.05824139],
       [-1.14064404,  0.65152198,  0.74822655, -1.59057866,  1.07153253],
       [ 1.60054183,  0.39258118, -0.26121784, -0.808533  , -0.3083781 ]])

In [46]:
# The two halves of a split must add up to the full data set: print the
# combined size and the total size on separate lines for comparison.
split_total = len(X_l) + len(X_r)
print(split_total, len(X), sep='\n')

50000
50000


In [47]:
# Build an all-ones ("everyone treated") flag array as a separate object.
# The original `T[:] = 1` overwrote the flags in place, used a name that no
# visible cell defines, and made re-running earlier cells give different
# results. NOTE(review): assumes `T` was meant to have the shape/dtype of the
# loaded flags `TT` -- confirm.
T = np.ones_like(TT)

In [50]:
# Number of treated objects (flag == 1). Uses the loaded flags `TT`;
# no visible cell defines `T`.
np.sum(TT)

25197

In [52]:
# Number of control objects (flag == 0). Uses the loaded flags `TT`;
# no visible cell defines `T`.
np.sum(1 - TT)

24803

In [53]:
# Total number of treatment flags. Uses the loaded flags `TT`;
# no visible cell defines `T`.
len(TT)

50000

In [54]:
# Control count plus treated count should equal the number of objects.
# Uses the loaded flags `TT`; no visible cell defines `T`.
np.sum(1 - TT) + np.sum(TT)

50000

In [56]:
# Distinct treatment values present in the data. Uses the loaded flags `TT`;
# no visible cell defines `T`.
np.unique(TT)

array([0, 1])

In [62]:
# Number of objects (rows of the feature matrix).
len(X)

50000

In [63]:
# Number of reference predictions; it has to match len(X) because the
# predictions are compared with pred_right element by element.
len(pred_right)

50000

In [64]:
# Number of distinct values among the reference predictions.
len(np.unique(pred_right))

4

In [65]:
# The distinct reference prediction values themselves, in ascending order.
np.unique(pred_right)

array([-0.70893913,  0.53816902,  1.57733293,  2.2242965 ])