# Tree_Based_Model

## 기본 세팅

In [2]:
from google.colab import drive
# Mount Google Drive under /content/drive/, passing force_remount=True.
drive.mount('/content/drive/', force_remount=True) # original note: makes gdrive editable from this notebook

Mounted at /content/drive/


In [3]:
import sys
assert sys.version_info >= (3, 5), "Python version is low!"

import sklearn
assert sklearn.__version__ >= "0.20"

import numpy as np
import os
import pandas as pd

np.random.seed(42)

# Matplotlib 그래프 출력 관련 설정
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

# 그림 저장 설정 -> 이거 나중에 일할 때 많이 씀
PROJECT_ROOT_DIR = "/content/drive/MyDrive/2022년/파이썬 패키지/rosnet/garage" #colab 기반이면 이거 수정하는 게 좋음
ID = "TreeBasedModel" #나중에 파일 정리 다시 하기 싫으면 이거 적어둬야 함
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", ID)
os.makedirs(IMAGES_PATH, exist_ok = True)

def save_fig(fig_name, tight_layout = True, fig_extension = "png", dpi=300):
  """Save the current pyplot figure under IMAGES_PATH.

  Args:
    fig_name: file name without extension.
    tight_layout: when true, call plt.tight_layout() before saving.
    fig_extension: file extension, also passed to savefig as the format.
    dpi: resolution passed to savefig.
  """
  file_name = ".".join((fig_name, fig_extension))
  target = os.path.join(IMAGES_PATH, file_name)
  print("[이미지 저장] : ", fig_name)
  if tight_layout:
    plt.tight_layout()
  plt.savefig(target, dpi=dpi, format=fig_extension)

def save_fig_fig(fig, fig_name, tight_layout = True, fig_extension = "png", dpi=300):
  """Save an explicit figure object under IMAGES_PATH.

  Args:
    fig: figure object providing tight_layout() and savefig().
    fig_name: file name without extension.
    tight_layout: when true, tighten the layout of `fig` before saving.
    fig_extension: file extension, also passed to savefig as the format.
    dpi: resolution passed to savefig.
  """
  path = os.path.join(IMAGES_PATH, fig_name + "." + fig_extension)
  print("[이미지 저장] : ", fig_name)
  if tight_layout:
    # Tighten the figure being saved; plt.tight_layout() would act on
    # pyplot's *current* figure, which may be a different one.
    fig.tight_layout()
  fig.savefig(path, format=fig_extension, dpi=dpi)

# Saves the data through NumPy. (Original note: the pandas built-in writer
# would probably be more convenient.)
def save_data(fileName, arrayName, header=''):
  """Write arrayName to fileName as comma-separated text.

  header is written as the first line without a comment prefix
  (comments='' disables NumPy's default "# " prefix).
  """
  csv_options = {'delimiter': ',', 'header': header, 'comments': ''}
  np.savetxt(fileName, arrayName, **csv_options)


import sys
# Add the rosnet package directory on Google Drive to the module search path
# so modules stored there can be imported from this notebook.
sys.path.append('/content/drive/MyDrive/2022년/파이썬 패키지/rosnet/rosnet')

In [5]:
# Location of the Titanic CSV inside the project folder.
path = PROJECT_ROOT_DIR + "/data/Titanic_all_end.csv"

# index_col = 0 makes the first CSV column the DataFrame index.
data_origin = pd.read_csv(path, index_col = 0)
# Last expression of the cell: displays the first rows.
data_origin.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Fare_q,Age,SibSp,Parch
0,1,0.0,3,0,1,22.0,0,0
1,3,1.0,3,1,3,26.0,0,0
2,5,0.0,3,0,1,35.0,0,0
4,7,0.0,1,0,9,54.0,0,0
5,12,1.0,1,1,7,58.0,0,0


In [6]:
# Convert the whole frame to a NumPy array and display it as the cell output.
data = data_origin.to_numpy()
data

array([[1.000e+00, 0.000e+00, 3.000e+00, ..., 2.200e+01, 0.000e+00,
        0.000e+00],
       [3.000e+00, 1.000e+00, 3.000e+00, ..., 2.600e+01, 0.000e+00,
        0.000e+00],
       [5.000e+00, 0.000e+00, 3.000e+00, ..., 3.500e+01, 0.000e+00,
        0.000e+00],
       ...,
       [1.234e+03,       nan, 3.000e+00, ..., 3.000e+01, 1.000e+00,
        9.000e+00],
       [1.236e+03,       nan, 3.000e+00, ..., 1.200e+01, 1.000e+00,
        1.000e+00],
       [1.257e+03,       nan, 3.000e+00, ..., 2.900e+01, 1.000e+00,
        9.000e+00]])

In [7]:
# Rows with a known Survived value form the training set.
x_train = data_origin[~data_origin.Survived.isna()]
# pop() removes the column from x_train in place and returns it as the target.
y_train = x_train.pop('Survived')
# PassengerId is dropped from the features; the returned column is discarded.
x_train.pop('PassengerId')

# Rows whose Survived value is missing form the test set.
x_test = data_origin[data_origin.Survived.isna()]
x_test.pop('Survived')
# Last expression of the cell: the popped PassengerId column is what the notebook displays.
x_test.pop('PassengerId')

612     892
613     893
614     894
615     895
616     897
       ... 
262    1301
216    1080
242    1234
244    1236
250    1257
Name: PassengerId, Length: 418, dtype: int64

## Decision Tree

### Threshold

In [24]:
class Threshold:
  """A single split rule applied to one column of an array.

  Attributes:
    unit_: value the column is compared against.
    axis_: axis permutation given to np.transpose; indexing the transposed
      array with col_ yields the column to test.
    col_: index of the tested column in the transposed array.
    is_continue_: True for a "column < unit_" split; False for an equality
      split (or an is-NaN split when unit_ is NaN).
  """

  def __init__(self, unit, axis, col, is_continue):
    self.unit_, self.axis_ = unit, axis
    self.col_, self.is_continue_ = col, is_continue

  def get_cost(self, y, condition):
    """Return the size-weighted Gini impurity of splitting y by condition.

    Returns 2 (larger than any real impurity) when either side of the split
    is empty, so such a split never wins a comparison.
    """
    inside, outside = y[condition], y[~condition]

    # From here on y is treated as a vector: only shape[0] is used.
    n_in = inside.shape[0]
    n_out = outside.shape[0]
    if n_in == 0 or n_out == 0:
      return 2

    gini_in = 1
    gini_out = 1
    for label in np.unique(y):
      share_in = np.sum(inside == label)/n_in
      share_out = np.sum(outside == label)/n_out
      gini_in -= np.power(share_in, 2)
      gini_out -= np.power(share_out, 2)

    n_all = n_in + n_out
    return n_in/n_all * gini_in + n_out/n_all * gini_out

  def _mask(self, column):
    """Boolean mask of the entries of column that satisfy this rule."""
    if self.is_continue_:
      return column < self.unit_
    if np.isnan(self.unit_):
      # NaN never compares equal, so a NaN rule is tested with isnan.
      return np.isnan(column)
    return column == self.unit_

  def divide(self, x, y = None):
    """Split x (and y, when given) into matching and non-matching parts.

    Returns (x_match, x_rest) when y is None, otherwise
    (x_match, x_rest, y_match, y_rest).
    """
    mask = self._mask(np.transpose(x, self.axis_)[self.col_])
    parts = (x[mask], x[~mask])
    if y is not None:
      parts += (y[mask], y[~mask])
    return parts


### TreeNode

In [28]:

class TreeNode:
  """One node of a binary decision tree; fit() learns its single best split.

  Attributes:
    parent_: parent node, or None for the root.
    is_root: True when the node has no parent.
    left_, right_: child nodes, or None when absent.
    step_: quantile step used to generate candidate "<" thresholds.
    continue_limit_: columns with fewer unique values than this are treated
      as discrete (equality splits); others as continuous.
    cost_: lowest split cost found so far (starts at 1).
    threshold_: the Threshold achieving cost_, or None before a split is found.
  """

  def __init__(self, parent = None, left = None, right = None):
    # Always store the links so every attribute exists on every node,
    # whether or not a parent/child was supplied.
    self.parent_ = parent
    self.is_root = parent is None
    self.left_ = left
    self.right_ = right
    self.step_ = 0.1
    self.continue_limit_ = 4
    self.cost_ = 1
    self.threshold_ = None

  def add_left(self, left = None):
    """Attach left as the left child, or a new child node when omitted."""
    self.left_ = left if left is not None else TreeNode(self)

  def add_right(self, right = None):
    """Attach right as the right child, or a new child node when omitted."""
    self.right_ = right if right is not None else TreeNode(self)

  def _consider(self, threshold, y, condition):
    """Keep threshold if its split cost beats the best cost found so far."""
    new_cost = threshold.get_cost(y, condition)
    if self.cost_ > new_cost:
      self.cost_ = new_cost
      self.threshold_ = threshold

  def fit(self, x, y):
    """Search every column of x for the split with the lowest cost.

    The leading len(y.shape) dimensions of x are skipped and every remaining
    dimension is scanned column by column. cost_ is not reset, so a repeated
    call only replaces threshold_ when a strictly better split is found.
    """
    n_skip = len(y.shape)
    n_dims = len(x.shape)
    for d, col_max in enumerate(x.shape):
      if d < n_skip:
        continue

      # Bring dimension d to the front so indexing selects one column.
      axis = [d] + [a for a in range(n_dims) if a != d]
      columns = np.transpose(x, axis)

      for col in range(col_max):
        column = columns[col]
        unique_ = np.unique(column)

        # Few unique values: treat the column as discrete.
        if len(unique_) < self.continue_limit_:
          for uni in unique_:
            if uni != uni:
              # uni is NaN: "== NaN" is never true, so test with isnan,
              # matching how Threshold.divide applies a NaN rule.
              condition = np.isnan(column)
            else:
              condition = column == uni
            self._consider(Threshold(uni, axis, col, False), y, condition)

        # Many unique values: treat the column as continuous.
        else:
          nan_mask = np.isnan(column)
          if nan_mask.any():
            self._consider(Threshold(np.nan, axis, col, False), y, nan_mask)

          for q in np.arange(self.step_, 1, self.step_):
            # nanquantile ignores NaN entries; np.quantile would return NaN
            # for such a column and no "<" split could ever be selected.
            uni = np.nanquantile(column, q)
            self._consider(Threshold(uni, axis, col, True), y, column < uni)

  def predict(self, x, y=None):
    """Split x (and y, when given) with the learned threshold.

    Returns whatever Threshold.divide returns.

    Raises:
      AttributeError: if no threshold has been learned yet.
    """
    if self.threshold_ is None:
      raise AttributeError("TreeNode has no threshold_; call fit() first")
    if y is None:
      return self.threshold_.divide(x)
    return self.threshold_.divide(x, y)

  # This is the part to change later when adapting to causal discovery.
  def get_cost(self, x, y, uni, axis, except_col):
    """Placeholder for a node-level cost; not implemented, returns None."""
    # TODO: first slice the data using axis and uni.

  def __str__(self):
    if self.threshold_ is None:
      return f"[Threshold] : None [cost] : {self.cost_:.3f}"
    rule = self.threshold_
    return f"[Threshold_axis] : {rule.axis_[0]} [T_col] : {rule.col_} [T_unit] : {rule.unit_:.3f} [cost] : {self.cost_:.3f}"

  def get_depth(self):
    """Return the number of ancestors of this node (0 for the root)."""
    depth = 0
    node = self
    while not node.is_root:
      node = node.parent_
      depth += 1
    return depth


### Test

In [26]:
# TreeNode works on NumPy arrays, so convert the pandas objects first.
x_train_n = x_train.to_numpy()
y_train_n = y_train.to_numpy()

# Fit a single root node and print the split it selected.
model = TreeNode()
model.fit(x_train_n, y_train_n)
print(model)

[Threshold_axis] : 1 [T_col] : 1 [T_unit] : 0.000 [cost] : 0.333


In [27]:
# Create a default left child and check its parent link by printing the parent.
model.add_left()
str(model.left_.parent_)

'[Threshold_axis] : 1 [T_col] : 1 [T_unit] : 0.000 [cost] : 0.333'

In [22]:
# NOTE(review): node1 is not defined in any cell visible here; presumably left over from an earlier run.
node1

<__main__.TreeNode at 0x7f1ef1af08d0>

In [15]:
# Split the training data with the fitted root node.
x_left, x_right, y_left, y_right = model.predict(x_train_n, y = y_train_n)
# Sanity check: every row routed to the left branch has column 1 equal to 0.
(x_left[:, 1] == 0).sum() == len(x_left)

True

## Boost

In [None]:
# TODO: add Linear (translated from the original note)
class Stump:
  """Thin wrapper around a classifier, intended as a boosting base learner."""

  def __init__(self, classifier):
    self.classifier_ = classifier

  def fit(self, x, y, w):
    """Fit the wrapped classifier on (x, y).

    w is accepted but not used yet, and nothing is returned.
    """
    self.classifier_.fit(x ,y)




In [11]:
class AdaBoost():
  