<a href="https://colab.research.google.com/github/LEFT-BEE/small_project/blob/main/%EC%88%98%EC%B9%98%ED%95%B4%EC%84%9Dproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adaboost 
1. 각 weak 모델에서 학습할 데이터 선택
2. 모든 데이터의 가중치 초기화
3. 1회 학습 후 예측 오류(e) 계산, 모델 가중치(a) 계산, 데이터 가중치(D) 갱신
4. 반복 횟수별로 가중치 갱신
5. 모든 모델이 위의 단계를 수행할 때까지 반복

가중치(D) : 모든 train 데이터에 적용(초기값 동일)

오류(e) : 잘못 분류된 데이터의 가중치 합 / 전체 가중치 합 (가중치가 모두 동일하면 오류 데이터 수 / 전체 학습 데이터 수와 같음), 각 모델의 오류

모델별 가중치(a) : $a = \frac{1}{2} \ln \frac{1-e}{e}$ , 오류를 기반으로 계산

예측이 맞을 경우 -> $D_i^{t+1} = \frac{D_i^{t} \times e^{-a}}{\mathrm{Sum}(D)}$

예측이 틀린 경우 -> $D_i^{t+1} = \frac{D_i^{t} \times e^{a}}{\mathrm{Sum}(D)}$

여기서 $\mathrm{Sum}(D)$ 는 갱신된 가중치 전체의 합(정규화 상수)이다.




In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_hastie_10_2
import matplotlib.pyplot as plt

""" HELPER FUNCTION: GET ERROR RATE ========================================="""
def get_error_rate(pred, Y):
    """Return the fraction of predictions that differ from the true labels.

    Args:
        pred: Predicted labels (list, ndarray, pandas Series, ...).
        Y: True labels, in the same order and of the same length as ``pred``.

    Returns:
        The misclassification rate: number of mismatches divided by
        ``len(Y)``.

    Raises:
        ZeroDivisionError: If ``Y`` is empty.
    """
    # Convert both sides to arrays so the comparison is always element-wise
    # and positional. Plain lists would otherwise be compared as whole
    # objects (yielding a single bool), which made the old ``sum(...)`` fail.
    mismatches = np.asarray(pred) != np.asarray(Y)
    return int(np.count_nonzero(mismatches)) / float(len(Y))

""" HELPER FUNCTION: PRINT ERROR RATE ======================================="""
def print_error_rate(err):
    """Print a pair of error rates with four decimal places.

    Args:
        err: Tuple of two numbers; the first is reported as the training
            error and the second as the test error.
    """
    template = 'Error rate: Training: %.4f - Test: %.4f'
    report = template % err
    print(report)

In [None]:
""" HELPER FUNCTION: GENERIC CLASSIFIER ====================================="""
def generic_clf(Y_train, X_train, Y_test, X_test, clf):
    """Fit ``clf`` on the training split and measure its error on both splits.

    Args:
        Y_train: Training labels.
        X_train: Training features.
        Y_test: Test labels.
        X_test: Test features.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.

    Returns:
        Tuple ``(training error rate, test error rate)``.
    """
    clf.fit(X_train, Y_train)
    # Predict on both splits first, then score them in the same order.
    predictions = [clf.predict(features) for features in (X_train, X_test)]
    targets = (Y_train, Y_test)
    return tuple(
        get_error_rate(predicted, actual)
        for predicted, actual in zip(predictions, targets)
    )

In [None]:
""" ADABOOST IMPLEMENTATION ================================================="""
def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf):
    """Run discrete AdaBoost for ``M`` rounds and return train/test error rates.

    Each round refits ``clf`` on the training data with the current sample
    weights, computes the weighted error of that fit, derives the round's
    vote weight ``alpha`` from it, and re-weights the samples so that
    misclassified ones count more in the next round. The final prediction is
    the sign of the alpha-weighted sum of the per-round predictions, so the
    labels are expected to be -1 / +1.

    Args:
        Y_train: Training labels.
        X_train: Training features.
        Y_test: Test labels.
        X_test: Test features.
        M: Number of boosting rounds.
        clf: Estimator exposing ``fit(X, y, sample_weight=...)`` and
            ``predict(X)``. The same object is refitted every round.

    Returns:
        Tuple ``(training error rate, test error rate)`` of the ensemble.
    """
    n_train, n_test = len(X_train), len(X_test)
    # Positional label array, so the per-sample comparison below never
    # depends on a pandas index.
    y_train = np.asarray(Y_train)
    # Initialize weights uniformly
    w = np.ones(n_train) / n_train
    pred_train, pred_test = np.zeros(n_train), np.zeros(n_test)
    # Bound used to keep the weighted error strictly inside (0, 1).
    eps = np.finfo(float).eps

    for _ in range(M):
        # Fit a classifier with the specific weights
        clf.fit(X_train, Y_train, sample_weight=w)
        pred_train_i = np.asarray(clf.predict(X_train))
        pred_test_i = np.asarray(clf.predict(X_test))
        # Indicator of the misclassified training samples
        miss = pred_train_i != y_train
        # Weighted error of this round
        err_m = np.dot(w, miss) / np.sum(w)
        # A perfect (or perfectly wrong) round would give alpha = +/-inf and
        # turn every later weight into 0, inf or NaN; clip to keep it finite.
        err_m = np.clip(err_m, eps, 1 - eps)
        # Alpha: vote weight of this round's classifier
        alpha_m = 0.5 * np.log((1 - err_m) / err_m)
        # New weights: scale up the misses, scale down the hits ...
        w = w * np.exp(np.where(miss, alpha_m, -alpha_m))
        # ... and renormalise so they neither underflow nor overflow.
        w = w / np.sum(w)
        # Add this round's weighted vote to the running prediction
        pred_train = pred_train + alpha_m * pred_train_i
        pred_test = pred_test + alpha_m * pred_test_i

    pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
    print("pred_train : " , pred_train , "pred_test : " , pred_test)
    # Return error rate in train and test set
    return get_error_rate(pred_train, Y_train), \
           get_error_rate(pred_test, Y_test)

In [None]:
""" PLOT FUNCTION ==========================================================="""
def plot_error_rate(er_train, er_test, iterations=None):
    """Plot training and test error rates as two lines on one chart.

    A dashed red horizontal line marks the first test error (``er_test[0]``)
    as a reference level.

    Args:
        er_train: Sequence of training error rates.
        er_test: Sequence of test error rates.
        iterations: Optional x-axis values, one per error-rate entry. When
            omitted, the x-axis shows the position of each entry in the
            input sequences.
    """
    df_error = pd.DataFrame([er_train, er_test]).T
    df_error.columns = ['Training', 'Test']
    if iterations is not None:
        df_error.index = list(iterations)
    plot1 = df_error.plot(linewidth = 3, figsize = (8,6),
            color = ['lightblue', 'darkblue'], grid = True)
    plot1.set_xlabel('Number of iterations', fontsize = 12)
    # No fixed tick labels here: the previous hard-coded 0..400 labels did
    # not correspond to the plotted x values, so the axis was mislabelled.
    plot1.set_ylabel('Error rate', fontsize = 12)
    plot1.set_title('Error rate vs number of iterations', fontsize = 16)
    # Draw on the axes returned by the plot call rather than relying on
    # pyplot's notion of the current axes.
    plot1.axhline(y=er_test[0], linewidth=1, color = 'red', ls = 'dashed')

In [None]:
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':

    # Read data.
    # make_hastie_10_2 generates the binary classification data used in
    # Hastie et al.: ten independent standard Gaussian features and a
    # -1 / +1 target derived from them (12000 random samples by default).
    x, y = make_hastie_10_2()
    df = pd.DataFrame(x).assign(Y=y)

    # Hold out 20% of the rows as the test set; 'Y' is the label column.
    train, test = train_test_split(df, test_size=0.2)
    X_train, Y_train = train.drop(columns='Y'), train['Y']
    X_test, Y_test = test.drop(columns='Y'), test['Y']

    # Baseline: a single depth-1 decision tree (a stump).
    clf_tree = DecisionTreeClassifier(max_depth=1, random_state=1)
    er_tree = generic_clf(Y_train, X_train, Y_test, X_test, clf_tree)

    # Error histories start with the baseline stump, followed by one entry
    # per AdaBoost run with an increasing number of boosting rounds.
    er_train, er_test = [[rate] for rate in er_tree]

    for i in range(10, 30):
        print(i - 9, "번째 epoch")
        er_i = adaboost_clf(Y_train, X_train, Y_test, X_test, i, clf_tree)
        print_error_rate(er_i)
        er_train.append(er_i[0])
        er_test.append(er_i[1])

    # Compare error rate vs number of iterations.
    plot_error_rate(er_train, er_test)

In [None]:
# Generate a fresh data set and display how many samples it contains.
x, y = make_hastie_10_2()
len(x)