In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

class RandomForest:
    """Bagging ensemble of decision trees combined by majority vote.

    Every tree is a ``DecisionTreeClassifier(max_features="log2",
    ccp_alpha=0.0001)`` trained on its own bootstrap resample of the
    training data.

    Args:
        n_estimators: Number of trees to train; must be at least 1.
        verbose: When true, ``predict`` and ``predict_proba`` print the
            per-tree matrix and the aggregated result before returning it.
            Defaults to True so callers that relied on the printed output
            keep seeing it.

    Attributes:
        base: List of fitted trees, filled by ``fit``.
    """

    def __init__(self, n_estimators, verbose=True):
        if n_estimators < 1:
            raise ValueError("n_estimators must be at least 1")
        self.base = []
        self.n_estimators = n_estimators
        self.verbose = verbose

    def _require_fitted(self):
        """Raise ``RuntimeError`` if ``fit`` has not produced any tree yet."""
        if not self.base:
            raise RuntimeError(
                "RandomForest is not fitted yet; call fit() first"
            )

    def bootstrap_sample(self, X, y):
        """Draw ``n_estimators`` bootstrap resamples of ``(X, y)``.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: Labels, one per row of ``X``.

        Returns:
            A list with ``n_estimators`` entries ``[X_boot, y_boot]``, where
            ``X_boot`` has ``n_samples`` rows drawn with replacement and
            ``y_boot`` is the matching ``(n_samples, 1)`` label column.
        """
        n_samples = X.shape[0]
        # Attach the labels as a last column so that rows and labels are
        # always resampled together.
        labels = np.asarray(y).reshape(n_samples, 1)
        X_y = np.hstack((X, labels))
        # Not required for sampling with replacement, but it advances the
        # global random state; kept so seeded runs draw the same resamples.
        np.random.shuffle(X_y)

        dataset = []
        for _ in range(self.n_estimators):
            # Sample row indices with replacement.
            idx = np.random.choice(n_samples, n_samples, replace=True)
            resampled = X_y[idx, :]
            dataset.append([resampled[:, :-1], resampled[:, -1:]])
        return dataset

    def fit(self, X, y):
        """Train one tree per bootstrap resample of ``(X, y)``.

        Trees from a previous ``fit`` call are discarded first, so the
        ensemble always holds exactly ``n_estimators`` trees afterwards.

        Returns:
            The fitted estimator itself.
        """
        self.base = []
        for X_sub, y_sub in self.bootstrap_sample(X, y):
            clf = DecisionTreeClassifier(max_features="log2", ccp_alpha=0.0001)
            clf.fit(X_sub, y_sub)
            self.base.append(clf)
        return self

    def predict(self, X):
        """Predict a label per row of ``X`` by majority vote over the trees.

        Votes are stored as integers, so class labels are expected to be
        integer-valued (for example -1/1). On a tie the smallest label wins,
        because ``np.unique`` returns sorted values and ``argmax`` picks the
        first maximum.

        Returns:
            Integer array of shape ``(n_rows,)`` with the winning labels.

        Raises:
            RuntimeError: If the forest has not been fitted.
        """
        self._require_fitted()
        n_rows = X.shape[0]
        votes = np.zeros((n_rows, len(self.base)), dtype=int)
        for j, tree in enumerate(self.base):
            votes[:, j] = tree.predict(X)

        y_pred = np.zeros(n_rows, dtype=int)
        for i in range(n_rows):
            values, counts = np.unique(votes[i, :], return_counts=True)
            y_pred[i] = values[counts.argmax()]

        if self.verbose:
            print(votes)
            print(y_pred)
        return y_pred

    def predict_proba(self, X):
        """Average the trees' probability estimates for the second class.

        Only column 1 of each tree's ``predict_proba`` output is used, so
        this is meant for binary problems in which every tree has seen both
        classes.

        Returns:
            Float array of shape ``(n_rows,)`` with the mean probability.

        Raises:
            RuntimeError: If the forest has not been fitted.
        """
        self._require_fitted()
        n_rows = X.shape[0]
        votes = np.zeros((n_rows, len(self.base)))
        for j, tree in enumerate(self.base):
            votes[:, j] = tree.predict_proba(X)[:, 1]

        prob = votes.sum(axis=1) / len(self.base)

        if self.verbose:
            print(votes)
            print(prob)
        return prob

    def score(self, X, y, sample_weight=None):
        """Return the accuracy of ``predict(X)`` against the labels ``y``."""
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
     

def crossValidation(X, y, n_splits=5):
    """Report the mean validation AUC for ensembles of 1 to 9 estimators.

    For each candidate ``n_estimators`` the data is split with a stratified
    K-fold, a fresh model is trained on each training part, and the ROC AUC
    of its ``predict_proba`` output on the held-out part is averaged over
    the folds. One line is printed per candidate.

    Args:
        X: Feature array, indexable by arrays of row indices.
        y: Label array aligned with ``X``.
        n_splits: Number of stratified folds; also the number of scores
            that are averaged.

    Returns:
        A list of ``(n_estimators, mean_auc)`` tuples, in the order tried.
    """
    # NOTE(review): AdaBoost is not defined in the visible part of this file;
    # presumably it is provided elsewhere -- confirm it is the intended model.
    kf = StratifiedKFold(n_splits=n_splits)

    results = []
    for t in range(1, 10):  # candidate n_estimators values
        fold_scores = []
        for train_index, val_index in kf.split(X, y):
            clf = AdaBoost(n_estimators=t)
            clf.fit(X[train_index], y[train_index])
            prob = clf.predict_proba(X[val_index])
            # Area under the ROC curve of the scores for this fold.
            fold_scores.append(roc_auc_score(y[val_index], prob))
        # Average over the folds actually produced instead of a fixed 5.
        AUC = sum(fold_scores) / len(fold_scores)
        print("NUM = ",t," Valiadtion AUC = ", AUC)
        results.append((t, AUC))
    return results
        
if __name__ == "__main__":
    # Read the train/test feature matrices and label vectors, in this order,
    # from the whitespace-separated text files under adult_dataset/.
    X_train, X_test, y_train, y_test = (
        np.genfromtxt(path)
        for path in (
            "adult_dataset/adult_train_feature.txt",
            "adult_dataset/adult_test_feature.txt",
            "adult_dataset/adult_train_label.txt",
            "adult_dataset/adult_test_label.txt",
        )
    )

    # Rescale the labels linearly so that 0 becomes -1 and 1 becomes +1
    # (presumably the files store 0/1 labels -- confirm against the data).
    y_train = 2 * (y_train - 0.5)
    y_test = 2 * (y_test - 0.5)

    # Train a five-tree forest on the training split.
    clf = RandomForest(n_estimators=5)
    clf.fit(X_train, y_train)

    # Both calls are made for their printed output on the test split;
    # their return values are not used here.
    clf.predict(X_test)
    clf.predict_proba(X_test)

[[-1 -1 -1 -1 -1]
 [-1 -1 -1  1 -1]
 [-1 -1 -1 -1 -1]
 ...
 [ 1  1  1  1  1]
 [-1 -1 -1 -1  1]
 [ 1  1  1  1  1]]
[-1 -1 -1 ...  1 -1  1]
[[0.00758676 0.00931677 0.03252033 0.00227716 0.01078619]
 [0.31388013 0.19277108 0.20408163 0.57425743 0.31810491]
 [0.06501548 0.33870968 0.28205128 0.10497238 0.26195029]
 ...
 [0.83018868 0.82894737 0.76       0.96       0.83443709]
 [0.         0.         0.13333333 0.01342282 0.88888889]
 [0.82191781 0.79564033 0.7027027  0.8440367  0.69130435]]
[0.01249744 0.32061904 0.21053982 ... 0.84271463 0.20712901 0.77112038]
