## supervised learning
P76113028 鄭皓文

In [115]:
from abc import ABCMeta, abstractmethod
import numpy as np
import pandas as pd
from tqdm import tqdm
from imblearn.under_sampling  import RandomUnderSampler

## Data Processing

In [116]:
csv_path = '/kaggle/input/carinsuranceclaimprediction-classification/train.csv'
metaData = pd.read_csv(csv_path, header=0)

In [117]:
# 刪除第一、十一、十二列的單元格
metaData = metaData.drop(columns=[metaData.columns[0], metaData.columns[10], metaData.columns[11]])

# 定義一個自定義函數，對非數字進行編碼
def encode_non_numeric(column):
    if column.dtype == 'object':
        return column.astype('category').cat.codes
    return column

# 將所有非數字的單元格進行編碼
metaData = metaData.apply(encode_non_numeric)

In [118]:
# 自定义函数来划分数据集为训练集和验证集
def split_train_validation(data, test_ratio=0.2):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# 使用函数划分训练集和验证集
train_set, validation_set = split_train_validation(metaData, test_ratio=0.2)

#train_set = train_set.sample(frac=0.1)
#validation_set = validation_set.sample(frac=0.1)

# 显示结果
#print(f"Training set size: {(train_set)}")
#print(f"Validation set size: {len(validation_set)}")

print(train_set.shape)
print(validation_set.shape)

(46874, 41)
(11718, 41)


In [119]:
# 准备数据

ros = RandomUnderSampler(random_state=42)

X_train = train_set.drop('is_claim', axis=1)
y_train = train_set['is_claim']
X_train, y_train = ros.fit_resample(X_train, y_train)#处理不平衡


X_validation = validation_set.drop('is_claim', axis=1)
y_validation = validation_set['is_claim']
X_validation, y_validation = ros.fit_resample(X_validation, y_validation)#处理不平衡

In [120]:
# 数据标准化（重要）
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train = (X_train - mean) / std
X_validation = (X_validation - mean) / std

# Use RBF kernel to integrate with neural network models

****定义神经网络的基本方法****

In [121]:
class Net(metaclass=ABCMeta):

    # Neural network super class
    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def forward(self, X):
        pass

    @abstractmethod
    def backward(self, dout):
        pass

    @abstractmethod
    def get_params(self):
        pass

    @abstractmethod
    def set_params(self, params):
        pass

**定义线性层**

In [121]:
class Linear_Layer():

    def __init__(self, input_size, output_size, lr=0.0001):
        self.lr = lr
        self.cache = None
        self.w = {"val": np.random.normal(0.0, np.sqrt(2 / input_size), size=(input_size, output_size)),
                  "grad": 0}
        self.b = {"val": np.random.rand(output_size), "grad": 0}

    def forward(self, x, pred=False):
        x = np.array(x)
        out = np.dot(x, self.w["val"]) + self.b["val"]
        if not pred:
            self.cache = x
        return out

    def backward(self, dout):
        x = self.cache
        dx = np.dot(dout, self.w['val'].T).reshape(x.shape)


        self.w['grad'] = np.dot(x.reshape(x.shape[0], -1).T, dout)
        self.b['grad'] = np.sum(dout, axis=0)
        
        return dx

In [121]:
class Softmax():
    
    def __init__(self):
        self.cache = None

    def softmax(self, X):
        X_max = np.max(X, axis=1, keepdims=True)
        exp_X = np.exp(X - X_max)
        Z = exp_X / np.sum(exp_X, axis=1, keepdims=True)
        return Z

    def forward(self, X, pred=False):
        X = np.array(X, dtype=np.float32)
        X = np.clip(X, -30, 30)

        X -= np.max(X, axis=1, keepdims=True)
        means = np.mean(X, axis=1)
        means = means.reshape((means.shape[0], 1))

        Y = np.exp(X - means)
        Y = np.clip(Y, -30, 30)
        Z = Y / np.sum(Y, axis=1).reshape((Y.shape[0], 1))

        if not pred:
            self.cache = (X, Y, Z)

        return Z  # distribution

    def backward(self, dout):
        X, Y, Z = self.cache
        N, D = X.shape

        # Clip gradients of Z directly
        dZ = Z * (1 - Z)
        dZ = np.einsum('ij,ik->ijk', dZ, np.eye(D)) - np.einsum('ij,ik->ijk', Z, Z[:,:,np.newaxis])
        dZ = np.clip(dZ, -30, 30)

        M = np.eye(D)[np.argmax(Z, axis=1)]  # one-hot encoding matrix
        dY = np.eye(D) - M[np.newaxis, :, :]

        dX = np.dot(dout, dZ)
        dX = np.einsum('ij,ijk->ik', dX, dY)

        # Clip gradients of X directly
        dX = np.clip(dX, -30, 30)

        return dX

In [121]:
class ReLU():
    """
    ReLU activation layer
    """
    def __init__(self):
        self.cache = None

    def forward(self, X, pred=False):
        X = np.array(X, dtype=np.float32)
        out = np.maximum(0, X)

        if not pred:
            self.cache = X

        return out

    def backward(self, dout):
        X = self.cache
        dX = np.array(dout, dtype=np.float32)
        dX[X <= 0] = 0
        return dX

In [121]:
def NLLLoss(Y_pred, Y_true):
    """
    Negative log likelihood loss
    """

    loss = 0.0
    N = Y_pred.shape[0]
    M = np.sum(Y_pred * Y_true, axis=1)
    for e in M:
        if e == 0:
            loss += 500
        else:
            loss += -np.log(e)
    return loss / N

In [121]:
class CrossEntropyLoss():
    def __init__(self):
        pass

    def get(self, Y_pred, Y_true):
        Y_true = Y_true.reshape((-1,1))
        N = Y_pred.shape[0]
        softmax = Softmax()
        prob = softmax.forward(Y_pred)
        loss = NLLLoss(prob, Y_true)
        Y_serial = np.argmax(Y_true, axis=1)  # 這邊的進去的 Y 必須爲 （10，1）不能為（10，）
        dout = prob.copy()
        dout[np.arange(N), Y_true.astype(np.int32).reshape(1,-1)] -= 1
        return loss, dout

In [121]:
class SGD():
    def __init__(self, params, lr=0.0001, reg=0.001):
        self.parameters = params
        self.lr = lr
        self.reg = reg

    def step(self):
        for param in self.parameters:
            param['val'] -= (self.lr*param['grad'] + self.reg*param['val'])
            #param['grad'] = 0#将下一轮梯度值重设

In [122]:
def RBF(X, gamma):
    # Free parameter gamma
    if gamma == None:
        gamma = 1.0/X.shape[1]

    # RBF kernel Equation
    K = np.exp(-gamma * np.sum((X - X[:,np.newaxis])**2, axis=-1))

    return K

In [122]:
class Mlp(Net):

    def __init__(self,input_size,output_size = 2):
        self.FC1 = Linear_Layer(input_size, input_size)
        self.ReLU3 = ReLU()
        self.FC2 = Linear_Layer(input_size, int(input_size/2))
        self.Sigm4 = ReLU()
        self.FC3 = Linear_Layer(int(input_size/2), output_size)
        self.Softmax = Softmax()

   

    def forward(self,x,pred = False):

        # K = self.RBF(x,gamma=0.1) 
        x = np.clip(x, 1e-15, 1 - 1e-15)
        h3 = self.FC1.forward(x,pred)
        a3 = self.ReLU3.forward(h3,pred)
        h4 = self.FC2.forward(a3,pred)
        a4 = self.Sigm4.forward(h4,pred)
        h5 = self.FC3.forward(a4,pred)
        a5 = self.Softmax.forward(h5,pred)
        # if np.isnan(a5).any():
        #     # print(iii)
        #     print(a5)
        return a5 

    def backward(self,dout):

        dout = self.FC3.backward(dout)
        dout = self.Sigm4.backward(dout)
        dout = self.FC2.backward(dout)
        dout = self.ReLU3.backward(dout)
        dout = self.FC1.backward(dout)
        
    def get_params(self):
         return [self.FC1.w, self.FC1.b, self.FC2.w, self.FC2.b, self.FC3.w, self.FC3.b]
    def set_params(self,params):
        [self.FC1.w, self.FC1.b, self.FC2.w, self.FC2.b, self.FC3.w, self.FC3.b] = params

## Train neural network

In [123]:
def train_batch(file_inf, batch_size=20):
    for i in range(int(file_inf.shape[0] / batch_size)):
        if len(file_inf) < batch_size:
            break
        batch_inf = file_inf.sample(n=batch_size, replace=False, random_state=123)
        file_inf = file_inf.drop(index=batch_inf.index)

        batch = batch_inf.iloc[:, :-1]
        label = batch_inf.iloc[:, -1]

        yield batch, label, batch_size

def confusion_matrix(preds, labels):
    preds = np.int32(preds)
    labels = np.int32(labels)
    size = 2
    conf_matrix = np.zeros(size*size).reshape((size,size))
    # preds = np.argmax(preds, 1)
    for p, t in zip(preds, labels):
        conf_matrix[p-1, t-1] += 1
    return conf_matrix


In [124]:
# X_train_1 = RBF(X_train.values,gamma=0.1)
train_data = np.hstack((X_train,y_train.values.reshape(-1,1)))

In [125]:
batch_size = 100
epoches = 2

model = Mlp(input_size=100)
optim = SGD(model.get_params(), lr=0.0001, reg=0.001)
criterion = CrossEntropyLoss()

for epoche in tqdm(range(epoches),ncols=50):
    
    train_dataloader = train_batch(pd.DataFrame(train_data),batch_size=batch_size)
    train_acc = []
    vail_acc = []
    loss_lis = []

    for iii, (batch,label,_) in enumerate(train_dataloader):
        X,y = batch.values,label.values

        X = RBF(X,gamma=0.1)
        res_ = model.forward(x=X)
        pre_y = np.argmax(res_,axis = 1)
        loss, dout = criterion.get(res_, y)

        T_acc = sum(np.diag(
                confusion_matrix(pre_y, y))) / batch_size
        train_acc.append(T_acc)
        loss_lis.append(loss)

        model.backward(dout)
        optim.step()
        #optim.zero_grad()  # Reset gradients at the start of each batch
    
    print("-------" + "epoche" + str(epoche+1) + '/' + str(epoches) + "train_acc:" + str(np.mean(train_acc)) + " //loss:" + str(np.mean(loss_lis)))

 50%|███████▌       | 1/2 [00:00<00:00,  1.49it/s]

-------epoche1/2train_acc:0.49779661016949156 //loss:249.83050847415194


100%|███████████████| 2/2 [00:01<00:00,  1.50it/s]

-------epoche2/2train_acc:0.4996610169491526 //loss:249.83050847441467





In [126]:
model_param = model.get_params

In [127]:
for i in range(6):
    model_param()[i]['val'][np.isinf(np.abs(model_param()[i]['val']))] = 1

In [128]:
class Mlp_s(Net):

    def __init__(self,input_size,output_size = 2):
        self.FC1 = Linear_Layer(input_size, input_size)
        self.ReLU3 = ReLU()
        self.FC2 = Linear_Layer(input_size, int(input_size/2))
        self.Sigm4 = ReLU()
        
    def forward(self,x,pred = False):
        x = np.clip(x, 1e-15, 1 - 1e-15)
        h3 = self.FC1.forward(x,pred)
        a3 = self.ReLU3.forward(h3,pred)
        h4 = self.FC2.forward(a3,pred)
        a4 = self.Sigm4.forward(h4,pred)
        return a4 
    
    def backward(self, dout):
        dout = self.Sigm4.backward(dout)
        dout = self.FC2.backward(dout)
        dout = self.ReLU3.backward(dout)
        dout = self.FC1.backward(dout)
            
    def get_params(self):
         return [self.FC1.w, self.FC1.b, self.FC2.w, self.FC2.b]
    
    def set_params(self,params):
        [self.FC1.w, self.FC1.b, self.FC2.w, self.FC2.b] = params

In [129]:
nnet = Mlp_s(input_size=100)
nnet.set_params(params=model_param()[:4])

## Decision Tree 

In [130]:
class Node:
    def __init__(self, value=None, true_branch=None, false_branch=None, is_leaf=False):
        self.value = value
        self.true_branch = true_branch
        self.false_branch = false_branch
        self.is_leaf = is_leaf
    
    def get_description(self):
        if self.is_leaf:
            return f"Leaf Node: Predicts class {self.value}"
        else:
            return f"Decision Node: {self.value[0]} <= {self.value[1]}"

class DecisionTree:
    # 添加一个属性来保存每一次迭代的性能指标
    def __init__(self, y_,max_depth=None):
        self.y_name = y_
        self.max_depth = max_depth
        self.best_gini = float('inf')
        self.best_criteria = None
        self.best_sets = None
        self.tree = None
        self.history = {'depth': [], 'train_accuracy': [], 'train_loss': [], 'test_accuracy': [], 'test_loss': []}
        self.import_feature = set()
        
    def split_data(self, data, feature, value):
        
        true_data = data[data[feature] <= value]
        false_data = data[data[feature] > value]
        return true_data, false_data
    
        return train_data, test_data
    def find_best_split(self, data):
        features = data.columns[:-1]
        self.best_gini = float('inf')  # 重置best_gini
        for feature in features:
            unique_values = data[feature].unique()
#             print(len(unique_values))
            for value in tqdm(unique_values,total = len(unique_values),ncols=50):
                true_data, false_data = self.split_data(data, feature, value)
                gini = (len(true_data) / len(data)) * self.calculate_gini(true_data) + \
                       (len(false_data) / len(data)) * self.calculate_gini(false_data)
#                 print(feature)

                if gini < self.best_gini:
                    self.best_gini = gini
                    self.best_criteria = (feature, value)
                    self.best_sets = (true_data, false_data)
#                     print(self.best_criteria[0])
                    
                    if self.best_criteria[0] not in self.import_feature:
                        self.import_feature.add(self.best_criteria[0])
            
        
        return self.best_criteria, self.best_sets

    def build_tree(self, data, depth=0):
        if depth == self.max_depth :#or self.best_sets is None:
            return Node(value=data[self.y_name].iloc[0], is_leaf=True)
        
        else:
            self.best_criteria, self.best_sets = self.find_best_split(data)
            if self.best_gini == 0:
                return Node(value=self.best_sets[0][self.y_name].mode()[0], is_leaf=True)
            if self.best_sets[0].shape[0] or self.best_sets[1].shape[0] <100:
                return Node(value=self.best_sets[0][self.y_name].mode()[0], is_leaf=True)
            true_branch = self.build_tree(self.best_sets[0], depth + 1)
            false_branch = self.build_tree(self.best_sets[1], depth + 1)
            print(depth)

            return Node(value=self.best_criteria, true_branch=true_branch, false_branch=false_branch)
    
    def prune_tree(self, X_val, y_val, node=None):
        if node is None:
            node = self.tree

        # 如果当前节点是叶节点，则不需要剪枝
        if node.is_leaf:
            return
        
        # 如果当前节点有子节点，先尝试剪枝子节点
        if node.true_branch:
            self.prune_tree(X_val, y_val, node.true_branch)
        if node.false_branch:
            self.prune_tree(X_val, y_val, node.false_branch)
        
        # 检查当前节点如果被剪枝，是否能提高验证集的准确率
        before_prune_accuracy = self.eva(X_val, y_val)
        
        # 将当前节点变为叶节点
        true_backup, false_backup = node.true_branch, node.false_branch
        node.true_branch, node.false_branch = None, None
        node.is_leaf = True
        node.value = self.majority_class(y_val)
        
        after_prune_accuracy = self.eva(X_val, y_val)
        
        # 如果剪枝后准确率下降，撤销剪枝
        if after_prune_accuracy < before_prune_accuracy:
            node.true_branch, node.false_branch = true_backup, false_backup
            node.is_leaf = False
    
    
    def train(self, train_data, test_data=None):
        # 重构训练函数，根据每个深度构建树并评估性能
        for depth in range(1, self.max_depth + 1):
            self.tree = self.build_tree(train_data, depth)
            self.evaluate(train_data, test_data, depth)
            
            # 計算訓練集的準確度和損失
            train_predictions = self.predict(train_data.drop(columns=[self.y_name]))
            train_labels = train_data[self.y_name].values
            train_accuracy = np.sum(train_predictions == train_labels) / len(train_labels)
            train_loss = self.calculate_gini(train_data[train_data[self.y_name] != train_predictions])
            
    def evaluate(self, train_data, test_data, depth):
        # 计算训练集的准确度和损失
        train_predictions = self.predict(train_data.drop(columns=[self.y_name]))
        train_labels = train_data[self.y_name].values
        train_accuracy = np.sum(train_predictions == train_labels) / len(train_labels)
        train_loss = self.calculate_gini(train_data)

        # 保存训练集的性能
        self.history['depth'].append(depth)
        self.history['train_accuracy'].append(train_accuracy)
        self.history['train_loss'].append(train_loss)

        if test_data is not None:
            # 计算测试集的准确度和损失
            test_predictions = self.predict(test_data.drop(columns=[self.y_name]))
            test_labels = test_data[self.y_name].values
            test_accuracy = np.sum(test_predictions == test_labels) / len(test_labels)
            test_loss = self.calculate_gini(test_data)

            # 保存测试集的性能
            self.history['test_accuracy'].append(test_accuracy)
            self.history['test_loss'].append(test_loss)
                
    def eva(self, X_val, y_val):
        predictions = self.predict(X_val)
        prune_accuracy = np.sum(predictions == y_val) / len(y_val)
        return prune_accuracy

    def majority_class(self, y):
        values, counts = np.unique(y, return_counts=True)
        max_count_index = np.argmax(counts)
        return values[max_count_index]

    
    # 添加一个方法来计算Gini不纯度作为损失
    def calculate_gini(self, data):
        if len(data) == 0:
            return 0
        else:
            proportions = data[self.y_name].value_counts(normalize=True)
            gini = 1 - sum(proportions ** 2)
            return gini
    
    def calculate_loss(self, predictions, labels):
        # 计算二分类的损失，这里用简单的错误率来表示损失
        loss = np.sum(predictions != labels) / float(len(labels))
        return loss
    

    # 预测单个样本
    def predict_sample(self, node, sample):
        if node.is_leaf:
            return node.value

        feature, value = node.value
        if sample[feature] <= value:
            return self.predict_sample(node.true_branch, sample)
        else:
            return self.predict_sample(node.false_branch, sample)

    # 预测数据集
    def predict(self, data):
        predictions = np.array([self.predict_sample(self.tree, row) for _, row in data.iterrows()])
        return predictions
    

In [131]:
n_flod = 5
num=1
model_list = []
for flod in (range(n_flod)):
    sample_list = []
    train_dataloader = train_batch(pd.DataFrame(train_data),batch_size=batch_size)
    tree = DecisionTree(y_= 50 ,max_depth=3)

    for iii, (batch,label,_) in enumerate(train_dataloader):
        X,y = batch.values,label.values
        X = RBF(X,gamma=0.1)
        res_ = nnet.forward(x=X)
        temp = np.hstack((res_,y.reshape([-1,1])))
        sample_list.append(temp)
         
    if sample_list:  # Check if sample_list is not empty
        sample = np.vstack(sample_list)
        print("——train decision tree——",num)
        tree.train(train_data=pd.DataFrame(sample),test_data=None)
        model_list.append(tree)
        num+=1

——train decision tree—— 1


100%|████████| 5751/5751 [00:18<00:00, 317.65it/s]
100%|████████| 5892/5892 [00:18<00:00, 317.70it/s]
100%|████████| 5684/5684 [00:18<00:00, 314.41it/s]
100%|████████| 2236/2236 [00:06<00:00, 322.31it/s]
100%|██████████| 533/533 [00:01<00:00, 331.04it/s]
100%|██████████| 322/322 [00:01<00:00, 320.70it/s]
100%|████████| 4041/4041 [00:12<00:00, 312.95it/s]
100%|████████| 5892/5892 [00:17<00:00, 327.83it/s]
100%|████████| 5865/5865 [00:18<00:00, 324.06it/s]
100%|████████| 5893/5893 [00:18<00:00, 325.24it/s]
100%|████████| 5893/5893 [00:18<00:00, 321.62it/s]
100%|████████| 5782/5782 [00:17<00:00, 324.66it/s]
100%|████████| 4066/4066 [00:12<00:00, 317.12it/s]
100%|████████| 5618/5618 [00:17<00:00, 316.42it/s]
100%|████████| 5895/5895 [00:18<00:00, 323.31it/s]
100%|████████| 5852/5852 [00:17<00:00, 325.78it/s]
100%|████████| 5774/5774 [00:18<00:00, 320.16it/s]
100%|████████| 1762/1762 [00:05<00:00, 331.99it/s]
100%|████████| 5336/5336 [00:16<00:00, 321.13it/s]
100%|████████| 1057/1057 [00:03

——train decision tree—— 2


100%|████████| 5751/5751 [00:18<00:00, 313.65it/s]
100%|████████| 5892/5892 [00:18<00:00, 319.02it/s]
100%|████████| 5684/5684 [00:17<00:00, 323.61it/s]
100%|████████| 2236/2236 [00:06<00:00, 327.55it/s]
100%|██████████| 533/533 [00:01<00:00, 333.81it/s]
100%|██████████| 322/322 [00:00<00:00, 341.23it/s]
100%|████████| 4041/4041 [00:12<00:00, 318.10it/s]
100%|████████| 5892/5892 [00:18<00:00, 318.37it/s]
100%|████████| 5865/5865 [00:18<00:00, 323.18it/s]
100%|████████| 5893/5893 [00:18<00:00, 322.20it/s]
100%|████████| 5893/5893 [00:18<00:00, 316.13it/s]
100%|████████| 5782/5782 [00:18<00:00, 320.39it/s]
100%|████████| 4066/4066 [00:12<00:00, 317.61it/s]
100%|████████| 5618/5618 [00:17<00:00, 321.05it/s]
100%|████████| 5895/5895 [00:18<00:00, 314.24it/s]
100%|████████| 5852/5852 [00:18<00:00, 320.03it/s]
100%|████████| 5774/5774 [00:18<00:00, 318.63it/s]
100%|████████| 1762/1762 [00:05<00:00, 323.23it/s]
100%|████████| 5336/5336 [00:16<00:00, 317.12it/s]
100%|████████| 1057/1057 [00:03

——train decision tree—— 3


100%|████████| 5751/5751 [00:17<00:00, 322.45it/s]
100%|████████| 5892/5892 [00:18<00:00, 320.80it/s]
100%|████████| 5684/5684 [00:18<00:00, 308.78it/s]
100%|████████| 2236/2236 [00:06<00:00, 332.32it/s]
100%|██████████| 533/533 [00:01<00:00, 334.94it/s]
100%|██████████| 322/322 [00:00<00:00, 336.93it/s]
100%|████████| 4041/4041 [00:13<00:00, 297.01it/s]
100%|████████| 5892/5892 [00:18<00:00, 313.49it/s]
100%|████████| 5865/5865 [00:18<00:00, 319.23it/s]
100%|████████| 5893/5893 [00:18<00:00, 326.27it/s]
100%|████████| 5893/5893 [00:18<00:00, 322.99it/s]
100%|████████| 5782/5782 [00:18<00:00, 318.82it/s]
100%|████████| 4066/4066 [00:12<00:00, 323.22it/s]
100%|████████| 5618/5618 [00:17<00:00, 326.49it/s]
100%|████████| 5895/5895 [00:18<00:00, 317.27it/s]
100%|████████| 5852/5852 [00:18<00:00, 317.75it/s]
100%|████████| 5774/5774 [00:17<00:00, 323.12it/s]
100%|████████| 1762/1762 [00:05<00:00, 333.89it/s]
100%|████████| 5336/5336 [00:16<00:00, 321.75it/s]
100%|████████| 1057/1057 [00:03

——train decision tree—— 4


100%|████████| 5751/5751 [00:18<00:00, 314.32it/s]
100%|████████| 5892/5892 [00:18<00:00, 320.33it/s]
100%|████████| 5684/5684 [00:18<00:00, 310.69it/s]
100%|████████| 2236/2236 [00:07<00:00, 311.02it/s]
100%|██████████| 533/533 [00:01<00:00, 340.50it/s]
100%|██████████| 322/322 [00:00<00:00, 336.34it/s]
100%|████████| 4041/4041 [00:12<00:00, 319.46it/s]
100%|████████| 5892/5892 [00:18<00:00, 318.50it/s]
100%|████████| 5865/5865 [00:18<00:00, 318.43it/s]
100%|████████| 5893/5893 [00:18<00:00, 319.25it/s]
100%|████████| 5893/5893 [00:18<00:00, 320.80it/s]
100%|████████| 5782/5782 [00:18<00:00, 314.93it/s]
100%|████████| 4066/4066 [00:13<00:00, 312.67it/s]
100%|████████| 5618/5618 [00:17<00:00, 323.02it/s]
100%|████████| 5895/5895 [00:18<00:00, 318.33it/s]
100%|████████| 5852/5852 [00:18<00:00, 320.94it/s]
100%|████████| 5774/5774 [00:18<00:00, 319.29it/s]
100%|████████| 1762/1762 [00:05<00:00, 332.14it/s]
100%|████████| 5336/5336 [00:16<00:00, 321.80it/s]
100%|████████| 1057/1057 [00:03

——train decision tree—— 5


100%|████████| 5751/5751 [00:17<00:00, 325.44it/s]
100%|████████| 5892/5892 [00:17<00:00, 333.23it/s]
100%|████████| 5684/5684 [00:17<00:00, 333.43it/s]
100%|████████| 2236/2236 [00:06<00:00, 345.42it/s]
100%|██████████| 533/533 [00:01<00:00, 350.49it/s]
100%|██████████| 322/322 [00:00<00:00, 362.64it/s]
100%|████████| 4041/4041 [00:12<00:00, 332.09it/s]
100%|████████| 5892/5892 [00:18<00:00, 325.01it/s]
100%|████████| 5865/5865 [00:18<00:00, 321.15it/s]
100%|████████| 5893/5893 [00:18<00:00, 326.06it/s]
100%|████████| 5893/5893 [00:17<00:00, 328.04it/s]
100%|████████| 5782/5782 [00:17<00:00, 327.63it/s]
100%|████████| 4066/4066 [00:12<00:00, 330.28it/s]
100%|████████| 5618/5618 [00:17<00:00, 320.65it/s]
100%|████████| 5895/5895 [00:18<00:00, 324.97it/s]
100%|████████| 5852/5852 [00:18<00:00, 315.32it/s]
100%|████████| 5774/5774 [00:18<00:00, 319.65it/s]
100%|████████| 1762/1762 [00:05<00:00, 295.75it/s]
100%|████████| 5336/5336 [00:16<00:00, 323.39it/s]
100%|████████| 1057/1057 [00:03

In [132]:
valid_data = np.hstack((X_validation.values, y_validation.values.reshape(-1,1))) 
valid_dataloader = train_batch(pd.DataFrame(valid_data),batch_size=batch_size)
T_acc_l = []
for iii, (batch,label,_) in enumerate(valid_dataloader):
        
        pred_list = []
        X,y = batch.values,label.values

        X = RBF(X,gamma=0.1)
        temp = nnet.forward(x=X)

        for mtree in model_list:
                
                res = mtree.predict(pd.DataFrame(X))
                pred_list.append(res)

        vote_res = np.mean(np.stack(pred_list),axis=0)
        vote_res = np.where(vote_res > 0.5, 1, 0)

        T_acc = sum(np.diag(
                    confusion_matrix(vote_res, y))) / batch_size
        T_acc_l.append(T_acc)
print(np.mean(np.asarray(T_acc_l)))


0.49666666666666665


## Bagging and MLP

In [133]:

def bagging(dataset, sample_num, dataset_size):
    for _ in range(sample_num):
        sub = dataset.sample(n=dataset_size)  # 使用 Pandas 的 sample 函數
        yield sub

In [134]:
class Mlp_t(Net):

    def __init__(self,input_size,output_size = 2):
        self.FC1 = Linear_Layer(input_size, 20)
        self.ReLU3 = ReLU()
        self.FC2 = Linear_Layer(20, int(output_size))
        self.Softmax = Softmax()

        
   

    def forward(self,x,pred = False):

        # K = self.RBF(x,gamma=0.1) 
        x = np.clip(x, 1e-15, 1 - 1e-15)
        h3 = self.FC1.forward(x,pred)
        a3 = self.ReLU3.forward(h3,pred)
        h4 = self.FC2.forward(a3,pred)
        a4 = self.Softmax.forward(h4,pred)
        
        return a4 
    def backward(self, dout):

        # dout = self.Sigm4.backward(dout)
        dout = self.FC2.backward(dout)
        dout = self.ReLU3.backward(dout)
        dout = self.FC1.backward(dout)

            
    def get_params(self):
         return [self.FC1.w, self.FC1.b, self.FC2.w, self.FC2.b]
    def set_params(self,params):
        [self.FC1.w, self.FC1.b, self.FC2.w, self.FC2.b] = params

In [135]:
batch_size = 100
n_flod = 10
epoches = 1
MLP_list = []

begg_sample = bagging(dataset = pd.DataFrame(train_data),sample_num = n_flod,dataset_size=3000)

for bsd in tqdm(begg_sample):

    model = Mlp_t(input_size=40)
    optim = SGD(model.get_params(), lr=0.0001, reg=0.001)
    criterion = CrossEntropyLoss()


    for epoche in (range(epoches)):
        
        train_dataloader = train_batch(bsd,batch_size=batch_size)
        

        for iii, (batch,label,_) in enumerate(train_dataloader):
            X,y = batch.values,label.values

            # X = RBF(X,gamma=0.1)
            res_ = model.forward(x=X)
           
            pre_y = np.argmax(res_,axis = 1)
            loss, dout = criterion.get(res_, y)

            T_acc = sum(np.diag(
                    confusion_matrix(pre_y, y))) / batch_size
            # print(loss)
            train_acc.append(T_acc)
            loss_lis.append(loss)


            train_acc.append(T_acc)
            model.backward(dout)
            optim.step()

        print("-------" + "epoche" + str(epoche+1) + '/' + str(epoches) + "train_acc:" + str(np.mean(train_acc)) + " //loss:" + str(np.mean(loss_lis)))
        
    MLP_list.append(model)


1it [00:00,  9.46it/s]

-------epoche1/1train_acc:0.49478991596638666 //loss:250.56179775261475


2it [00:00,  9.49it/s]

-------epoche1/1train_acc:0.5022346368715083 //loss:251.5966386552318


3it [00:00,  9.49it/s]

-------epoche1/1train_acc:0.4919665271966528 //loss:251.14093959695344


4it [00:00,  9.61it/s]

-------epoche1/1train_acc:0.49023411371237463 //loss:251.08938547467383


5it [00:00,  9.67it/s]

-------epoche1/1train_acc:0.4929805013927576 //loss:251.26794258363367


6it [00:00,  9.69it/s]

-------epoche1/1train_acc:0.4951789976133652 //loss:251.40167363995914


7it [00:00,  9.73it/s]

-------epoche1/1train_acc:0.49741127348643005 //loss:251.20817843847226


8it [00:00,  9.80it/s]

-------epoche1/1train_acc:0.5003339517625232 //loss:250.43478260861497
-------epoche1/1train_acc:0.49889816360600997 //loss:249.87841945283955


10it [00:01,  9.72it/s]

-------epoche1/1train_acc:0.49808801213960546 //loss:249.80501392755414





In [136]:
valid_data = np.hstack((X_validation.values, y_validation.values.reshape(-1,1))) 
valid_dataloader = train_batch(pd.DataFrame(valid_data),batch_size=batch_size)
T_acc_l = []
for iii, (batch,label,_) in enumerate(valid_dataloader):
    pred_list = []
    X,y = batch.values,label.values

    for mlp in MLP_list:
        res_ = mlp.forward(x=X)
        pred_list.append(res_)

    vote_res = np.mean(np.stack(pred_list),axis=0)
    vote_res = np.argmax(vote_res, axis=1)

    T_acc = sum(np.diag(
            confusion_matrix(vote_res, y))) / batch_size
    T_acc_l.append(T_acc)
print(np.mean(np.asarray(T_acc_l)))

0.5306666666666666
