In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:

import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def dsigmoid(y):
    return y * (1 - y)
def tanh(x):
    return np.tanh(x)
def dtanh(y):
    return 1.0 - y ** 2
class MLPClassifier:
    def __init__(self, layers, activation='tanh', epochs=20, learning_rate=0.01):
        # 第一层layers[0]为输入层, 层数应等于样本特征数
        # 最后一层为输出层
        # 再算上中间层, 所以len(layers)最小为3
        self.epochs = epochs
        self.eta = learning_rate
        self.layers = [np.zeros(layers[0])]
        self.weights = []
        self.biases = []
        for i in range(len(layers) - 1):
            # 随机初始化
            weight = np.random.random((layers[i + 1], layers[i]))
            layer = np.ones(layers[i + 1])
            bias = np.random.random(layers[i + 1])
            self.weights.append(weight)
            self.layers.append(layer)
            self.biases.append(bias)
        if activation == 'tanh':
            self.activation = tanh
            self.dactivation = dtanh
        elif activation == 'sigmoid':
            self.activation = sigmoid
            self.dactivation = dsigmoid

    def fit(self, X, y):
        for _ in range(self.epochs):
            # 随机梯度下降
            indexes = np.random.permutation(X.shape[0])
            for i in range(X.shape[0]):
                self.forward(X[indexes[i]])
                self.backward(y[indexes[i]])
        return self

    def predict(self, X):
	# binary classification
        return np.where(self.predict_prob(X) >= 0.5, 1, 0)

    def predict_prob(self, X):
        y = np.empty((X.shape[0], len(self.layers[-1])))
        for i in range(X.shape[0]):
            self.forward(X[i])
            y[i, :] = self.layers[-1]
        return y

    def forward(self, inputs):
        self.layers[0][:] = inputs
        for i in range(len(self.weights)):
            self.layers[i + 1] = self.activation(self.weights[i].dot(self.layers[i]) + self.biases[i])

    def backward(self, y):
        # y 是真实的标签值
        y_predict = self.layers[-1] # y_predict即最后一层softmax的输出值
        gradient_neurons = y - y_predict  # softmax 的导数
        # 从最后一层到第一层进行遍历, 第0层是输入层, 不在遍历范围内
        for i in range(len(self.layers) - 1, 0, -1):
            gradient_bias = gradient_neurons
            # 最后一层无激活函数
            if i < len(self.layers):
                 gradient_bias *= self.dactivation(self.layers[i])
            gradient_weight = gradient_bias.reshape(-1, 1).dot(self.layers[i - 1].reshape(1, -1))  # weight的梯度是个矩阵
            gradient_neurons = gradient_bias.dot(self.weights[i - 1])  # 隐层值的梯度
            self.weights[i - 1] += self.eta * gradient_weight
            self.biases[i - 1] += self.eta * gradient_bias
            # self.layers中的内容无需更新

class DeepRandomForest:
    def __init__(self, n_estimators=10, max_features=None, max_samples=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_samples = max_samples
        self.models = []

    def fit(self, X, y):
        n_features = X.shape[1]
        n_samples = X.shape[0]

        for _ in range(self.n_estimators):
            # Randomly select features and samples
            selected_features = np.random.choice(range(n_features), self.max_features, replace=False)
            selected_samples = np.random.choice(range(n_samples), self.max_samples, replace=False)

            # Create Simple MLP classifier
            clf = MLPClassifier(layers = [len(selected_features),200,100,1])
            
            # Fit on selected features and samples
            clf.fit(X[selected_samples][:, selected_features], y[selected_samples])
            self.models.append((clf, selected_features))

        return self

    def predict(self, X):
        predictions = np.zeros((X.shape[0], len(self.models)))
        
        for i, (clf, selected_features) in enumerate(self.models):
            predictions[:, i] = clf.predict(X[:, selected_features]).flatten()
        
        return predictions

    def ensemble_predictions(self, X, y_val=None):
        ensemble_preds = self.predict(X)
        
        # Simple averaging of predictions
        final_predictions = np.mean(ensemble_preds, axis=1)
        
        # Weighted averaging based on performance on a validation set
        if y_val is not None:
            val_accuracy = []
            for preds in ensemble_preds.T:
                acc = accuracy_score(y_val, preds)
                val_accuracy.append(acc)
            
            weights = np.array(val_accuracy) / sum(val_accuracy)
            final_predictions = np.dot(ensemble_preds, weights)
        
        return np.round(final_predictions)


In [4]:
import random
import data_processing as df

def split_data(X, y, train_ratio=0.8, random_seed=42):
    random.seed(random_seed)

    data_size = len(X)
    index = list(range(data_size))
    random.shuffle(index)

    train_size = int(data_size*train_ratio)
    test_size = data_size - train_size

    X_train = X.iloc[index[:train_size]]
    y_train = y.iloc[index[:train_size]]
    X_test = X.iloc[index[train_size:]]
    y_test = y.iloc[index[train_size:]]

    return X_train, X_test, y_train, y_test

X = df.df.drop("is_claim", axis=1)
y = df.df.loc[:, "is_claim"]

X_train, X_test, y_train, y_test = split_data(X, y)
X_tra, X_val, y_tra, y_val = split_data(X_train, y_train)

print(X_tra.shape)
print(X_val.shape)

print(y_tra.shape)
print(y_val.shape)

is_claim
0    54844
1     3748
Name: count, dtype: int64
(109688, 87)
is_claim
0    54844
1    54844
Name: count, dtype: int64
(70200, 86)
(17550, 86)
(70200,)
(17550,)


In [7]:
# Creating Deep Random Forest
deep_rf = DeepRandomForest(n_estimators=10, max_features=86, max_samples=20000)
deep_rf.fit(X_tra.values, y_tra.values)

# Evaluating the ensemble on test data
predictions = deep_rf.ensemble_predictions(X_val.values, y_val=y_val.values)
accuracy = accuracy_score(y_val.values, predictions)
print(f"Accuracy of validation set: {accuracy}")

Accuracy of validation set: 0.4985754985754986


In [8]:
# Creating Deep Random Forest
deep_rf = DeepRandomForest(n_estimators=10, max_features=86, max_samples=20000)
deep_rf.fit(X_train.values, y_train.values)

# Evaluating the ensemble on test data
predictions = deep_rf.ensemble_predictions(X_test.values, y_val=y_test.values)
accuracy = accuracy_score(y_test.values, predictions)
print(f"Accuracy of test set: {accuracy}")

Accuracy of test set: 0.5002734980399307
