In [None]:
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

# Dataset location: list every file found under the Kaggle input folder.
for folder, _subdirs, names in os.walk('/kaggle/input/carinsuranceclaimprediction-classification'):
    for name in names:
        print(os.path.join(folder, name))

# Read the training CSV; its first row holds the column names.
csv_path = '/kaggle/input/carinsuranceclaimprediction-classification/train.csv'
trainData = pd.read_csv(csv_path, header=0)

# Drop the 1st, 11th and 12th columns (0-based positions 0, 10 and 11).
unused_columns = trainData.columns[[0, 10, 11]]
trainData = trainData.drop(columns=unused_columns)

In [None]:
# Helper that turns a non-numeric column into integer category codes.
def encode_non_numeric(column):
    """Encode a non-numeric column as integer category codes.

    Numeric columns (integers, floats, booleans) are returned unchanged.
    Every other dtype - ``object``, pandas ``string``/``str``, ``category``,
    datetimes - is converted to ``category`` and replaced by its codes, so the
    result can be compared with ``<=`` by the decision tree.

    Args:
        column: A pandas Series (one DataFrame column).

    Returns:
        The original Series when it is numeric, otherwise a Series of integer
        codes. Missing values are encoded as -1 by ``cat.codes``.
    """
    # Checking "is numeric" instead of "dtype == object" also catches string
    # and categorical dtypes, which would otherwise slip through unencoded.
    if pd.api.types.is_numeric_dtype(column):
        return column
    return column.astype('category').cat.codes

# Encode every non-numeric column; apply() visits the frame column by column.
trainData = trainData.apply(encode_non_numeric, axis=0)

# Train/test split
def split_dataset(data, test_size=0.2, random_state=42):
    """Shuffle ``data`` and split it into a training and a test partition.

    Args:
        data: DataFrame to split.
        test_size: Fraction of rows (between 0 and 1) placed in the test
            partition; the row count is truncated towards zero.
        random_state: Seed for the shuffle. The default reproduces the
            previously hard-coded seed, so existing calls split identically.

    Returns:
        ``(train_data, test_data)``: two DataFrames that together contain
        every row of ``data`` exactly once.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``. A negative value
            would otherwise be interpreted as a negative slice bound and
            silently produce a wrong split.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be between 0 and 1, got {test_size!r}')

    # Shuffle all rows, then renumber them so positions and labels agree.
    shuffled = data.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    # Number of rows that go to the test partition.
    test_rows = int(test_size * len(shuffled))

    # The first `test_rows` shuffled rows form the test set, the rest train.
    test_data = shuffled.iloc[:test_rows]
    train_data = shuffled.iloc[test_rows:]

    return train_data, test_data

In [None]:
# Tree node
class Node:
    """A single node of the decision tree.

    Attributes:
        value: The predicted label for a leaf, or the ``(feature, threshold)``
            pair tested at an internal node.
        true_branch: Child followed when the node's test passes.
        false_branch: Child followed when the node's test fails.
        is_leaf: True when the node is a leaf and ``value`` is a prediction.
    """

    def __init__(self, value=None, true_branch=None, false_branch=None, is_leaf=False):
        self.value, self.true_branch = value, true_branch
        self.false_branch, self.is_leaf = false_branch, is_leaf

In [None]:
class DecisionTree:
    """Binary classification tree grown greedily on weighted Gini impurity.

    Each internal node stores a ``(feature, threshold)`` pair: rows with
    ``row[feature] <= threshold`` follow ``true_branch``; all other rows
    (including rows where the comparison is false because of NaN) follow
    ``false_branch``. Leaves store the majority label of the training rows
    that reached them.

    Attributes:
        max_depth: Maximum number of split levels. ``train`` needs an integer;
            ``build_tree`` treats ``None`` as "no limit".
        target: Name of the label column in the data frames.
        best_gini, best_criteria, best_sets: Result of the most recent
            ``find_best_split`` call.
        tree: Root ``Node`` of the most recently built tree.
        history: Per-depth metrics recorded by ``evaluate``.
    """

    def __init__(self, max_depth=None, target='is_claim'):
        self.max_depth = max_depth
        self.target = target
        self.best_gini = float('inf')
        self.best_criteria = None
        self.best_sets = None
        self.tree = None
        # Performance metrics collected for every depth that was trained.
        self.history = {'depth': [], 'train_accuracy': [], 'train_loss': [], 'test_accuracy': [], 'test_loss': []}

    def split_data(self, data, feature, value):
        """Partition ``data`` on ``feature <= value``.

        Returns:
            ``(true_data, false_data)``: rows that satisfy the test and rows
            that do not. Uses the same comparison as ``predict_sample``.
        """
        passes = data[feature] <= value
        return data[passes], data[~passes]

    def find_best_split(self, data):
        """Find the ``(feature, threshold)`` with the lowest weighted Gini.

        Every column except the target is a candidate feature and every
        distinct value of a feature (except its largest) is a candidate
        threshold, so both sides of a split are always non-empty. Each feature
        is scanned once after sorting, using cumulative class counts, instead
        of re-filtering the frame for every threshold.

        The result is also stored in ``best_gini``, ``best_criteria`` and
        ``best_sets``.

        Returns:
            ``(best_criteria, best_sets)``, or ``(None, None)`` when no
            feature has two distinct values to split on.
        """
        self.best_gini = float('inf')  # reset the state left by earlier calls
        self.best_criteria = None
        self.best_sets = None

        total = len(data)
        if total < 2:
            return self.best_criteria, self.best_sets

        # One-hot class membership per row, so cumulative sums give class
        # counts. NOTE(review): assumes the target column has no missing
        # values - confirm for other datasets.
        codes, classes = pd.factorize(data[self.target])
        one_hot = np.eye(len(classes))[codes]

        for feature in data.columns:
            if feature == self.target:
                continue

            column = data[feature].to_numpy()
            order = np.argsort(column, kind='stable')
            ordered = column[order]

            # A cut after position i sends rows 0..i (values <= ordered[i]) to
            # the true side. Only the last row of each run of equal values is
            # a valid cut, and NaN can never be a threshold.
            cuts = np.flatnonzero((ordered[1:] != ordered[:-1]) & ~pd.isna(ordered[:-1]))
            if cuts.size == 0:
                continue

            running = np.cumsum(one_hot[order], axis=0)
            true_counts = running[cuts]
            false_counts = running[-1] - true_counts
            true_size = cuts + 1
            false_size = total - true_size

            true_gini = 1 - ((true_counts / true_size[:, None]) ** 2).sum(axis=1)
            false_gini = 1 - ((false_counts / false_size[:, None]) ** 2).sum(axis=1)
            weighted = (true_size * true_gini + false_size * false_gini) / total

            winner = int(np.argmin(weighted))
            if weighted[winner] < self.best_gini:
                self.best_gini = float(weighted[winner])
                self.best_criteria = (feature, ordered[cuts[winner]])

        if self.best_criteria is not None:
            self.best_sets = self.split_data(data, *self.best_criteria)

        return self.best_criteria, self.best_sets

    def build_tree(self, data, depth=0, max_depth=None):
        """Recursively grow a tree from ``data``.

        Args:
            data: Training rows that reached this node (features + target).
            depth: Depth of the node being built; the root is 0.
            max_depth: Depth at which nodes become leaves. Defaults to
                ``self.max_depth``; ``None`` there means unlimited.

        Returns:
            The ``Node`` at the root of the (sub)tree.

        Raises:
            ValueError: If ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError('cannot build a tree from an empty data set')

        limit = self.max_depth if max_depth is None else max_depth
        labels = data[self.target]
        majority = labels.mode()[0]

        # Stop at the depth limit or when the node is already pure.
        if (limit is not None and depth >= limit) or labels.nunique() <= 1:
            return Node(value=majority, is_leaf=True)

        best_criteria, best_sets = self.find_best_split(data)

        # No feature separates the rows any further.
        if best_criteria is None:
            return Node(value=majority, is_leaf=True)

        # Hold the split in locals: the recursive calls overwrite self.best_*.
        true_branch = self.build_tree(best_sets[0], depth + 1, limit)
        false_branch = self.build_tree(best_sets[1], depth + 1, limit)

        return Node(value=best_criteria, true_branch=true_branch, false_branch=false_branch)

    def train(self, train_data, test_data=None):
        """Grow one tree per depth limit 1..``max_depth`` and record metrics.

        ``history`` is cleared first, so it always describes the latest call.
        After the call ``self.tree`` is the tree grown with the largest depth
        limit.

        Args:
            train_data: Training DataFrame including the target column.
            test_data: Optional held-out DataFrame with the same columns.

        Raises:
            TypeError: If ``max_depth`` is ``None``.
        """
        if self.max_depth is None:
            raise TypeError('train() needs an integer max_depth to iterate over depths')

        self.history = {key: [] for key in self.history}

        for depth in range(1, self.max_depth + 1):
            # Grow from the root (depth 0) down to the current depth limit.
            self.tree = self.build_tree(train_data, depth=0, max_depth=depth)
            self.evaluate(train_data, test_data, depth)

    def evaluate(self, train_data, test_data, depth):
        """Append accuracy and loss of the current tree to ``history``.

        Loss is the Gini impurity of the true labels inside each leaf,
        weighted by the share of rows that land in that leaf, so it reflects
        how well the tree separates the classes.
        """
        train_accuracy, train_loss = self._score(train_data)

        # Record the training metrics.
        self.history['depth'].append(depth)
        self.history['train_accuracy'].append(train_accuracy)
        self.history['train_loss'].append(train_loss)

        if test_data is not None:
            test_accuracy, test_loss = self._score(test_data)

            # Record the test metrics.
            self.history['test_accuracy'].append(test_accuracy)
            self.history['test_loss'].append(test_loss)

    def _score(self, data):
        """Return ``(accuracy, leaf-weighted Gini loss)`` of the tree on ``data``."""
        total = len(data)
        if total == 0:
            return float('nan'), 0

        leaf_ids, leaf_values = self._assign_leaves(data.drop(columns=[self.target]))
        labels = data[self.target].to_numpy()
        accuracy = np.sum(leaf_values[leaf_ids] == labels) / total

        loss = 0.0
        for leaf in np.unique(leaf_ids):
            members = data[leaf_ids == leaf]
            loss += len(members) / total * self.calculate_gini(members)

        return accuracy, loss

    def _assign_leaves(self, data):
        """Route every row of ``data`` down ``self.tree`` in bulk.

        Returns:
            ``(leaf_ids, leaf_values)``: ``leaf_ids[i]`` is the index of the
            leaf reached by row ``i`` and ``leaf_values[k]`` is the label
            predicted by leaf ``k``.

        Raises:
            RuntimeError: If no tree has been built yet.
        """
        if self.tree is None:
            raise RuntimeError('the tree has not been built yet; call train() or build_tree() first')

        leaf_ids = np.zeros(len(data), dtype=int)
        leaf_values = []
        pending = [(self.tree, np.arange(len(data)))]

        while pending:
            node, rows = pending.pop()
            if node.is_leaf:
                leaf_ids[rows] = len(leaf_values)
                leaf_values.append(node.value)
                continue

            feature, value = node.value
            passes = np.asarray(data[feature].to_numpy()[rows] <= value, dtype=bool)
            pending.append((node.true_branch, rows[passes]))
            pending.append((node.false_branch, rows[~passes]))

        return leaf_ids, np.array(leaf_values)

    # Gini impurity, used as the loss.
    def calculate_gini(self, data):
        """Return the Gini impurity of the target column of ``data`` (0 if empty)."""
        if len(data) == 0:
            return 0
        proportions = data[self.target].value_counts(normalize=True)
        return 1 - (proportions ** 2).sum()

    # Predict a single sample.
    def predict_sample(self, node, sample):
        """Walk ``sample`` (a mapping of feature -> value) down from ``node``.

        Returns:
            The label stored in the leaf that the sample reaches.
        """
        if node.is_leaf:
            return node.value

        feature, value = node.value
        if sample[feature] <= value:
            return self.predict_sample(node.true_branch, sample)
        else:
            return self.predict_sample(node.false_branch, sample)

    # Predict a whole data set.
    def predict(self, data):
        """Return an array with the predicted label for every row of ``data``.

        Gives the same labels as calling ``predict_sample`` row by row, but
        routes all rows through the tree at once.
        """
        leaf_ids, leaf_values = self._assign_leaves(data)
        return leaf_values[leaf_ids]

In [None]:
# Split the data; note that trainData is rebound to the training partition.
trainData, testData = split_dataset(trainData)

print("Training set size:", trainData.shape)
print("Validation set size:", testData.shape)

# Train with the decision tree class above; the intent is trees of depth 1 to 5.
tree = DecisionTree(max_depth=5)
tree.train(trainData, testData)

# Training partition: true labels, predictions, and the share that match.
train_labels = trainData['is_claim'].values
train_predictions = tree.predict(trainData.drop(columns=['is_claim']))
train_accuracy = np.mean(train_predictions == train_labels)

# Test partition: true labels, predictions, and the share that match.
test_labels = testData['is_claim'].values
test_predictions = tree.predict(testData.drop(columns=['is_claim']))
test_accuracy = np.mean(test_predictions == test_labels)


In [None]:
# Plot the recorded metrics: accuracy on the left panel, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(10, 4))
depths = tree.history['depth']

accuracy_ax.plot(depths, tree.history['train_accuracy'], label='Train Accuracy')
accuracy_ax.plot(depths, tree.history['test_accuracy'], label='Test Accuracy')
accuracy_ax.set_xlabel('Depth of tree')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

loss_ax.plot(depths, tree.history['train_loss'], label='Train Loss')
loss_ax.plot(depths, tree.history['test_loss'], label='Test Loss')
loss_ax.set_xlabel('Depth of tree')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

fig.tight_layout()
plt.show()

# Raw loss values per depth, followed by the accuracies of the final tree.
print(tree.history['train_loss'])
print(tree.history['test_loss'])
print(f'Average Testing Accuracy: {test_accuracy:.4f}')
print(f'Average Training Accuracy: {train_accuracy:.4f}')