In [55]:
import pandas as pd
import math
from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [56]:
data = pd.read_csv('titanic.csv')
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# 处理缺失值
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
# data['Fare'].fillna(data['Fare'].mean(), inplace=True)

# 转换分类特征
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

x = data[features].values.tolist()
y = data[target].values.tolist()

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [57]:
# 计算熵
def entropy(data):
    label_counts = Counter(data)
    total_count = len(data)
    return -sum(
        (count / total_count) * math.log2(count / total_count)
        for count in label_counts.values()
    )


# 按特征划分数据集
def split_dataset(data, labels, axis, value):
    ret_dataset = []
    ret_labels = []
    for row, label in zip(data, labels):
        if row[axis] == value:
            reduced_row = row[:axis] + row[axis + 1 :]
            ret_dataset.append(reduced_row)
            ret_labels.append(label)
    return ret_dataset, ret_labels


# 选择最佳划分特征
def choose_best_feature_to_split(data, labels):
    num_features = len(data[0])
    base_entropy = entropy(labels)
    best_info_gain = 0.0
    best_feature = -1
    for i in range(num_features):
        unique_vals = set(row[i] for row in data)
        new_entropy = 0.0
        for value in unique_vals:
            subset, subset_labels = split_dataset(data, labels, i, value)
            prob = len(subset) / float(len(data))
            new_entropy += prob * entropy(subset_labels)
        info_gain = base_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature, best_info_gain


# 多数表决
def majority_cnt(class_list):
    return Counter(class_list).most_common(1)[0][0]


# 递归创建决策树，包含预剪枝
def create_tree(
    data, labels, feature_names, max_depth=None, min_info_gain=0.01, depth=0
):
    class_list = labels
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    if len(data[0]) == 0:
        return majority_cnt(class_list)
    if max_depth is not None and depth >= max_depth:
        return majority_cnt(class_list)
    best_feature, best_info_gain = choose_best_feature_to_split(data, labels)
    if best_info_gain < min_info_gain:
        return majority_cnt(class_list)
    best_feature_name = feature_names[best_feature]
    tree = {best_feature_name: {}}
    unique_vals = set(row[best_feature] for row in data)
    for value in unique_vals:
        sub_feature_names = (
            feature_names[:best_feature] + feature_names[best_feature + 1 :]
        )
        subset, subset_labels = split_dataset(data, labels, best_feature, value)
        tree[best_feature_name][value] = create_tree(
            subset,
            subset_labels,
            sub_feature_names,
            max_depth,
            min_info_gain,
            depth + 1,
        )
    return tree


# 预测函数
def predict(tree, feature_names, sample):
    if not isinstance(tree, dict):
        return tree
    root = next(iter(tree))
    child = tree[root]
    feature_index = feature_names.index(root)
    feature_value = sample[feature_index]
    
    if feature_value in child:
        return predict(child[feature_value], feature_names, sample)
    else:
        subtree_values = [v for v in child.values() if not isinstance(v, dict)]
        
        if subtree_values:
            return majority_cnt(subtree_values)
        else:
            return majority_cnt(child)


# 后剪枝函数
def post_prune(tree, validation_data, validation_labels, feature_names):
    if not isinstance(tree, dict):
        return tree
    root = next(iter(tree))
    subtrees = tree[root]
    feature_index = feature_names.index(root)
    for key in subtrees:
        if isinstance(subtrees[key], dict):
            subtrees[key] = post_prune(
                subtrees[key], validation_data, validation_labels, feature_names
            )

    # 计算未剪枝的误差
    unpruned_error = sum(
        predict(tree, feature_names, sample) != label
        for sample, label in zip(validation_data, validation_labels)
    )

    # 计算剪枝后的误差
    majority_class = majority_cnt(validation_labels)
    pruned_error = sum(majority_class != label for label in validation_labels)

    # 如果剪枝后的误差更小，则进行剪枝
    if pruned_error <= unpruned_error:
        return majority_class
    else:
        return tree

In [58]:
# 创建决策树并预剪枝
feature_names = features[:]
trn_data = x[: int(0.8 * len(x))]
trn_labels = y[: int(0.8 * len(y))]
val_data = x[int(0.8 * len(x)) :]
val_labels = y[int(0.8 * len(y)) :]

decision_tree = create_tree(
    trn_data,
    trn_labels,
    feature_names,
    max_depth=5,
    min_info_gain=0.01,
)

# 后剪枝
decision_tree = post_prune(
    decision_tree, val_data, val_labels, feature_names
)

print(decision_tree)

{'Fare': {0.0: 0, 512.3292: 1, 4.0125: 0, 6.975: 0, 7.925: 0, 8.05: 0, 7.25: {'Sex': {0.0: 1, 1.0: 0}}, 8.4583: 0, 11.1333: 1, 7.8542: 0, 13.0: 0, 7.225: 0, 8.0292: 1, 16.7: 1, 16.0: 1, 18.0: 0, 11.2417: 1, 15.5: 0, 21.075: 0, 21.0: 0, 21.6792: 0, 17.8: 0, 20.575: 0, 26.55: 0, 26.0: 0, 27.7208: {'Sex': {0.0: 1, 1.0: 0}}, 29.125: 0, 30.0708: {'Sex': {0.0: 1, 1.0: 0}}, 31.275: 0, 31.3875: 0, 27.75: {'Sex': {0.0: 1, 1.0: 0}}, 27.9: 0, 35.5: 0, 7.75: 0, 29.0: 1, 34.375: 0, 39.6875: 0, 34.6542: 0, 41.5792: {'Sex': {0.0: 1, 1.0: 0}}, 7.1417: 1, 36.75: 0, 7.7958: {'Age': {18.0: 0, 21.0: 1, 22.0: 0, 24.0: 0, 27.0: 1}}, 7.3125: 0, 46.9: 0, 9.0: 0, 9.5: 0, 47.1: 0, 50.0: 0, 51.8625: {'Sex': {0.0: 1, 1.0: 0}}, 10.5: 0, 53.1: {'Sex': {0.0: 1, 1.0: 0}}, 52.0: {'Sex': {0.0: 1, 1.0: 0}}, 55.0: 1, 56.4958: 0, 11.5: 0, 52.5542: 1, 56.9292: 1, 57.9792: 1, 61.9792: 0, 61.175: 0, 63.3583: 1, 61.3792: 0, 12.875: 0, 66.6: {'Sex': {0.0: 1, 1.0: 0}}, 13.5: {'Sex': {0.0: 1, 1.0: 0}}, 12.0: 1, 69.55: 0, 69.3: 1

In [59]:
# 评估模型
def evaluate(tree, feature_names, X, y):
    predictions = [predict(tree, feature_names, sample) for sample in X]
    accuracy = sum(1 for true, pred in zip(y, predictions) if true == pred) / len(y)
    return predictions, accuracy

# 评估剪枝后的决策树
_, accuracy = evaluate(decision_tree, feature_names, val_data, val_labels)
print(_[:5])
print(f"Accuracy: {accuracy:.2f}")

[0, 0, 0, 0, 1]
Accuracy: 0.79


In [60]:
# 读取测试数据
test_pd = pd.read_csv("test.csv")

# 处理缺失值
test_pd["Age"].fillna(data["Age"].mean(), inplace=True)
test_pd["Embarked"].fillna(data["Embarked"].mode()[0], inplace=True)
test_pd["Fare"].fillna(data["Fare"].mean(), inplace=True)

# 转换分类特征
test_pd["Sex"] = label_encoder.fit_transform(test_pd["Sex"])
test_pd["Embarked"] = label_encoder.fit_transform(test_pd["Embarked"].astype(str))

# 准备测试数据
val_x = test_pd[features].values.tolist()
print(trn_data[:5])
print(val_x[:5])


# 预测函数
def predict_survived(val_x, decision_tree, feature_names):
    return [predict(decision_tree, feature_names, sample) for sample in val_x]

# 进行预测
predictions = predict_survived(val_x, decision_tree, feature_names)
print(predictions)

# 将预测结果保存到submission.csv中
submission = pd.read_csv("submission.csv")
submission["Survived"] = predictions
submission.to_csv("submission.csv", index=False)

[[3.0, 1.0, 22.0, 1.0, 0.0, 7.25, 2.0], [1.0, 0.0, 38.0, 1.0, 0.0, 71.2833, 0.0], [3.0, 0.0, 26.0, 0.0, 0.0, 7.925, 2.0], [1.0, 0.0, 35.0, 1.0, 0.0, 53.1, 2.0], [3.0, 1.0, 35.0, 0.0, 0.0, 8.05, 2.0]]
[[1.0, 1.0, 37.0, 1.0, 1.0, 83.80399906, 1.0], [1.0, 1.0, 47.0, 0.0, 0.0, 43.38316529, 3.0], [3.0, 0.0, 21.0, 1.0, 1.0, 13.18401399, 3.0], [3.0, 0.0, 30.0, 0.0, 0.0, 8.694805595, 0.0], [3.0, 0.0, 10.0, 5.0, 2.0, 47.73594325, 0.0]]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
