In [7]:
# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [8]:
# Load the labelled data set (it is split into training and validation parts further down)
# NOTE(review): the directory name in both paths looks mis-encoded (mojibake) — confirm it matches the folder on disk
train_val_data = pd.read_csv('作业四/╫≈╥╡╦─/train.csv')

# Load the test data set
test_data = pd.read_csv('作业四/╫≈╥╡╦─/test.csv')

In [9]:
# Show basic information about the test set (column dtypes and non-null counts)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  380 non-null    int64  
 1   Pclass       380 non-null    int64  
 2   Name         380 non-null    object 
 3   Sex          380 non-null    object 
 4   Age          302 non-null    float64
 5   SibSp        380 non-null    int64  
 6   Parch        380 non-null    int64  
 7   Ticket       80 non-null     object 
 8   Fare         379 non-null    float64
 9   Cabin        80 non-null     object 
 10  Embarked     350 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 32.8+ KB


In [10]:
# Show basic information about the training set (column dtypes and non-null counts)
train_val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
# Identify the columns that contain NaN values in each data set.
# Per-column NaN counts for both frames.
train_val_nan_counts, test_nan_counts = (
    frame.isna().sum() for frame in (train_val_data, test_data)
)

# Keep only the columns that actually have missing values.
train_val_columns_with_nan = train_val_nan_counts.loc[lambda counts: counts > 0]
test_columns_with_nan = test_nan_counts.loc[lambda counts: counts > 0]

# Report: header line followed by the counts, training set first.
print('训练集中具有缺失值的特征且其缺失值数量', train_val_columns_with_nan, sep='\n')
print('测试集中具有缺失值的特征且其缺失值数量', test_columns_with_nan, sep='\n')


训练集中具有缺失值的特征且其缺失值数量
Age         177
Cabin       687
Embarked      2
dtype: int64
测试集中具有缺失值的特征且其缺失值数量
Age          78
Ticket      300
Fare          1
Cabin       300
Embarked     30
dtype: int64


In [12]:
# Data preprocessing: impute missing values.
# Every fill value is computed exactly once, from the raw (not yet imputed)
# training data, and then reused for both frames. Recomputing the Age mean
# after the training column has been filled can differ from the original mean
# by floating-point rounding, which would give test rows a slightly different
# imputed Age than training rows.
age_fill = train_val_data['Age'].mean()
embarked_fill = train_val_data['Embarked'].mode()[0]  # most frequent port of embarkation
fare_fill = train_val_data['Fare'].mean()

# Training set: fill Age with the mean and Embarked with the mode.
# fillna returns a new frame; rebinding the name avoids in-place mutation.
train_val_data = train_val_data.fillna({'Age': age_fill, 'Embarked': embarked_fill})

# Test set: fill with the training-set statistics (never with test-set statistics).
test_data = test_data.fillna({'Age': age_fill, 'Embarked': embarked_fill, 'Fare': fare_fill})

In [13]:
# Encode the categorical features as integers.
# Each column gets its own encoder, fitted on the training data only and then
# applied unchanged to the test data, so every category maps to the same
# integer in both frames. Re-fitting the encoder on the test data would
# renumber the categories whenever the test set holds a different set of values.
label_encoders = {}
for column in ('Sex', 'Embarked'):
    label_encoder = LabelEncoder()
    train_val_data[column] = label_encoder.fit_transform(train_val_data[column])
    # transform() raises ValueError for a category that was absent from the training data.
    test_data[column] = label_encoder.transform(test_data[column])
    # Keep the fitted encoder so the integer codes can be mapped back to labels later.
    label_encoders[column] = label_encoder

In [14]:
# Feature selection.
# PassengerId, Name and Ticket are treated as unrelated to survival, and Cabin
# has too many missing values, so none of them is used as a training feature.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Target column of the labelled data.
y_train_val = train_val_data['Survived']

# The same feature columns are taken from the labelled data and from the test data.
X_train_val, X_test = (frame[features] for frame in (train_val_data, test_data))

In [15]:
# Split the labelled data into a training part and a validation part
# (20% held out for validation, fixed random_state for a repeatable split).
split_parts = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [16]:
# Entropy of a label collection
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    The class probabilities are the relative frequencies of the distinct
    values found in ``y``.
    """
    class_counts = np.unique(y, return_counts=True)[1]  # occurrences of each distinct label
    class_probs = class_counts / class_counts.sum()  # relative frequency of each label
    weighted_logs = class_probs * np.log2(class_probs)
    return -np.sum(weighted_logs)

# Information gain of a binary split
def information_gain(y, y_left, y_right):
    """Return the entropy reduction obtained by splitting ``y`` in two.

    The result is the entropy of ``y`` minus the entropies of ``y_left`` and
    ``y_right``, each weighted by its share of the samples in ``y``.
    """
    total = len(y)
    parent_entropy = entropy(y)
    # Each child's entropy is weighted by the fraction of samples it holds.
    left_term = len(y_left) / total * entropy(y_left)
    right_term = len(y_right) / total * entropy(y_right)
    return parent_entropy - left_term - right_term

In [17]:
# Node of the decision tree
class Node:
    """One node of the decision tree.

    A node either describes a split (``feature``, ``threshold`` and the two
    children ``left`` / ``right``) or carries a prediction in ``value``.
    All attributes default to ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Prediction stored on a leaf node.
        self.value = value
        # Split definition: index of the feature and the threshold it is compared with.
        self.feature, self.threshold = feature, threshold
        # Child nodes (left child, right child).
        self.left, self.right = left, right

# Decision tree classifier (information-gain splits)
class DecisionTree:
    """Binary decision-tree classifier grown by maximising information gain.

    Every internal node tests ``x[feature] < threshold``: samples for which the
    test is true go to the left child, all other samples go to the right child.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum depth of the tree. ``None`` means no limit; ``0`` produces a
        tree that consists of a single leaf.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth  # depth limit (None = unlimited)
        self.root = None  # root Node, set by fit()

    def fit(self, X, y):
        """Grow the tree from the feature matrix ``X`` and the labels ``y``.

        ``X`` and ``y`` are converted with ``np.asarray``, so DataFrames,
        Series and nested lists are accepted in addition to arrays.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, or
            there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on zero samples")
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X`` / ``y``."""
        num_samples = X.shape[0]
        # Compare against None explicitly so that max_depth=0 is honoured as a
        # real limit instead of being treated as "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        # Stop when at most one sample is left, the labels are pure, or the depth limit is hit.
        if num_samples <= 1 or len(np.unique(y)) == 1 or depth_exhausted:
            return Node(value=self._most_common_label(y))

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:  # no threshold separates the samples
            return Node(value=self._most_common_label(y))

        left_mask = X[:, best_feature] < best_threshold
        # The complement (not ">=") keeps samples whose comparison is False for
        # any reason, e.g. NaN, on the right side, exactly as _predict routes them.
        right_mask = ~left_mask
        left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)
        return Node(best_feature, best_threshold, left_child, right_child)

    def _best_split(self, X, y):
        """Return ``(feature_index, threshold)`` of the split with the highest gain.

        Candidate thresholds are the distinct values of each feature. Ties keep
        the first candidate found; splits with zero gain are accepted. Returns
        ``(None, None)`` when no candidate leaves samples on both sides.
        """
        best_gain = -1
        split_index, split_threshold = None, None
        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            # np.unique returns sorted values; the smallest one can never have a
            # sample strictly below it, so it is skipped up front.
            for threshold in np.unique(column)[1:]:
                left_mask = column < threshold
                if not left_mask.any() or left_mask.all():  # one side would be empty
                    continue
                gain = information_gain(y, y[left_mask], y[~left_mask])
                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_threshold = threshold
        return split_index, split_threshold

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (the smallest one on ties).

        Uses ``np.unique`` so labels need not be non-negative integers.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        labels, counts = np.unique(y, return_counts=True)
        if labels.size == 0:
            raise ValueError("cannot determine the most common label of an empty label array")
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        ``X`` is converted with ``np.asarray``, so a DataFrame or a nested list
        is iterated row by row.
        """
        return np.array([self._predict(row) for row in np.asarray(X)])

    def _predict(self, inputs):
        """Walk the tree from the root for one sample and return the leaf's value.

        Raises
        ------
        AttributeError
            If the tree has not been fitted yet.
        """
        node = self.root
        if node is None:
            raise AttributeError("DecisionTree has not been fitted yet; call fit() before predict()")
        while node.value is None:  # internal nodes carry no value
            node = node.left if inputs[node.feature] < node.threshold else node.right
        return node.value

In [18]:
# Train the custom decision tree on the training split.
# The tree is fed plain NumPy arrays rather than pandas objects.
train_features, train_labels = X_train.values, y_train.values
tree = DecisionTree(max_depth=10)  # depth limit of 10
tree.fit(train_features, train_labels)

In [19]:
# Predict labels for the validation split
y_pred = tree.predict(X_val.values)

In [20]:
# Model evaluation on the validation split: accuracy, per-class report, confusion matrix
accuracy = accuracy_score(y_val.values, y_pred)
print(f"Accuracy: {accuracy}")

report_text = classification_report(y_val, y_pred)
confusion = confusion_matrix(y_val, y_pred)
print(report_text)
print('confusion_matrix:', confusion, sep='\n')

Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       105
           1       0.82      0.68      0.74        74

    accuracy                           0.80       179
   macro avg       0.81      0.79      0.79       179
weighted avg       0.81      0.80      0.80       179

confusion_matrix:
[[94 11]
 [24 50]]


In [21]:
# Predict labels for the test set
y_test_pred = tree.predict(X_test.values)

In [22]:
# Load the submission file
submission = pd.read_csv('submission.csv')


In [23]:
# Store the test-set predictions in the 'Survived' column of the submission frame
# NOTE(review): values are assigned by position — presumably submission rows are in the same order as test_data; confirm
submission['Survived'] = y_test_pred

In [26]:
# Save the predictions to submission.csv without the index column
# (this overwrites the same file the submission frame was read from)
submission.to_csv('submission.csv', index=False)