In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# Read the raw training table and the feature/label tables from disk.
train_csv = pd.read_csv('titanic/train.csv')
# Drop the 'Unnamed: 0' column from both the feature and the label table.
features_csv = pd.read_csv('all_features.csv').drop(columns='Unnamed: 0')
labels_csv = pd.read_csv('all_labels.csv').drop(columns='Unnamed: 0')
# Cabin would add 187 dimensions, which is fewer than the amount of training data.
features_csv['Cabin'].value_counts()

Cabin
Missing            1014
C23 C25 C27           6
B57 B59 B63 B66       5
G6                    5
F33                   4
                   ... 
A14                   1
E63                   1
E12                   1
E38                   1
C105                  1
Name: count, Length: 187, dtype: int64

In [2]:
# One-hot encode the Cabin feature (missing values get no indicator column of their own).
features_csv = pd.get_dummies(data=features_csv, dtype=int, dummy_na=False)
features_csv

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,IsAlone,Cabin_A10,Cabin_A11,...,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_Missing,Cabin_T
0,3,0,1,1,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,1,1,0,1.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,1,0,0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,1,1,1,0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,0,1,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,1,0,0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1305,1,1,1,0,0,2.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1306,3,0,1,0,0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1307,3,0,1,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
# Convert the tables to float tensors on the GPU.
# The first len(train_csv) rows of features_csv are used as training features,
# the remaining rows as test features.
train_features = torch.tensor(
    features_csv.iloc[:len(train_csv)].to_numpy(), dtype=torch.float, device="cuda"
)
train_labels = torch.tensor(labels_csv.to_numpy(), dtype=torch.float, device="cuda")
test_features = torch.tensor(
    features_csv.iloc[len(train_csv):].to_numpy(), dtype=torch.float, device="cuda"
)
train_features

tensor([[3., 0., 1.,  ..., 0., 1., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [3., 1., 1.,  ..., 0., 1., 0.],
        ...,
        [3., 1., 1.,  ..., 0., 1., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [3., 0., 1.,  ..., 0., 1., 0.]], device='cuda:0')

In [None]:
# Hyper-parameters
num_epochs = 100
lr = 0.01
batch_size = 100

In [None]:
# Build the training dataset and a shuffling loader over it.
features_dataset = TensorDataset(train_features, train_labels)
# Use the batch_size hyper-parameter instead of a hard-coded 100 so that
# changing the hyper-parameter cell actually takes effect here.
features_dataloader = DataLoader(dataset=features_dataset, batch_size=batch_size, shuffle=True)

In [51]:
class LogisticRegression(nn.Module):
    """Logistic-regression binary classifier that outputs raw logits.

    The model is a single linear layer with one output unit. ``forward`` does
    not apply a sigmoid, so the output should be paired with
    ``BCEWithLogitsLoss`` (or passed through a sigmoid by the caller).
    """
    def __init__(self, in_features: "int | None" = None):
        """Create the linear layer and initialise its weights.

        Args:
            in_features: Number of input features. When omitted, the width of
                the global ``train_features`` tensor is used, so the existing
                ``LogisticRegression()`` call keeps working.
        """
        super().__init__()
        if in_features is None:
            # Backward-compatible default: infer the width from the notebook's
            # training tensor.
            in_features = train_features.shape[1]
        # nn.Linear + BCEWithLogitsLoss is recommended: the loss fuses the
        # sigmoid with binary cross-entropy, avoiding the numerical
        # instability a separate sigmoid can cause.
        self.net = nn.Linear(in_features=in_features, out_features=1)
        # Initialise the weights in place; normal_ mutates the parameter, so
        # no re-assignment is needed.
        nn.init.normal_(self.net.weight, mean=0.0, std=0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass.

        Args:
            x (torch.Tensor): Input features; the last dimension must match
                the layer's ``in_features``.

        Returns:
            torch.Tensor: Raw logits with a last dimension of size 1.
        """
        return self.net(x)
# Model smoke test: run the first two training rows through a fresh model.
net = LogisticRegression().to("cuda")
net.eval()
with torch.no_grad():
    logist = net(train_features[:2])
logist

tensor([[ 0.2583],
        [-0.0845]], device='cuda:0')

In [59]:
def calculate_accuracy(net: nn.Module, datasets: DataLoader) -> float:
    """Compute the classification accuracy of a binary-logit model.

    The model is moved to the GPU when CUDA is available (CPU otherwise) and
    switched to eval mode; it is left in eval mode on return.

    Args:
        net (nn.Module): Model that maps a feature batch to one logit per sample.
        datasets (DataLoader): Iterable yielding ``(features, labels)`` batches.

    Returns:
        float: Fraction of samples whose thresholded prediction equals the label.

    Raises:
        ZeroDivisionError: If ``datasets`` yields no samples.
    """
    # Prefer the GPU as before, but do not crash on machines without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(device)
    net.eval()

    correct_predictions, num_examples = 0, 0  # running correct count / sample count
    with torch.no_grad():
        for features, labels in datasets:
            features, labels = features.to(device), labels.to(device)
            # Raw model output
            logits = net(features)
            # Convert to probabilities
            probs = torch.sigmoid(logits)
            # Predicted class: probability > 0.5 -> 1, otherwise -> 0
            preds = (probs > 0.5).float()
            # Give labels the same shape as the predictions so that 1-D labels
            # are compared element-wise instead of broadcasting to (N, N).
            labels = labels.reshape(preds.shape)
            num_examples += preds.shape[0]
            # Number of correct predictions in this batch
            correct_predictions += (preds == labels).sum().item()
    # Overall accuracy
    return correct_predictions / num_examples

# Accuracy of the current model on the training loader.
calculate_accuracy(net, features_dataloader)

0.6161616161616161

In [58]:
def calculate_loss(net: nn.Module, datasets: DataLoader) -> float:
    """Compute the mean per-sample binary cross-entropy loss.

    The model is moved to the GPU when CUDA is available (CPU otherwise) and
    switched to eval mode; it is left in eval mode on return.

    Args:
        net (nn.Module): Model that maps a feature batch to one logit per sample.
        datasets (DataLoader): Iterable yielding ``(features, labels)`` batches.

    Returns:
        float: Summed BCE-with-logits loss divided by the number of samples.

    Raises:
        ZeroDivisionError: If ``datasets`` yields no samples.
    """
    # Prefer the GPU as before, but do not crash on machines without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(device)
    net.eval()

    total_loss, num_examples = 0.0, 0
    with torch.no_grad():
        for features, labels in datasets:
            features, labels = features.to(device), labels.to(device)
            logits = net(features)
            targets = labels.reshape(logits.shape).to(dtype=logits.dtype)
            # Accumulate the SUM of the per-sample losses. Adding per-batch
            # means and then dividing by the sample count would shrink the
            # result by roughly the batch size.
            total_loss += nn.functional.binary_cross_entropy_with_logits(
                logits, targets, reduction="sum"
            ).item()
            num_examples += logits.shape[0]
    # Mean loss per sample
    return total_loss / num_examples

# Mean loss of the current model on the training loader.
calculate_loss(net, features_dataloader)

nan

In [61]:
from tqdm.notebook import tqdm

def train(net: nn.Module, datasets: DataLoader, lr: float, num_epochs: int) -> tuple[list[float], list[float]]:
    """Train the model with SGD and BCE-with-logits loss.

    After every epoch the accuracy and mean loss are evaluated on the same
    ``datasets`` loader and recorded.

    Args:
        net (nn.Module): Model to train; updated in place.
        datasets (DataLoader): Iterable yielding ``(features, labels)`` batches.
        lr (float): Learning rate for SGD.
        num_epochs (int): Number of passes over ``datasets``.

    Returns:
        tuple[list[float], list[float]]: ``(accuracy, losses)``, each holding
        one value per epoch.
    """
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()
    progress_bar = tqdm(range(num_epochs), desc="训练中")

    accuracy, losses = [], []

    for _ in progress_bar:
        # calculate_accuracy / calculate_loss switch the model to eval mode,
        # so training mode has to be restored at the start of every epoch.
        net.train()
        for features, labels in datasets:
            optimizer.zero_grad()
            logits = net(features)
            loss: torch.Tensor = criterion(logits, labels)
            loss.backward()
            optimizer.step()

        # Per-epoch metrics, evaluated on the training loader itself.
        accuracy.append(calculate_accuracy(net=net, datasets=datasets))
        mean_loss = calculate_loss(net=net, datasets=datasets)
        losses.append(mean_loss)

        progress_bar.set_postfix(loss=mean_loss)
    return accuracy, losses
# NOTE(review): lr=0.001 here differs from lr=0.01 in the hyper-parameter cell — confirm which is intended.
accuracy, losses = train(net, features_dataloader, lr=0.001, num_epochs=100)

训练中:   0%|          | 0/100 [00:00<?, ?it/s]