In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn

# Load the raw Titanic training split plus the pre-built feature / label tables.
train_csv = pd.read_csv('titanic/train.csv')
# The 'Unnamed: 0' column is discarded right after loading
# (presumably a row index written out by an earlier to_csv call - verify against the export step).
features_csv = pd.read_csv('all_features.csv').drop(columns=['Unnamed: 0'])
labels_csv = pd.read_csv('all_labels.csv').drop(columns=['Unnamed: 0'])
# Cabin has 187 distinct values, so one-hot encoding it adds 187 columns,
# which is still fewer than the number of training rows.
features_csv['Cabin'].value_counts()

Cabin
Missing            1014
C23 C25 C27           6
B57 B59 B63 B66       5
G6                    5
F33                   4
                   ... 
A14                   1
E63                   1
E12                   1
E38                   1
C105                  1
Name: count, Length: 187, dtype: int64

In [22]:
# One-hot encode the Cabin feature: get_dummies expands the non-numeric columns
# into integer 0/1 indicator columns and leaves the numeric columns untouched.
features_csv = pd.get_dummies(
    data=features_csv,
    dummy_na=False,  # no separate NaN indicator column is generated
    dtype=int,       # indicators are stored as 0/1 integers instead of booleans
)
features_csv

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,IsAlone,Cabin_A10,Cabin_A11,...,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_Missing,Cabin_T
0,3,0,1,1,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,1,1,0,1.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,1,0,0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,1,1,1,0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,0,1,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,1,0,0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1305,1,1,1,0,0,2.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1306,3,0,1,0,0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1307,3,0,1,0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [23]:
# Convert the pandas data to float tensors.
# Pick the device once and fall back to the CPU when CUDA is unavailable,
# so the notebook also runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Rows [0, n_train) of features_csv are used as training features, the remaining rows as test features.
n_train = train_csv.shape[0]
train_features = torch.tensor(data=features_csv[:n_train].values, device=device, dtype=torch.float)
# NOTE(review): assumes labels_csv holds exactly one label row per training row - confirm.
train_labels = torch.tensor(data=labels_csv.values, device=device, dtype=torch.float)
test_features = torch.tensor(data=features_csv[n_train:].values, device=device, dtype=torch.float)
train_features

tensor([[3., 0., 1.,  ..., 0., 1., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [3., 1., 1.,  ..., 0., 1., 0.],
        ...,
        [3., 1., 1.,  ..., 0., 1., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [3., 0., 1.,  ..., 0., 1., 0.]], device='cuda:0')

In [None]:
class LogisticRegression(nn.Module):
    """Logistic-regression classifier: one linear layer that returns raw logits.

    No sigmoid / softmax is applied in ``forward``; the loss function is
    expected to consume the logits directly.

    Args:
        in_features: Number of input features per sample. When ``None``
            (the default), the width of the notebook-level ``train_features``
            tensor is used, which matches the original behaviour.
        out_features: Number of logits produced per sample. Defaults to 2.
    """
    def __init__(self, in_features: "int | None" = None, out_features: int = 2):
        """Build the single linear layer.

        Args:
            in_features: Input width; ``None`` falls back to ``train_features.shape[1]``.
            out_features: Output width (number of logits).
        """
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the layer from the global training tensor.
            in_features = train_features.shape[1]
        # nn.Linear + BCEWithLogitsLoss is the recommended pairing for binary targets:
        # that loss fuses Sigmoid and binary cross-entropy, avoiding the numerical
        # instability of applying a separate Sigmoid first.
        # NOTE(review): BCEWithLogitsLoss needs targets with the same shape as the logits,
        # so a single 0/1 label column pairs with out_features=1; the default of two
        # logits pairs with nn.CrossEntropyLoss and integer class labels - confirm
        # against the training cell.
        self.net = nn.Sequential(nn.Linear(in_features=in_features, out_features=out_features))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass.

        Args:
            x (torch.Tensor): Feature tensor whose last dimension equals the layer's input width.

        Returns:
            torch.Tensor: Raw logits, with ``out_features`` values along the last dimension.
        """
        return self.net(x)
# Smoke-test the model: run a single training sample through an untrained network.
net = LogisticRegression()
# Place the model on whatever device the features live on, so the model and its
# input can never end up on different devices.
net.to(train_features.device)
net.eval()
with torch.no_grad():  # inference only, no gradient tracking needed
    # `logist` holds the raw logits for the first training row.
    logist = net(train_features[0])
logist

tensor([-0.0519, -0.2162], device='cuda:0')