## load tantinic dataset 

In [1]:
import torch

# 检查 GPU 是否可用
print(torch.cuda.is_available())  # 返回 True 表示可用
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 查看当前 GPU 设备名称
print(torch.cuda.get_device_name(0))  # 例如: 'NVIDIA GeForce RTX 3090'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

import time

True
Using device: cuda:0
NVIDIA GeForce 840M
Using device: cuda:0


In [2]:
# import torch

# import matplotlib.pyplot as plt
# from torch.utils.data import Dataset #抽象类
# from torch.utils.data import DataLoader
# import numpy as np
# import pandas as pd
# from scipy.stats import mode
# class DiabetesDataset(Dataset):
#     def __init__(self,filepath):
#         df = pd.read_csv(filepath,header= 0)  # 自动处理混合类型
#         xy = df.values  # 转换为numpy数组（dtype=object）
#         print(df.dtypes)  # 查看每列的数据类型
        
#         #age
#         age_col = xy[:, 5].astype(float)
#         mean_age = np.nanmean(age_col)
#         print(f"年龄均值: {mean_age:.1f}") 
#         xy[:, 5] = np.where(np.isnan(age_col), mean_age, age_col)
        
#         gender_map = {'male': 0, 'female': 1,'nan' : 2}
#         xy[:, 4] = np.array([gender_map[x] for x in xy[:, 4]])
        
#         # Embarked
#         empty_mask = pd.isna(xy[:, 11]) | (xy[:, 11] == "")
#         embarked_mode = pd.Series(xy[:, 11]).mode()[0]  # 输出 'S'
#         xy[empty_mask, 11] = embarked_mode

#         Embarked_map = {'S': 0, 'C': 1,'Q': 2 }
#         xy[:, 11] = np.array([Embarked_map[x] for x in xy[:, 11]])
        
#         self.x_data = torch.from_numpy(xy[:, [2,4,5,6,7,9,11]].astype(np.float32))
#         # print (self.x_data)
#         self.y_data = torch.from_numpy(xy[:, [1]].astype(np.float32))
#         # print("y_data = ",self.y_data)
#         self.len = xy.shape[0]
        
#     def __getitem__(self,index):
#         return self.x_data[index],self.y_data[index]
#     def __len__(self):
#         return self.len
# class Model(torch.nn.Module):
#     def __init__(self):
#         super(Model, self).__init__()
#         self.linear1 = torch.nn.Linear(7, 6)
#         self.linear2 = torch.nn.Linear(6, 4)
#         self.linear3 = torch.nn.Linear(4, 1)
#         self.sigmoid = torch.nn.Sigmoid()
#         self.relu = torch.nn.ReLU()

#     def forward(self, x):
#         x = self.relu( self.linear1(x))
#         x = self.relu( self.linear2(x))
#         x = self.sigmoid( self.linear3(x))
#         return x


# model = Model().to(device)

# criterion = torch.nn.BCELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [3]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from scipy.stats import mode

# 设置设备
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class DiabetesDataset(Dataset):
    def __init__(self, filepath):
        df = pd.read_csv(filepath, header=0)
        xy = df.values
        
        # 处理年龄列（填充缺失值）
        age_col = xy[:, 5].astype(float)
        mean_age = np.nanmean(age_col)
        print(f"年龄均值: {mean_age:.1f}")
        xy[:, 5] = np.where(np.isnan(age_col), mean_age, age_col)
        
        # 处理性别列
        gender_map = {'male': 0, 'female': 1, 'nan': 2}
        xy[:, 4] = np.array([gender_map.get(str(x).lower(), 2) for x in xy[:, 4]])
        
        # 处理Embarked列
        empty_mask = pd.isna(xy[:, 11]) | (xy[:, 11] == "")
        embarked_mode = pd.Series(xy[:, 11]).mode()[0]
        xy[empty_mask, 11] = embarked_mode
        Embarked_map = {'S': 0, 'C': 1, 'Q': 2}
        xy[:, 11] = np.array([Embarked_map.get(str(x).upper(), 0) for x in xy[:, 11]])
        
        # 转换为张量并移动到设备
        self.x_data = torch.from_numpy(xy[:, [2,4,5,6,7,9,11]].astype(np.float32)).to(device)
        self.y_data = torch.from_numpy(xy[:, [1]].astype(np.float32)).to(device)
        self.len = xy.shape[0]
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(7, 6)
        self.linear2 = nn.Linear(6, 4)
        self.linear3 = nn.Linear(4, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        return x


Using device: cuda:0


In [4]:
model = Model().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [7]:
# 训练循环示例
def train(dataloader, model, criterion, optimizer, epochs=100):
    batch_size= 32
    epoch_list = []
    cost_list =[]
    model.train()
    for epoch in range(epochs):
        loss_sum =0
        start_time = time.time()
        for batch_idx, (data, target) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss_sum += loss.item()
            loss.backward()
            optimizer.step()
            
            # if batch_idx % 10 == 0:
            #     print(f'Epoch: {epoch+1}/{epochs} | Batch: {batch_idx} | Loss: {loss.item():.4f}')
        elapsed_time = time.time() - start_time
        if epoch % 100 ==0:
                print(f"执行耗时: {elapsed_time:.4f} 秒",'epoch = ',epoch,'loss = ',loss_sum / (len(dataset) / batch_size + 1))
       
        cost_list.append(loss_sum / (len(dataset) / batch_size + 1))
        epoch_list.append(epoch)
        return epoch_list,cost_list

# 使用示例
if __name__ == "__main__":
    path = '../dataset/titanic/train.csv'
    dataset = DiabetesDataset(path)
    train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
    epoch_list , cost_list = train(train_loader, model, criterion, optimizer,1000000000)
    plt.figure(figsize=(8, 4))  # 设置图形大小
    plt.plot(epoch_list, cost_list, marker='o', linestyle='-', color='b')
    
    # 添加标题和标签
    plt.title('training')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    
    # 显示图形
    plt.grid(True)  # 添加网格
    plt.show()


年龄均值: 29.7
执行耗时: 0.1309 秒 epoch =  0 loss =  0.6467756951229802
执行耗时: 0.0989 秒 epoch =  100 loss =  0.5988327204083311
执行耗时: 0.1059 秒 epoch =  200 loss =  0.5493775271646122
执行耗时: 0.1119 秒 epoch =  300 loss =  0.5056565575067555
执行耗时: 0.1049 秒 epoch =  400 loss =  0.4739528885383647
执行耗时: 0.0959 秒 epoch =  500 loss =  0.4787558052418421
执行耗时: 0.1109 秒 epoch =  600 loss =  0.46403504037908716
执行耗时: 0.1329 秒 epoch =  700 loss =  0.46936594548747185
执行耗时: 0.0979 秒 epoch =  800 loss =  0.4266088987869147
执行耗时: 0.0950 秒 epoch =  900 loss =  0.4471603705591138
执行耗时: 0.0929 秒 epoch =  1000 loss =  0.4418496351975661
执行耗时: 0.0959 秒 epoch =  1100 loss =  0.43634525538782387
执行耗时: 0.0965 秒 epoch =  1200 loss =  0.4209283568360565
执行耗时: 0.1010 秒 epoch =  1300 loss =  0.40849276611890206
执行耗时: 0.0949 秒 epoch =  1400 loss =  0.4104454230642267
执行耗时: 0.0969 秒 epoch =  1500 loss =  0.4187244392552071
执行耗时: 0.0909 秒 epoch =  1600 loss =  0.40262846569586164
执行耗时: 0.0940 秒 epoch =  1700 loss =  0.41243

KeyboardInterrupt: 

In [12]:
class TitanicDataset(Dataset):
    def __init__(self,filepath):
        df = pd.read_csv(filepath,header= 0)  # 自动处理混合类型
        xy = df.values  # 转换为numpy数组（dtype=object）
        print(df.dtypes)  # 查看每列的数据类型
        
        #age
        age_col = xy[:, 4].astype(float)
        mean_age = np.nanmean(age_col)
        print(f"年龄均值: {mean_age:.1f}") 
        xy[:, 4] = np.where(np.isnan(age_col), mean_age, age_col)
        gender_map = {'male': 0, 'female': 1,'nan' : 2}
        xy[:, 3] = np.array([gender_map[x] for x in xy[:, 3]])
        
        # Embarked
        empty_mask = pd.isna(xy[:, 10]) | (xy[:, 10] == "")
        embarked_mode = pd.Series(xy[:, 10]).mode()[0]  # 输出 'S'
        xy[empty_mask, 10] = embarked_mode

        Embarked_map = {'S': 0, 'C': 1,'Q': 2 }
        xy[:, 10] = np.array([Embarked_map[x] for x in xy[:, 10]])
        
        self.x_data = torch.from_numpy(xy[:, [1,3,4,5,6,8,10]].astype(np.float32)).to(device)
        # print (self.x_data)
        # self.y_data = torch.from_numpy(xy[:, [1]].astype(np.float32))
        # # print("y_data = ",self.y_data)
        self.len = xy.shape[0]
        
    def __getitem__(self,index):
        return self.x_data[index]
    def __len__(self):
        return self.len

In [13]:
path = '../dataset/titanic/test.csv'
test_data = TitanicDataset(path)
test_loader = DataLoader(dataset=test_data,batch_size= 32,shuffle=False,num_workers=0)

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
年龄均值: 30.3


In [14]:
for i,data in enumerate(test_loader,0):
    # prepare data
    inputs = data
    # forward
    y_pred  = model(inputs)
    print ('y_pred = ',y_pred)

y_pred =  tensor([[0.0866],
        [0.3255],
        [0.0606],
        [0.0709],
        [0.2782],
        [0.6919],
        [0.7030],
        [0.1725],
        [0.6814],
        [0.1324],
        [0.0723],
        [0.4311],
        [0.9921],
        [0.0631],
        [0.9952],
        [0.9691],
        [0.1041],
        [0.0817],
        [0.3248],
        [0.2967],
        [0.5697],
        [0.2252],
        [0.9972],
        [0.6447],
        [0.9964],
        [0.0764],
        [0.9897],
        [0.0815],
        [0.5359],
        [0.0991],
        [0.0641],
        [0.1513]], device='cuda:0', grad_fn=<SigmoidBackward0>)
y_pred =  tensor([[0.3014],
        [0.3841],
        [0.5459],
        [0.0819],
        [0.3337],
        [0.4800],
        [0.0694],
        [0.1698],
        [0.0640],
        [0.4647],
        [0.0714],
        [0.9308],
        [0.9952],
        [0.0726],
        [0.3100],
        [0.0872],
        [0.9832],
        [0.3926],
        [0.3134],
        [0.2720]