In [7]:
import math
import numpy as np
import pandas as pd
import os
import csv
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

In [8]:
def same_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [9]:
def train_valid_split(data_set, valid_ratio, seed):
    # valid_ratio是验证集比例，我们通过这个比例来划分训练集和验证集
    #一整个的数据集就是一个data_set，我们用他的长度乘以valid_ratio来规定验证集的长度
    #为了避免出现小数，我们用int()来取整,并保存在valid_set_size中
    valid_set_size = int(valid_ratio * len(data_set))
    #计算训练集大小
    train_set_size = len(data_set)-valid_set_size
    #我们用随机数来作为划分训练集和验证集的比例
    train_set, valid_set = random_split(data_set,[train_set_size, valid_set_size],generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

In [10]:
def predict(test_loader, model, device):
    #将模型设置为评估模式，这会关闭dropout和batch normalization
    model.eval()
    preds=[]
    for x in tqdm(test_loader):# 使用tqdm显示进度条
        x = x.to(device) # 将输入数据移动到device
        with torch.no_grad():# 禁用梯度运算
            pred = model(x) # 获取模型对输入数据的预测
            preds.append(pred.detach().cpu()) # 将预测结果从计算图中分离出来，转移到 CPU 上，并将其添加到 preds 列表中
    preds = torch.cat(preds, dim=0).numpy()
    return preds

In [11]:
class COVID19Dataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)
    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]
    def __len__(self):
        return len(self.x)

In [12]:
class My_Model(nn.Module):
    def __init__(self, input_dim):# input_dim是输入数据的特征维度
        super(My_Model, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),# 输入层
            nn.ReLU(),# 激活函数
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
    def forward(self,x):
        x = self.layers(x) # 前向传播
        x = x.squeeze(1) # 去掉最外层的维度 (B,1) -> (B)
        return x


In [14]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
train_data = pd.read_csv('./covid.train.csv')
train_data.head()
# iloc[]函数，属于pandas库，即对数据进行位置索引，从而在数据表中提取出相应的数据。
x_data, y_data= train_data.iloc[:, 0:88], train_data.iloc[:, 88]

k = 17 # 选择k个相关性最强的特征，是超参之一
# 调用sklearn中的SelectKBest函数
'''
  SelectKBest:
    score_func: 评估指标，可选值包括：卡方检验（chi2）、互信息（mutual_info_classif、mutual_info_regression）等，默认为卡方检验。
    k: 选择排名靠前的k个特征，默认为10。

'''

selector = SelectKBest(score_func=f_regression, k=k)
result = selector.fit(x_data, y_data)

# result.scores_ includes scores for each features
# np.argsort sort scores in ascending order by index, we reverse it to make it descending.
idx = np.argsort(result.scores_)[::-1] # 每个feature关于score_func的倒序排序的index
print(f'Top {k} Best feature score ')
print(result.scores_[idx[:k]]) # 选取前k个特征

print(f'\nTop {k} Best feature index ')
print(idx[:k])

print(f'\nTop {k} Best feature name')
print(x_data.columns[idx[:k]])

selected_idx = list(np.sort(idx[:k]))
print(selected_idx)
print(x_data.columns[selected_idx])

Top 17 Best feature score 
[876214.8322447  335591.6630519  311015.24307534 205326.08451419
 181409.55205618 139190.43353702  17703.25449295  17532.56653353
  17360.80858132  17153.20445066  17005.71942696  16802.10300361
  16289.94077877  16133.22580844  11089.09711071  11031.55168611
  10775.62521955]

Top 17 Best feature index 
[72 56 73 57 40 41 87 86 71 70 55 54 39 38 85 69 53]

Top 17 Best feature name
Index(['hh_cmnty_cli.2', 'hh_cmnty_cli.1', 'nohh_cmnty_cli.2',
       'nohh_cmnty_cli.1', 'hh_cmnty_cli', 'nohh_cmnty_cli', 'ili.3', 'cli.3',
       'ili.2', 'cli.2', 'ili.1', 'cli.1', 'ili', 'cli', 'tested_positive.2',
       'tested_positive.1', 'tested_positive'],
      dtype='object')
[np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(69), np.int64(70), np.int64(71), np.int64(72), np.int64(73), np.int64(85), np.int64(86), np.int64(87)]
Index(['cli', 'ili', 'hh_cmnty_cli', 'nohh_cmnty_cli', 'tes

In [15]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    y_train, y_valid = train_data[:,-1], valid_data[:,-1] # 获取标签
    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data # 获取特征
    if select_all:
        feat_idx = list(range(raw_x_train.shape[1])) # 选择所有特征
    else:
        feat_idx = [0,1,2,3,4] # 选择部分特征
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid

In [16]:
def trainer(train_loader, valid_loader, model, config, device):
    # 使用均方误差作为损失函数, reduction一共有三个可选值
    #none表示不进行任何缩减，返回每个样本的损失；'mean'表示计算所有样本损失的的均值；sum表示返回所有样本的损失之和
    criterion = nn.MSELoss(reduction = 'mean')
    # optimizer是优化器实例，将用于更新模型的参数以最小化损失函数
    # SGD是pytorch提供的随机梯度下降优化器； model.parameters()会返回模型中所有需要优化的参数
    # lr是学习率，是一个超参数，这里使用config中的学习率
    # momentum是动量因子，用于加速 SGD 优化器收敛，动量项可以帮助优化器在损失函数的凹槽中稳定下来，避免震荡，并且在陡峭方向上加速收敛
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)
    # Writer是一个用于记录训练过程的类，./model是record的保存路径
    Writer = SummaryWriter()
    if not os.path.isdir('./models'):
        os.mkdir('./models')
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0
    for epoch in range(n_epochs):
        model.train() # 将模型设置为训练模式
        loss_record=[]
        # 使用tqdm显示进度条
        train_pbar = tqdm(train_loader, position=0, leave=True)
        for x, y in train_pbar:
            optimizer.zero_grad() # set gradient to zero
            x, y = x.to(device), y.to(device) # 将输入数据和标签移动到device
            pred = model(x) # 获取模型对输入数据的预测
            loss = criterion(pred, y) # 计算损失
            loss.backward()# 反向传播
            optimizer.step() # 更新参数,执行一步优化, (参数 = 参数 - 学习率 * 参数的梯度), 优化器会自动更新模型的参数
            step +=1
            loss_record.append(loss.detach().item())# 将损失记录到loss_record中,detach()是为了防止内存泄漏,因为loss是一个计算图,我们只需要数值,item()是将tensor转换为python数值
            # epoch是当前的轮次，n_epochs是总的轮次
            # loss是当前轮次的损失
            # set_description是设置进度条的描述
            # set_postfix是设置进度条的后缀
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})
        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train',mean_train_loss, step)

        model.eval() # set your model to evaluation mode
        loss_record = []
        for x, y in valid_loader:# 从验证数据集中批量加载数据
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)
            loss_record.append(loss.item())
        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss:{mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'])# save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0 # reset early_stop_count, because we find a better model
        else:
                early_stop_count += 1
        
        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

In [17]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 5201314,
    'select_all': True,# whether to use all features
    'valid_ratio': 0.2,
    'n_epochs': 3000,
    'batch_size': 256,
    'learning_rate': 1e-5,
    'early_stop': 400, #if model has not been improved for 400 epochs, we will stop training
    'save_path': './models/model.ckpt'
}

In [18]:
same_seed(config['seed'])
train_data, test_data = pd.read_csv('./covid.train.csv').values, pd.read_csv('./covid.test.csv').values
train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

#print out the data size
print(f"""train_data size: {train_data.shape}
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}""")

#select features
x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])

#print out the number of features
print(f'number of features: {x_train.shape[1]}')

train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train),COVID19Dataset(x_valid, y_valid), COVID19Dataset(x_test)

# pytorch data loader loads pytorch dataset into batches
#shuffle=True means that the data will be shuffled before each epoch,shuffle即打乱顺序
#pin_memory=True means that the data loader will copy Tensors into CUDA pinned memory before returning them
#即：在将数据传递给 GPU 进行计算之前，首先将它们复制到固定内存中，以提高数据传输速度。
#固定内存是一种特殊类型的主机内存，它可以更快地与 GPU 内存进行数据交换。将数据复制到固定内存中可以加速从主机内存到 GPU 内存的数据传输。
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

train_data size: (2160, 118)
valid_data size: (539, 118)
test_data size: (1078, 117)
number of features: 117


In [19]:
model = My_Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, model, config, device)

  0%|          | 0/9 [00:00<?, ?it/s]


RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`