In [26]:
import os
import warnings
# 禁用警告
warnings.filterwarnings("ignore")
device="cuda"
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import gc
import time
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import GroupKFold
NUM_WORKERS = 4


In [27]:
DATA_PATH = r"D:\data\kaggle\ventilator-pressure-prediction\\"
sub = pd.read_csv(DATA_PATH + 'sample_submission.csv')
df_train = pd.read_csv(DATA_PATH + 'train.csv')
df_test = pd.read_csv(DATA_PATH + 'test.csv')

In [28]:
df_train.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


In [29]:
df_train.shape

(6036000, 8)

In [30]:
df_train2 = df_train

In [40]:
import torch
from torch.utils.data import Dataset

class VentilatorDataset(Dataset):
    def __init__(self, df):
        if "pressure" not in df.columns:
            df['pressure'] = 0

        self.df = df.groupby('breath_id').agg(list).reset_index()
        # print(self.df)
        self.prepare_data()
                
    def __len__(self):
        return self.df.shape[0]
    
    def prepare_data(self):
        self.pressures = np.array(self.df['pressure'].values.tolist())
        
        rs = np.array(self.df['R'].values.tolist())
        cs = np.array(self.df['C'].values.tolist())
        u_ins = np.array(self.df['u_in'].values.tolist())
        
        self.u_outs = np.array(self.df['u_out'].values.tolist())
        # print(rs.shape)
        # print(cs.shape)
        # print(u_ins.shape)
        # print(self.u_outs.shape)
        # print(rs[:, None].shape)
        # print(cs[:, None].shape)
        # print(u_ins[:, None].shape)
        # print(np.cumsum(u_ins, 1)[:, None].shape)
        # print(self.u_outs[:, None].shape)
        self.inputs = np.concatenate([
            rs[:, None], 
            cs[:, None], 
            u_ins[:, None], 
            np.cumsum(u_ins, 1)[:, None],
            self.u_outs[:, None]
        ], 1).transpose(0, 2, 1)
        print(self.inputs.shape)
        print(type(self.inputs))
    def __getitem__(self, idx):
        data = {
            "input": torch.tensor(self.inputs[idx], dtype=torch.float),
            "u_out": torch.tensor(self.u_outs[idx], dtype=torch.float),
            "p": torch.tensor(self.pressures[idx], dtype=torch.float),
        }
        
        return data

In [41]:
# train_dataset = VentilatorDataset(df_train)
# val_dataset = VentilatorDataset(df_val)
test_dataset = VentilatorDataset(df_test)

(50300, 80, 5)
<class 'numpy.ndarray'>


In [None]:
# Data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

In [20]:

class RNNModel(nn.Module):
    def __init__(
        self,
        input_dim=5,
        lstm_dim=256,
        dense_dim=256,
        logit_dim=256,
        num_classes=1,
    ):
        super().__init__()

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, dense_dim // 2),
            nn.ReLU(),
            nn.Linear(dense_dim // 2, dense_dim),
            nn.ReLU(),
        )

        self.lstm = nn.LSTM(
            input_size=dense_dim, 
            hidden_size=lstm_dim, 
            batch_first=True, 
            bidirectional=True)

        self.logits = nn.Sequential(
            nn.Linear(lstm_dim * 2, logit_dim),
            nn.ReLU(),
            nn.Linear(logit_dim, num_classes),
        )

    def forward(self, x):
        features = self.mlp(x)
        features, _ = self.lstm(features)
        pred = self.logits(features)
        return pred

In [21]:
model = RNNModel(
        input_dim=5,
        lstm_dim=256,
        dense_dim=256,
        logit_dim=256,
        num_classes=1,)

In [None]:
class VentilatorLoss(nn.Module):
    """
    Directly optimizes the competition metric
    """
    def __call__(self, preds, y, u_out):
        w = 1 - u_out
        mae = w * (y - preds).abs()
        mae = mae.sum(-1) / w.sum(-1)

        return mae

In [None]:
loss_fct = VentilatorLoss()

In [None]:
lr = 0.001
optimizer = getattr(torch.optim, "Adam")(model.parameters(), lr=lr)

In [None]:
def compute_metric(df, preds):
    """
    Metric for the problem, as I understood it.
    """
    
    y = np.array(df['pressure'].values.tolist())
    w = 1 - np.array(df['u_out'].values.tolist())
    
    assert y.shape == preds.shape and w.shape == y.shape, (y.shape, preds.shape, w.shape)
    
    mae = w * np.abs(y - preds)
    mae = mae.sum() / w.sum()
    
    return mae

In [None]:
epochs = 10
for epoch in range(epochs):
    model.train()
    model.zero_grad()
    start_time = time.time()

    avg_loss = 0
    for data in train_loader:
        pred = model(data['input'].to(device)).squeeze(-1)

        loss = loss_fct(
            pred,
            data['p'].to(device),
            data['u_out'].to(device),
        ).mean()
        loss.backward()
        avg_loss += loss.item() / len(train_loader)

        optimizer.step()

        for param in model.parameters():
            param.grad = None

    model.eval()
    mae, avg_val_loss = 0, 0
    preds = []

    with torch.no_grad():
        for data in val_loader:
            pred = model(data['input'].to(device)).squeeze(-1)

            loss = loss_fct(
                pred.detach(), 
                data['p'].to(device),
                data['u_out'].to(device),
            ).mean()
            avg_val_loss += loss.item() / len(val_loader)

            preds.append(pred.detach().cpu().numpy())
    
    preds = np.concatenate(preds, 0)
    mae = compute_metric(val_Data.df, preds)

    elapsed_time = time.time() - start_time
    
    elapsed_time = elapsed_time
    print(
        f"Epoch {epoch + 1:02d}/{epochs:02d} \t lr={lr:.1e}\t t={elapsed_time:.0f}s \t"
        f"loss={avg_loss:.3f}",
        end="\t",
    )

    if (epoch + 1 >= 0) or (epoch + 1 == epochs):
        print(f"val_loss={avg_val_loss:.3f}\tmae={mae:.3f}")
    else:
        print("")

KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "e:\conda\envs\py39_tf\lib\site-packages\torch\utils\data\_utils\worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "e:\conda\envs\py39_tf\lib\site-packages\torch\utils\data\_utils\fetch.py", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "e:\conda\envs\py39_tf\lib\site-packages\torch\utils\data\_utils\fetch.py", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
KeyError: 0
