# Kaggle Competition
## Google Brain - Ventilator Pressure Prediction
### Simulate a ventilator connected to a sedated patient's lung

<https://www.kaggle.com/c/ventilator-pressure-prediction>

I can take inspiration from <https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter>

## Notes

- maybe from test `breath_id` I can infer the type of profile
- I can predict in the frequency domain, with a fourier transform

In [2]:
# Basic
import numpy as np
import pandas as pd
import pickle
import gzip
from pathlib import Path

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

# Various regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# PyTorch regressor
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

# Visualization
import ipywidgets
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm

input_path = Path('input')
data_path = Path('data')
plot_path = Path('plots')
model_path = Path('models')


# Preprocessing

In [4]:
def flatten_df(df, time_delta_scaler, pressure_scaler, should_fit):
    # Remove the expiration phase
    df = df[((df['id'] - 1) % 80) < 32].copy()

    # adding time_delta
    df['time_delta'] = df['time_step'] - df.groupby('breath_id')['time_step'].shift(1)
    df['time_delta'] = df['time_delta'].clip(upper=0.04) # some time_delta are really huge, probably capping them is a good idea
    df['time_delta'] = df['time_delta'].fillna(df['time_delta'].mean()) # set first time-delta as mean

    if should_fit:
        # Prepare for rescaling. should be done only on training, but meh.
        time_delta_scaler.fit(df['time_delta'].values.reshape(-1, 1))
        pressure_scaler.fit(df['pressure'].values.reshape(-1, 1))

    # Scaling
    df['time_delta'] = time_delta_scaler.transform(df[['time_delta']].values)
    if 'pressure' in df:
        df['pressure'] = pressure_scaler.transform(df[['pressure']].values)
    else:
        df['pressure'] = 0
    df['u_in'] = df['u_in'] / 100
    df['R'] = df['R'].map({5:0, 20:0.5, 50:1})
    df['C'] = df['C'].map({10:0, 20:0.5, 50:1})

    # Transpose
    pressure_df = df.groupby('breath_id')['pressure'].apply(lambda dff: dff.reset_index(drop=True)).unstack()

    R_C = df.groupby('breath_id')[['R', 'C']].mean()
    time_out = df[df['u_out'] == 0].groupby('breath_id')[['time_step', 'id']].max()
    time_out['id'] = (time_out['id'] - 1) % 80 + 1

    df = df.groupby('breath_id')[['u_in', 'time_delta']].apply(lambda dff: dff.reset_index(drop=True)).unstack()
    df['R'] = R_C['R']
    df['C'] = R_C['C']
    df['index_out'] = time_out['id']
    return df.reset_index(), pressure_df.reset_index()


time_delta_scaler, pressure_scaler = MinMaxScaler(), MinMaxScaler()

df = pd.read_csv(input_path / 'train.csv')
df, pressure_df = flatten_df(df, time_delta_scaler, pressure_scaler, should_fit=True)

df_test = pd.read_csv(input_path / 'test.csv')
df_test, pressure_test = flatten_df(df_test, time_delta_scaler, pressure_scaler, should_fit=False)

## Clustering

In [5]:
n_clusters = 20
# clustering = KMeans(n_clusters=n_clusters, random_state=0).fit(df['u_in']) # doesn't work well
clustering = DBSCAN().fit(df['u_in'])
df['cluster'] = clustering.predict(df['u_in'])
df_test['cluster'] = clustering.predict(df_test['u_in'])

## Train Validation split

In [6]:
df_train, df_valid, pressure_train, pressure_valid = train_test_split(df, pressure_df, stratify=df[['cluster']], test_size=0.2, random_state=0)

for dff in [df_train, pressure_train, df_valid, pressure_valid]:
    dff.index = dff['breath_id'].values
    dff.sort_index(inplace=True)

with open(data_path / 'flat_train.pickle', 'wb') as handle:
    pickle.dump((df_train, pressure_train), handle)
with open(data_path / 'flat_valid.pickle', 'wb') as handle:
    pickle.dump((df_valid, pressure_valid), handle)
with open(data_path / 'flat_test.pickle', 'wb') as handle:
    pickle.dump((df_test, pressure_test), handle)
with open(data_path / 'reverse_transform.pickle', 'wb') as handle:
    pickle.dump((time_delta_scaler, pressure_scaler), handle)

# Data Exploration

In [None]:
with open(plot_path / 'flat_train.pickle', 'rb') as handle:
    df, pressures = pickle.load(handle)

## Time Deltas

In [None]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=time_deltas, nbinsx=500))
fig.write_html(plot_path / 'timedeltas.html') # That's a pretty weird distribution

## Clustering

In [7]:
# R and C stratification
display(df[['R', 'C']].value_counts(normalize=True).unstack())
display(df_test[['R', 'C']].value_counts(normalize=True).unstack())

"(C, )",0.0,0.5,1.0
"(R, )",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.110166,0.109702,0.109622
0.5,0.080451,0.08228,0.108496
1.0,0.181272,0.109476,0.108535


"(C, )",0.0,0.5,1.0
"(R, )",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.108091,0.10837,0.10829
0.5,0.085328,0.081272,0.109344
1.0,0.180537,0.109404,0.109364


In [None]:
# Plot clustering PCA
reduced = PCA(n_components=2).fit_transform(df['u_in'])

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=reduced[:,0], y=reduced[:,1], marker_color=df['cluster'], mode='markers'
))
fig.write_html(plot_path / 'clustering_DBSCAN_pca.html')

# Plot clustering breath profiles
fig = make_subplots(rows=5, cols=4,
    subplot_titles=[f'Cluster {cluster} N={size}' for cluster, size in df['cluster'].value_counts().sort_index().iteritems()]
)
for cluster in tqdm(df['cluster'].sort_values().unique()):
    dff = df[df['cluster'] == cluster]
    dff = dff.sample(n=min(400, len(dff))) # draw 400 traces for each cluster
    for _, row in dff.iterrows():
        u_in = row['u_in']
        times = pd.DataFrame(row['time_delta'])
        times = np.cumsum(time_delta_scaler.inverse_transform(times))
        fig.add_trace(
            go.Scatter(x=times, y=u_in, opacity=0.05, marker={'color': '#0000FF'}),
            row=cluster//4+1, col=cluster%4+1
        )
fig.update_layout(
    showlegend=False,
)
fig.write_html(plot_path / 'clustering_DBSCAN_traces.html')

In [29]:
pd.DataFrame({
    'train': df[['cluster', 'breath_id']].groupby('cluster').count().breath_id / len(df),
    'test': df_test[['cluster', 'breath_id']].groupby('cluster').count().breath_id / len(df_test),
}) *100


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



Unnamed: 0_level_0,train,test
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.671306,1.755467
1,17.774685,17.421471
2,1.908549,2.019881
3,4.861498,4.916501
4,1.420808,1.312127
5,2.629556,2.761431
6,30.269052,30.451292
7,11.013917,10.95825
8,1.455268,1.407555
9,0.776673,0.791252


In [14]:
dff = df.sample(2000)
fig = go.Figure()
fig.add_trace(go.Scatter(x=dff['breath_id'], y=dff['R'], mode='markers'))
fig.add_trace(go.Scatter(x=dff['breath_id'], y=dff['C'] + 2, mode='markers'))
fig.add_trace(go.Scatter(x=dff['breath_id'], y=dff['cluster'] + 4, mode='markers'))
fig.write_html('plots/breath_id_R_C_clusters.html') # R, C and clusters are randomly distributed, no id hacking possible

### Baseline average

In [None]:
def compute_metric(prediction, df):
    df = df.copy()
    df['metric'] = (df['pressure'] - prediction).abs()
    df = df[df['u_out'] == 0]
    return df['metric'].mean(), df.groupby('breath_id').mean().sort_values('metric')

y_train = [df_train['pressure'].mean()] * len(df_train)
y_valid = [df_train['pressure'].mean()] * len(df_valid)

baseline_train_metric, _ = compute_metric(y_train, df_train)
baseline_valid_metric, _ = compute_metric(y_valid, df_valid)

print(f'Baseline train: {baseline_train_metric}\nBaseline valid: {baseline_valid_metric}')

## Keeping the timestamp-row structure

In [None]:
n_breath_train = len(df_train['breath_id'].unique())
n_breath_valid = len(df_valid['breath_id'].unique())

### Add info about past

In [None]:
starting_pressure = df_train.loc[df_train['time_step'] == 0, 'pressure'].mean() # this is improvable, the mean is not so good
starting_time_delta = 0.0331 # the mean, mah, maybe improvable

df_train['pressure-1'] = df_train.groupby('breath_id')['pressure'].shift(1, fill_value=starting_pressure)
df_train['time_delta-1'] = df_train.groupby('breath_id')['time_step'].shift(1, fill_value=-starting_time_delta) - df_train['time_step']
df_train['u_in-1'] = df_train.groupby('breath_id')['u_in'].shift(1).fillna(method='backfill')

df_train['pressure-2'] = df_train.groupby('breath_id')['pressure'].shift(2, fill_value=starting_pressure)
df_train['time_delta-2'] = (df_train.groupby('breath_id')['time_step'].shift(2) - df_train['time_delta-1']).fillna(starting_time_delta)
df_train['u_in-2'] = df_train.groupby('breath_id')['u_in'].shift(2).fillna(method='backfill')

df_valid['pressure-1'] = df_valid.groupby('breath_id')['pressure'].shift(1, fill_value=starting_pressure)
df_valid['time_delta-1'] = df_valid.groupby('breath_id')['time_step'].shift(1, fill_value=-starting_time_delta) - df_valid['time_step']
df_valid['u_in-1'] = df_valid.groupby('breath_id')['u_in'].shift(1).fillna(method='backfill')

df_valid['pressure-2'] = df_valid.groupby('breath_id')['pressure'].shift(2, fill_value=starting_pressure)
df_valid['time_delta-2'] = (df_valid.groupby('breath_id')['time_step'].shift(2) - df_valid['time_delta-1']).fillna(starting_time_delta)
df_valid['u_in-2'] = df_valid.groupby('breath_id')['u_in'].shift(2).fillna(method='backfill')


### Linear regression

In [None]:
useful_features = ['R', 'C', 'u_in', 'u_out', 'pressure-1', 'u_in-1', 'time_delta-1', 'pressure-2', 'u_in-2', 'time_delta-2']

regressor = LinearRegression().fit(df_train[useful_features], df_train['pressure'])
y_train = regressor.predict(df_train[useful_features])
y_valid = regressor.predict(df_valid[useful_features])

train_metric, _ = compute_metric(y_train, df_train)
valid_metric, _ = compute_metric(y_valid, df_valid)

print(f'train: {train_metric}\nvalid: {valid_metric}')

### Decision Tree Regressor

In [None]:
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(df_train[useful_features], df_train['pressure'])
y_train = regressor.predict(df_train[useful_features])
y_valid = regressor.predict(df_valid[useful_features])

train_metric, _ = compute_metric(y_train, df_train)
valid_metric, _ = compute_metric(y_valid, df_valid)

print(f'train: {train_metric}\nvalid: {valid_metric}')

### Random Forest Regressor

In [None]:
regressor = RandomForestRegressor(criterion='absolute_error', random_state=0, verbose=2)
regressor.fit(df_train[useful_features], df_train['pressure'])
y_train = regressor.predict(df_train[useful_features])
y_valid = regressor.predict(df_valid[useful_features])

train_metric, _ = compute_metric(y_train, df_train)
valid_metric, _ = compute_metric(y_valid, df_valid)

print(f'train: {train_metric}\nvalid: {valid_metric}')

In [None]:
instant_predictions = []
pressure_1 = [starting_pressure] * n_breath_train
pressure_2 = [starting_pressure] * n_breath_train
for index in range(80):
    df_instant = df_train[(df_train['id']-1)%80 == index].copy()
    df_instant['pressure-1'] = pressure_1
    df_instant['pressure-2'] = pressure_2
    y_instant = regressor.predict(df_instant[useful_features])
    instant_predictions.append(pd.Series(y_instant, index=df_instant.index))
    pressure_1 = y_instant
    pressure_2 = pressure_1
y_train = pd.concat(instant_predictions).sort_index().values

instant_predictions = []
pressure_1 = [starting_pressure] * n_breath_valid
pressure_2 = [starting_pressure] * n_breath_valid
for index in range(80):
    df_instant = df_valid[(df_valid['id']-1)%80 == index].copy()
    df_instant['pressure-1'] = pressure_1
    df_instant['pressure-2'] = pressure_2
    y_instant = regressor.predict(df_instant[useful_features])
    instant_predictions.append(pd.Series(y_instant, index=df_instant.index))
    pressure_1 = y_instant
    pressure_2 = pressure_1
y_valid = pd.concat(instant_predictions).sort_index().values


train_metric, _ = compute_metric(y_train, df_train)
valid_metric, _ = compute_metric(y_valid, df_valid)

print(f'train: {train_metric}\nvalid: {valid_metric}')

# Pressure Prediction

## Pytorch Preparation

In [3]:
class SequencesDataset(Dataset):
    def __init__(self, df_path):
        with open(df_path, 'rb') as handle:
            df, pressures = pickle.load(handle)
        self.X = df[['u_in', 'time_delta', 'R', 'C']].values
        self.Y = pressures.drop(columns='breath_id').values
        for index in range(7):
            df[index] = (df['index_out'] > (25 + index)).astype(int)
        self.extra = df[['breath_id', 'index_out', 'cluster'] + list(range(7))].values

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.Tensor(self.X[idx]), torch.Tensor(self.Y[idx]), torch.Tensor(self.extra[idx])


train_data = SequencesDataset('output/flat_train.pickle')
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

valid_data = SequencesDataset('output/flat_valid.pickle')
valid_dataloader = DataLoader(valid_data, batch_size=64, shuffle=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cpu device


## Model Definition

In [22]:
class NeuralNetwork(nn.Module):
    def __init__(self, dropout_p):
        super(NeuralNetwork, self).__init__()
        # self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(66, 140), # 32 u_in, 32 time_delta, R, C
            nn.ReLU(),
            nn.Dropout(p=dropout_p),

            nn.Linear(140, 140),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),

            nn.Linear(140, 140),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),

            nn.Linear(140, 140),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),

            nn.Linear(140, 140),
            nn.ReLU(),
            nn.Linear(140, 32),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

dropout_p = 0.5
model = NeuralNetwork(dropout_p).to(device)
print(model)
writer = SummaryWriter('runs/experiment_1')
writer.add_graph(model, next(iter(train_dataloader))[0])

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=66, out_features=140, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=140, out_features=140, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=140, out_features=140, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=140, out_features=140, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.5, inplace=False)
    (12): Linear(in_features=140, out_features=140, bias=True)
    (13): ReLU()
    (14): Linear(in_features=140, out_features=32, bias=True)
  )
)


## Training

In [23]:
learning_rate = 3e-4
batch_size = 64
epochs = 3000

loss_function = nn.MSELoss(reduction='sum')

def fixed_loss_function(Y, prediction, extra):
    prediction[:,-7:] *= extra[:,-7:] # put to 0 prediction during expiration phase in order to 0 their grad
    Y[:,-7:] *= extra[:,-7:] # put to 0 target during expiration phase in order to not increase the loss
    return loss_function(prediction, Y) / (extra[:,-7:].sum() + 25 * Y.shape[0]) # divide only for the number of cases in inspiration phase

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [24]:
size = len(train_dataloader.dataset)
train_losses = []
valid_losses = []

interactive_fig = go.FigureWidget()
interactive_fig.add_trace(go.Scatter(
    y=[], name='train'
))
interactive_fig.add_trace(go.Scatter(
    y=[], name='valid'
))
interactive_fig.update_yaxes(rangemode='nonnegative')
display(interactive_fig)


for epoch in tqdm(range(epochs)):
    train_loss = 0
    for X, Y, extra in train_dataloader:
        prediction = model(X)
        loss = fixed_loss_function(prediction, Y, extra)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    train_losses.append(train_loss / len(train_dataloader))
    writer.add_scalar('train loss', train_loss / len(train_dataloader), epoch)
    
    valid_loss = 0
    with torch.no_grad():
        for X, Y, extra in valid_dataloader:
            prediction = model(X)
            valid_loss += fixed_loss_function(prediction, Y, extra).item()
    if not valid_losses or min(valid_losses) > valid_loss / len(valid_dataloader):
        best_model_parameters = model.state_dict()
    valid_losses.append(valid_loss / len(valid_dataloader))
    writer.add_scalar('valid loss', valid_loss / len(valid_dataloader), epoch)

    with interactive_fig.batch_update():
        interactive_fig.data[0].y = train_losses
        interactive_fig.data[1].y = valid_losses

model.load_state_dict(best_model_parameters)
torch.save(model, model_path / 'model.pth')
writer.close()

FigureWidget({
    'data': [{'name': 'train', 'type': 'scatter', 'uid': '3a72c0f7-ee68-41eb-b98c-e76ea1b3ee58'â€¦

  0%|          | 0/3000 [00:00<?, ?it/s]

In [15]:
X, Y, extra = train_data[0:10]


# writer.add_embedding(
#     X,
#     metadata=extra[:,2],
#     label_img=extra[:,0]
# )


IndexError: tuple index out of range

# Prediction Analysis

In [189]:
with open('output/reverse_transform.pickle', 'rb') as handle:
    time_delta_scaler, pressure_delta_scaler = pickle.load(handle)
valid_final_dataloader = DataLoader(valid_data, batch_size=len(valid_data), shuffle=False)
metric_function = nn.L1Loss(reduction='none')

with torch.no_grad():
    X, Y, extra = next(iter(valid_final_dataloader))
    prediction = model(X)
    full_metric = metric_function(prediction, Y).numpy() * pressure_delta_scaler.data_range_.item()
full_metric[:,-7:][extra[:,-7:].numpy() == 0] = np.nan
full_metric = pd.DataFrame(full_metric, index=extra[:,0].numpy())
valid_error = full_metric.sum().sum() / full_metric.notna().sum().sum()
print(f'Error on validation: {valid_error}')

R_C_cluster = pd.DataFrame({'R': X[:,-2].numpy(), 'C': X[:,-1].numpy(), 'cluster': extra[:,2].numpy()}, index=extra[:,0].numpy())

Error on validation: 0.7446274332825468


## Error by R C

In [190]:
fig = make_subplots(rows=3, cols=3, subplot_titles=[f'R={R} C={C}' for R in [0, 0.5, 1] for C in [0, 0.5, 1]])
for R in [0, 0.5, 1]:
    for C in [0, 0.5, 1]:
        filtered_metric = full_metric[(R_C_cluster['R'] == R) & (R_C_cluster['C'] == C)]
        fig.add_trace(
            go.Scatter(y=filtered_metric.mean()),
            row=int(R*2+1), col=int(C*2+1)
        )
fig.update_layout(height=600, margin={'t':30, 'l':30, 'b':30, 'r':30}, showlegend=False)
fig

## Error by Cluster

In [191]:
fig = make_subplots(rows=5, cols=4,
    subplot_titles=[f'Cluster {int(cluster)} N={size}' for cluster, size in R_C_cluster['cluster'].value_counts().sort_index().iteritems()]
)
for cluster in R_C_cluster['cluster'].sort_values().unique():
    cluster = int(cluster)
    filtered_metric = full_metric[R_C_cluster['cluster'] == cluster]
    fig.add_trace(
        go.Scatter(y=filtered_metric.mean()),
        row=cluster//4+1, col=cluster%4+1
    )
fig.update_layout(height=600, margin={'t':30, 'l':30, 'b':30, 'r':30}, showlegend=False)
fig

## Plot single Breath

In [None]:
with open('output/flat_valid.pickle', 'rb') as handle:
    dff, pressures = pickle.load(handle)

def plot_breath(breath_id, df, pressures, prediction=None):
    df.index = df['breath_id']
    pressures.index = pressures['breath_id']
    row = df.loc[breath_id]
    u_in = row['u_in']
    times = pd.DataFrame(row['time_delta'])
    times = np.cumsum(time_delta_scaler.inverse_transform(times))
    pressures = pressures.loc[breath_id][1:]
    prediction = prediction[0].numpy()
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=times, y=u_in, opacity=0.5, marker={'color': '#0000FF'}, name='u_in', mode='lines+markers'
    ))
    fig.add_vline(x=times[int(row['index_out'])-1].item())
    fig.add_trace(go.Scatter(
        x=times, y=prediction, opacity=0.5, marker={'color': '#FF3333'}, name='prediction', mode='lines+markers'
    ))
    fig.add_trace(go.Scatter(
        x=times, y=pressures, opacity=0.7, marker={'color': 'red'}, name='target', mode='lines+markers'
    ))
    fig.update_layout(
        hovermode='x unified',
        title=f"breath_id: {row['breath_id'].item()} R={row['R'].item()} C={row['C'].item()}"
    )
    return fig


plot_breath(dff.index[6], dff, pressures, prediction)

## Old Visualization

In [None]:
def plot_breath(df, breath_index, prediction=None):
    fig = go.Figure()
    df['prediction'] = prediction
    one = df[df['breath_id'] == breath_index]
    fig.add_trace(go.Scatter(
        x=one['time_step'], y=one['u_in'], opacity=0.5, marker={'color': '#0000FF'}, name='u_in', mode='lines+markers'
    ))
    fig.add_trace(go.Scatter(
        x=one['time_step'], y=one['u_out']*20, opacity=0.5, marker={'color': 'green'}, name='u_out', mode='lines+markers'
    ))
    fig.add_trace(go.Scatter(
        x=one['time_step'], y=one['prediction'], opacity=0.5, marker={'color': '#FF3333'}, name='prediction', mode='lines+markers'
    ))
    fig.add_trace(go.Scatter(
        x=one['time_step'], y=one['pressure'], opacity=0.7, marker={'color': 'red'}, name='target', mode='lines+markers'
    ))
    fig.update_layout(
        hovermode='x unified',
        title=f"Breath ID: {breath_index} R={one['R'].iloc[0]} C={one['C'].iloc[0]}"
    )
    return fig

metrics = _
breath_index = df_valid['breath_id'].unique()[0]

# breath_index = metrics.index[-500]
# fig = plot_breath(df_valid, breath_index)
fig = plot_breath(df_train, 4)
fig.show()
# weird breaths:
# - long time-delta: 24127, 55851, 72104
# - negative pressure: 542, 77803, 112036, 45099

# Test

In [192]:
# Load data and model
with open('output/reverse_transform.pickle', 'rb') as handle:
    _, pressure_scaler = pickle.load(handle)
test_data = SequencesDataset('output/flat_test.pickle')
test_dataloader = DataLoader(test_data, batch_size=len(test_data), shuffle=False)
model = torch.load('output/model.pth')

# Compute predictions
with torch.no_grad():
    X, Y, extra = next(iter(test_dataloader))
    prediction = model(X)

# Inverse preprocessing to reconstruct pressure column
pressures = pd.DataFrame(prediction.numpy())
for index in range(32, 80):
    pressures[index] = 0
pressures = pressures.transpose()
pressures = pd.concat([pressures[column] for column in pressures])
pressures = pressure_scaler.inverse_transform(pd.DataFrame(pressures))
submission = pd.DataFrame({'id': range(1, len(pressures) + 1), 'pressure': pressures[:,0]})

# Save submission as .csv and .gz
submission.to_csv('output/submission.csv', index=False)
with open('output/submission.csv', 'rb') as handle:
    text = handle.read()
with gzip.open('output/submission.gz', 'wb') as handle:
    handle.write(text)