In [11]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import Dataset

from datetime import datetime, timedelta
from tqdm.auto import tqdm
tqdm.pandas()

In [21]:
test_path = './test.parquet'
test_df = pd.read_parquet(test_path)
test_df['new_dates'] = test_df['dates'].progress_apply(lambda x: pd.date_range(start=x.min(), end=x.max(), freq='M').date)

100%|██████████| 20000/20000 [00:32<00:00, 608.24it/s]


In [26]:
def adjust_dates(dates):
  if len(dates) < median:
    first_date = dates[0]
    dif = median - len(dates)
    for i in range(dif):
      first_date -= timedelta(days=30)
      dates = np.insert(dates, 0, first_date)

  return dates[:median]

def new_values(row):
  return np.interp(row['new_dates'], row['dates'], row['values'])

def min_max_scale(sample):
  return (sample.new_values - sample.min_values) / (sample.max_values - sample.min_values)

def fill_nan(row, fill_array):
  if row.isnan_values == 1:
    return fill_array
  return row.scaled_values

In [23]:
median = int(np.median(test_df.new_dates.map(len)))
test_df['new_dates'] = test_df['new_dates'].progress_apply(adjust_dates)

test_df['new_dates'] = test_df.new_dates.map(lambda x: pd.DatetimeIndex(x))
test_df['dates'] = test_df.dates.map(lambda x: pd.DatetimeIndex(x))
test_df['new_values'] = test_df.progress_apply(lambda x: new_values(x), axis=1)

100%|██████████| 20000/20000 [00:03<00:00, 5126.27it/s]
100%|██████████| 20000/20000 [00:01<00:00, 15599.10it/s]


In [24]:
test_df['min_values'] = test_df.new_values.map(min)
test_df['max_values'] = test_df.new_values.map(max)
test_df['scaled_values'] = test_df.progress_apply(lambda x: min_max_scale(x), axis=1)

  return (sample.new_values - sample.min_values) / (sample.max_values - sample.min_values)
100%|██████████| 20000/20000 [00:01<00:00, 15104.97it/s]


In [25]:
test_df['isnan_values'] = test_df['scaled_values'].map(lambda x: np.isinf(x).any() or np.isnan(x).any()).astype(int)

In [27]:
counts_test = [0] * median
for i, row in tqdm(test_df[test_df['isnan_values'] == 0].iterrows()):
  for j in range(median):
    values = row.scaled_values
    counts_test[j] += values[j]

19978it [00:20, 998.60it/s] 


In [28]:
counts_test = np.array(counts_test) / len(test_df[test_df['isnan_values'] == 0])
test_df['scaled_values'] = test_df.progress_apply(lambda x: fill_nan(x, counts_test), axis=1)

100%|██████████| 20000/20000 [00:00<00:00, 29691.18it/s]


In [29]:
test_dataset = Dataset.from_pandas(test_df[['id', 'scaled_values']])
test_dataset.set_format(type='pt')

In [30]:
BATCH_SIZE = 128
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [33]:
class Simple1DCNN(nn.Module):
  def __init__(self, input_channels, output_size):
    super(Simple1DCNN, self).__init__()

    self.conv1 = nn.Conv1d(input_channels, 16, kernel_size=3, padding=1)
    self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
    self.conv3 = nn.Conv1d(32, 64, kernel_size=3, padding=1)

    self.pool = nn.MaxPool1d(kernel_size=2)

    self.fc1 = nn.Linear(64 * 7, 256)
    self.fc2 = nn.Linear(256, 128)
    self.fc3 = nn.Linear(128, output_size)

    self.relu = nn.ReLU()
    self.batch_norm1 = nn.BatchNorm1d(16)
    self.batch_norm2 = nn.BatchNorm1d(32)
    self.batch_norm3 = nn.BatchNorm1d(64)

  def forward(self, x):
    x = x.unsqueeze(1)
    x = self.relu(self.conv1(x))
    x = self.batch_norm1(self.pool(x))

    x = self.relu(self.conv2(x))
    x = self.batch_norm2(self.pool(x))

    x = self.relu(self.conv3(x))
    x = self.batch_norm3(self.pool(x))
    x = x.view(x.size(0), -1)

    x = self.relu(self.fc1(x))
    x = self.relu(self.fc2(x))
    x = self.fc3(x)

    return x

In [35]:
model = torch.load('./model.pth', weights_only=False, map_location=torch.device('cpu'))
model.eval()

Simple1DCNN(
  (conv1): Conv1d(1, 16, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=448, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=2, bias=True)
  (relu): ReLU()
  (batch_norm1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [37]:
total_predictions, ids = [], []
sgm = nn.Sigmoid()
with torch.no_grad():
  for batch_idx, batch in tqdm(enumerate(test_dataloader)):
      data = batch['scaled_values'].to(device)
      ids = np.append(ids, batch['id'])
      output = sgm(model(data))
      predictions = output[:, 1]
      total_predictions = np.append(total_predictions, predictions.cpu().detach().numpy())

157it [00:02, 64.93it/s]


In [38]:
submission = pd.DataFrame({'id': ids, 'score': total_predictions})
submission.to_csv('submission.csv', index=False)

In [39]:
submission

Unnamed: 0,id,score
0,6125.0,1.008047e-08
1,26781.0,1.528109e-02
2,13333.0,9.180045e-01
3,53218.0,2.036845e-04
4,84204.0,4.039332e-04
...,...,...
19995,80341.0,3.587674e-04
19996,5891.0,5.761599e-03
19997,29091.0,9.931400e-04
19998,85877.0,9.025795e-01
