In [1]:
from pathlib import Path

import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from torcheval.metrics.functional import multiclass_f1_score

from src.common import get_device
from src.dataset import (collate_fn, load_dataset, make_vectorizer,
                         remove_stop_words, remove_symbols, rename_columns,
                         split, tokenize)
from src.model import LSTM
from src.trainer import Trainer

In [2]:
# Device handed to every Trainer constructed later in this notebook.
device = get_device()

In [4]:
# Input CSVs and the location of the submission file written at the end.
dataset_path = Path('data/train.csv')
submission_data_path = Path('data/test.csv')
submission_path = Path('submission.csv')

# Load each CSV and run the same text clean-up on it:
# stop-word removal first, then symbol removal.
dataset = remove_symbols(remove_stop_words(load_dataset(dataset_path)))
submission_dataset = remove_symbols(
    remove_stop_words(load_dataset(submission_data_path))
)

# The vectorizer is built from the train.csv data only and then reused to
# tokenize the submission data as well.
vectorizer = make_vectorizer(dataset)
tokenized_dataset = tokenize(vectorizer, dataset)
submission_dataset = tokenize(vectorizer, submission_dataset)

# Hold out part of the tokenized train.csv data for evaluation.
train_dataset, test_dataset = split(tokenized_dataset)

# Rename the columns of each pandas frame, then wrap it in a `datasets.Dataset`.
train_dataset, test_dataset, submission_dataset = (
    Dataset.from_pandas(rename_columns(frame))
    for frame in (train_dataset, test_dataset, submission_dataset)
)

In [5]:
# Hyper-parameters for this run.
num_epoch = 5
batch_size = 2
hidden_dim = 100
num_classes = 4
seed = 0

# Seed torch's global RNG before the model and the dataloaders are created so
# that weight initialisation and the default DataLoader shuffling are
# repeatable after Restart Kernel -> Run All.
torch.manual_seed(seed)

# Optional down-sampling for quick experiments: 0 keeps the full datasets, a
# positive value keeps at most the first `reducing_data` rows of each one.
reducing_data = 0
if reducing_data:
    # Clamp to each dataset's length; selecting indices past the end raises.
    train_dataset = train_dataset.select(range(min(reducing_data, len(train_dataset))))
    test_dataset = test_dataset.select(range(min(reducing_data, len(test_dataset))))
    submission_dataset = submission_dataset.select(
        range(min(reducing_data, len(submission_dataset)))
    )

# NOTE(review): the recorded run of this cell aborted inside the NLL-loss CUDA
# kernel with `t >= 0 && t < n_classes`, i.e. at least one target was outside
# [0, num_classes). Confirm that the labels coming out of the data-preparation
# cell are 0-based (labels 1..4 with num_classes=4 would trigger exactly this)
# before re-running.
model = LSTM(hidden_dim=hidden_dim,
             vocab_size=len(vectorizer.vocabulary_),
             num_classes=num_classes)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Only the training split is shuffled; the held-out split keeps its order.
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, shuffle=False, collate_fn=collate_fn, batch_size=batch_size)

trainer = Trainer(model=model,
                  criterion=criterion,
                  optimizer=optimizer,
                  device=device,
                  train_dataloader=train_dataloader,
                  test_dataloader=test_dataloader,
                  eval_metric=multiclass_f1_score,
                  num_epoch=num_epoch,
                  verbose=True
                  )

trainer.train()

../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

## Submission code

In [34]:
# Run the model exposed as `trainer.best_model` over the submission data.
best_model = trainer.best_model

# Order must be preserved so predictions line up with the submission rows.
submission_dataloader = DataLoader(
    submission_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

# Evaluation-only Trainer: no optimizer or training dataloader is passed.
submission_evaluator = Trainer(
    model=best_model,
    criterion=criterion,
    device=device,
    test_dataloader=submission_dataloader,
    eval_metric=multiclass_f1_score,
)

# eval() returns a pair; only its second element (the predictions) is kept.
_, predictions = submission_evaluator.eval()

In [35]:
# Convert the prediction tensor to a plain Python list before tabulating it.
predictions = predictions.detach().tolist()

# One row per prediction; `Id` is simply the row position, starting at 0.
# NOTE(review): confirm that positional ids match the ids expected for
# data/test.csv, and that the predicted class values use the same numbering
# as the `Class Index` column the submission is scored against.
res = pd.DataFrame({
    'Id': range(len(predictions)),
    'Class Index': predictions,
})
res.to_csv(submission_path, index=False)