In [9]:
import numpy as np
import pandas as pd

import sys
sys.path.insert(0,'../')

import torch
import torch.nn as nn
from torch.optim import Adam
from torchtext import data
from torchtext import vocab
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path
from sklearn.metrics import accuracy_score

from params import SEED
from models import CNN
import training_utils as tu

In [2]:
# Read the train/validation splits as DataFrames. Both files are JSON-lines
# (one record per line), so they share the same reader options.
# NOTE(review): the cells visible below read the same files again through
# torchtext and never touch these frames - confirm they are still needed.
jsonl_options = dict(orient='records', lines=True)
train = pd.read_json('../data/json/train.json', **jsonl_options)
valid = pd.read_json('../data/json/valid.json', **jsonl_options)

In [3]:
# Reproducibility: seed PyTorch's RNG and ask cuDNN for deterministic kernels.
# benchmark=False (the default) is pinned explicitly so that cuDNN does not
# auto-tune to a different algorithm between runs.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Preferred GPU for this experiment. The index used to be passed straight to
# torch.cuda.set_device, which raises on machines with fewer than three GPUs
# (or none at all); now it is only selected when it actually exists.
GPU_INDEX = 2

if torch.cuda.is_available():
    device = torch.device('cuda')
    if GPU_INDEX < torch.cuda.device_count():
        # 'cuda' without an index resolves to the current device set here.
        torch.cuda.set_device(GPU_INDEX)
    # Otherwise keep PyTorch's default current GPU.
else:
    # No CUDA at all: run on CPU instead of failing at the first .to(device).
    device = torch.device('cpu')

In [16]:
# --- Field definitions ------------------------------------------------------
# TEXT runs every token list through tu.make_padder(5) before numericalising.
# NOTE(review): presumably this pads short examples up to 5 tokens so the
# widest convolution filter (size 5) always fits - confirm in training_utils.
TEXT = data.Field(preprocessing=tu.make_padder(5))
LABEL = data.LabelField()
# Item ids are kept as raw values: not a sequence and no vocabulary lookup.
ID = data.Field(sequential=False, use_vocab=False)

# JSON key -> (attribute name on the Example, Field that processes it).
train_fields = {
    'category_id': ('label', LABEL),
    'tokens': ('text', TEXT),
}
test_fields = {
    'category_id': ('label', LABEL),
    'item_id': ('id', ID),
    'tokens': ('text', TEXT),
}

data_path = '../data/json'


def _load_split(file_name, fields):
    """Read one JSON split located under ``data_path`` as a TabularDataset."""
    return data.TabularDataset(
        path=Path(data_path, file_name),
        format='json',
        fields=fields,
    )


# Pretrained vectors; tokens missing from the file get a normal_() init.
custom_embeddings = vocab.Vectors(
    name='../embeddings/custom/custom.vec',
    cache='../embeddings',
    unk_init=torch.Tensor.normal_,
)

train_data = _load_split('train.json', train_fields)
# The validation file plays the role of the test set in this notebook.
test_data = _load_split('valid.json', test_fields)

# Both vocabularies are built from the training split only.
TEXT.build_vocab(
    train_data,
    vectors=custom_embeddings,
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

In [17]:
# Checkpoint location and training schedule.
model_path = '../trained_models/cnn_valid'
n_epochs, batch_size = 10, 64

# Dimensions derived from the vocabularies and the pretrained vectors.
input_dim, output_dim = len(TEXT.vocab), len(LABEL.vocab)
embedding_dim = custom_embeddings.dim
# Index of the padding token in the text vocabulary.
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

# Convolution hyperparameters.
n_filters, filter_sizes, dropout = 3, [3, 4, 5], 0.5

# CNN is the project's own model class; arguments are passed positionally in
# the same order as before because its signature is not visible from here.
model = CNN(input_dim, output_dim, embedding_dim,
            n_filters, filter_sizes, dropout, pad_idx)

In [18]:
# Initialise the embedding layer from the vectors attached to the text vocab,
# writing through .data so autograd does not record the in-place updates.
embeddings = TEXT.vocab.vectors
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(embeddings)
# The padding row is reset to all zeros after the copy.
embedding_weights[pad_idx] = torch.zeros(embedding_dim)

In [19]:
# Both iterators are configured identically: batches are bucketed by token
# count (sort_key), sorted inside each batch, and placed on `device`.
bucket_kwargs = dict(
    batch_size=batch_size,
    sort_key=lambda ex: len(ex.text),
    sort_within_batch=True,
    device=device,
)

train_iterator = data.BucketIterator(train_data, **bucket_kwargs)
test_iterator = data.BucketIterator(test_data, **bucket_kwargs)

In [20]:
# Move the model to the target device *before* building the optimizer, so the
# optimizer is constructed over parameters that already live where training
# will happen (the ordering recommended by the torch.optim documentation).
model = model.to(device)
optimizer = Adam(model.parameters())

# Multi-class classification loss, placed on the same device as the model.
criterion = nn.CrossEntropyLoss().to(device)
# The original (misspelled) name is kept as an alias because the training
# cell still passes `critertion` to tu.train_model.
critertion = criterion

In [21]:
# TensorBoard event files for this run are written under ../runs/final_valid.
writer = SummaryWriter(log_dir='../runs/final_valid')

In [22]:
# Run the project's training loop. All arguments are positional because
# tu.train_model's signature is not visible from this notebook.
# NOTE(review): the '_' argument looks like a placeholder (run tag / name?) -
# confirm its meaning in training_utils.
# NOTE(review): `best` and `final` are presumably the best and last-epoch
# validation scores - verify against tu.train_model's return statement.
best, final = tu.train_model(
    model, train_iterator, test_iterator,
    optimizer, critertion,
    model_path, n_epochs, '_', writer,
)

In [23]:
# Display the two values returned by tu.train_model as the cell output.
best, final

(0.76730664489316, 0.7632659313725491)

In [20]:
# Reload weights from the checkpoint at model_path.
# NOTE(review): presumably this is the file tu.train_model saved - confirm.
# map_location remaps the stored tensors onto `device`; without it, a
# checkpoint saved from a specific GPU fails to load on a machine that does
# not have that device.
# SECURITY: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location=device))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])