In [1]:
import torchtext.data as data
import numpy as np
import training_utils as tu
from models import CNN

In [2]:
# Keyword arguments forwarded to the two field constructors. Both are
# currently empty, so the constructors run with their default settings.
process_text = dict()
process_labels = dict()

LABEL = data.LabelField(**process_labels)
TEXT = data.Field(**process_text)

# Maps each key of the JSON records to an (attribute name, field) pair.
fields = {
    'category_id': ('label', LABEL),
    'lemmatized': ('text', TEXT),
}

In [3]:
# Training and validation splits are read from the same directory and share
# the label/text field mapping defined in `fields`.
train_data, valid_data = data.TabularDataset.splits(
    fields=fields,
    format='json',
    path='data',
    train='train.json',
    validation='valid.json',
)

# The test set is loaded with the text field only (no label mapping).
test_data = data.TabularDataset(
    fields={'lemmatized': ('text', TEXT)},
    format='json',
    path='data/test.json',
)

In [4]:
MAX_VOCAB_SIZE = 25_000

# Both vocabularies are built from the training split only; the text
# vocabulary is capped at MAX_VOCAB_SIZE entries via `max_size`.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

print(
    f'Vocab size: {len(TEXT.vocab)}',
    f'Number of classes: {len(LABEL.vocab)}',
    sep='\n',
)

Vocab size: 25002
Number of classes: 54


In [5]:
import random

import torch

SEED = 1234
GPU_INDEX = 3  # preferred CUDA device on the training host

# Seed every RNG in scope, not only torch, so that library code relying on
# Python's or NumPy's global generators is repeatable as well.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available():
    # Select the preferred GPU only when it actually exists; calling
    # torch.cuda.set_device with an index the host does not have (or on a
    # CPU-only machine) raises instead of falling back.
    if GPU_INDEX < torch.cuda.device_count():
        torch.cuda.set_device(GPU_INDEX)
    # An index-less 'cuda' device resolves to the currently selected GPU.
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [6]:
BATCH_SIZE = 64


def _text_length(example):
    """Return the length of an example's `text` attribute (the sort key)."""
    return len(example.text)


# One iterator per split, both configured identically and yielding batches
# on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    device=device,
    batch_size=BATCH_SIZE,
    sort_within_batch=False,
    sort_key=_text_length,
)

In [7]:
# Sizes derived from the vocabularies built on the training data.
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Hand-picked hyperparameters.
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
DROPOUT = 0.5

# Arguments are passed positionally, in the order the CNN constructor is
# called with throughout this notebook.
model = CNN(
    INPUT_DIM,
    EMBEDDING_DIM,
    N_FILTERS,
    FILTER_SIZES,
    OUTPUT_DIM,
    DROPOUT,
    PAD_IDX,
)

In [8]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Put the network and the loss module on the selected device.
model = model.to(device)
criterion = nn.CrossEntropyLoss().to(device)

# Adam over all model parameters, using the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [11]:
from tensorboardX import SummaryWriter

In [9]:
# Display the parameter count that training_utils.count_parameters reports
# for the model built above (the value is shown as the cell output).
# NOTE(review): whether this counts only trainable parameters depends on the
# helper's implementation, which is not visible here — confirm.
tu.count_parameters(model)

2606754

In [13]:
from importlib import reload

In [35]:
# Re-import the training helpers so that edits made to the training_utils
# module during the session are picked up without restarting the kernel.
reload(tu)

# No other cell assigns `writer`, so it is created here; on a fresh kernel
# the call below would otherwise fail with a NameError.
# NOTE(review): default constructor arguments are used — set an explicit log
# directory if runs need to be told apart.
writer = SummaryWriter()

# Train for 10 epochs; 'cnn' is the name argument handed to the helper and
# `writer` receives whatever the helper logs.
tu.train_model(model, train_iterator, valid_iterator,
               optimizer, criterion, 'cnn', n_epochs=10, writer=writer)

0.5011529434009705 0.8666802832114151 0.5054102764507683 0.858852072223025
Epoch: 1 | Epoch Time: 3m 42:s 
0.49947264530251617 0.8661662581699346 0.48390491558827886 0.8636229300561304
Epoch: 2 | Epoch Time: 3m 47:s 
0.5003589385183029 0.866687091503268 0.4697732941633659 0.8668407442951966
Epoch: 3 | Epoch Time: 3m 40:s 
0.5026650453266365 0.866775599141526 0.45534530311425947 0.8700403071322614
Epoch: 4 | Epoch Time: 3m 39:s 
0.5123473723376402 0.8695261437908497 0.44048510482715886 0.8736511125617565
Epoch: 5 | Epoch Time: 3m 16:s 
0.5183595974144398 0.8664896514291077 0.42890223294184826 0.8762959332554963
Epoch: 6 | Epoch Time: 3m 21:s 
0.5194487119945728 0.8685117102526372 0.41661275775311996 0.8790262759871529
Epoch: 7 | Epoch Time: 3m 16:s 
0.5289916257233986 0.8690767973856209 0.4063213388039057 0.8820648413698821
Epoch: 8 | Epoch Time: 3m 21:s 
0.5380840159721332 0.8697372004487156 0.3972889748073945 0.8842733865724044
Epoch: 9 | Epoch Time: 3m 18:s 
0.5467872943327415 0.8688

In [37]:
# Inspection only: show the `kf` attribute of the training dataset.
# NOTE(review): `kf` is not assigned in any visible cell; the recorded output
# shows a pair of torchtext Dataset objects, so it was presumably attached
# during an earlier interactive session — confirm its origin or drop this cell.
train_data.kf

(<torchtext.data.dataset.Dataset at 0x7f56f50ce320>,
 <torchtext.data.dataset.Dataset at 0x7f56f50ce668>)

In [None]:
# Inspection only: display the writer object that is passed to
# tu.train_model in the training cell.
writer