In [2]:
import torchtext.data as data
import numpy as np
import training_utils as tu
from models import CNN

In [35]:
# Keyword-argument overrides for the text and label fields. Both are empty
# here, so each field is constructed with no extra arguments.
process_text = dict()
process_labels = dict()

TEXT = data.Field(**process_text)
LABEL = data.LabelField(**process_labels)
# Additional field constructed with sequential=False.
NUM = data.Field(sequential=False)

# JSON key -> (name the value is exposed under, field that processes it).
fields = {
    'category_id': ('label', LABEL),
    'lemmatized': ('text', TEXT),
}

In [10]:
# Map the JSON keys `cat_0` .. `cat_53` to the names `j0` .. `j53`, all
# processed by the shared NUM field.
# NOTE(review): this cell's execution count (10) is older than the cell that
# (re)creates `fields` (35), so the recorded run presumably did not include
# these columns -- confirm whether this cell is still wanted.
fields.update(
    (f'cat_{i}', (f'j{i}', NUM)) for i in range(54)
)

In [36]:
# Training and validation splits are read from data/json with the full
# `fields` mapping (label + text).
train_data, valid_data = data.TabularDataset.splits(
    path='data/json',
    train='train.json',
    validation='valid.json',
    format='json',
    fields=fields,
)

# The test file is loaded with only the text column mapped; no label field
# is attached to it.
test_data = data.TabularDataset(
    path='data/json/test.json',
    format='json',
    fields={'lemmatized': ('text', TEXT)},
)

In [37]:
MAX_VOCAB_SIZE = 25_000

# Both vocabularies are built from the training split only; the text
# vocabulary is capped with max_size.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

# The recorded output shows 25002 text entries, i.e. two more than max_size
# (presumably the special tokens -- verify against the torchtext version used).
print(f'Vocab size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocab size: 25002
Number of classes: 54


In [38]:
import torch

SEED = 1234
# Index of the GPU to train on when CUDA is available.
GPU_INDEX = 3

# Seed PyTorch's RNG and ask cuDNN for deterministic kernels.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available():
    # Selecting a GPU is only valid when CUDA is present and the index exists;
    # fall back to GPU 0 on machines with fewer devices than GPU_INDEX + 1.
    gpu_index = GPU_INDEX if GPU_INDEX < torch.cuda.device_count() else 0
    torch.cuda.set_device(gpu_index)
    # Pin the device object to the chosen index so later `.to(device)` calls
    # do not depend on the process-wide "current device".
    device = torch.device('cuda', gpu_index)
else:
    # CPU-only machine: no CUDA calls are made at all.
    device = torch.device('cpu')

In [39]:
BATCH_SIZE = 64

# Iterators over the train/validation splits; examples are keyed by the
# length of their `text` attribute, and batches are placed on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    device=device,
    batch_size=BATCH_SIZE,
    sort_within_batch=False,
    sort_key=lambda example: len(example.text),
)

In [40]:
# Sizes derived from the vocabularies built on the training split.
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Hand-picked hyperparameters for the CNN.
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
DROPOUT = 0.5

# Arguments are passed positionally in the order the original cell used.
model = CNN(
    INPUT_DIM,
    EMBEDDING_DIM,
    N_FILTERS,
    FILTER_SIZES,
    OUTPUT_DIM,
    DROPOUT,
    PAD_IDX,
)

In [41]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Move the model to the target device *before* constructing the optimizer:
# the PyTorch optim documentation recommends this order so the optimizer is
# guaranteed to hold references to the on-device parameters.
model = model.to(device)

# Cross-entropy loss, placed on the same device as the model.
criterion = nn.CrossEntropyLoss().to(device)

# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [42]:
from tensorboardX import SummaryWriter

In [43]:
# Display the value returned by training_utils.count_parameters for `model`
# (presumably the number of model parameters; the helper is defined outside
# this notebook -- verify there).
tu.count_parameters(model)

2606754

In [44]:
# TensorBoard event writer for this run; created with the path 'runs/cnn'.
writer = SummaryWriter('runs/cnn')

In [45]:
from importlib import reload

In [46]:
# Re-import training_utils first so the call below uses the module's current
# on-disk code, then run training for 10 epochs with the TensorBoard writer.
reload(tu)
tu.train_model(
    model,
    train_iterator,
    valid_iterator,
    optimizer,
    criterion,
    'cnn',
    n_epochs=10,
    writer=writer,
)

0.646408083859612 0.8250714869281046 1.3005663739843956 0.6668195709973345
Epoch: 1 | Epoch Time: 2m 46:s 
0.5632608925181082 0.8463541666666666 0.6999461936500654 0.8112388532483942
Epoch: 2 | Epoch Time: 2m 49:s 
0.523290915887048 0.8567197712418301 0.6019869379931795 0.8348984887132771
Epoch: 3 | Epoch Time: 2m 49:s 
0.5072365258789919 0.8630446622963824 0.5572728491670595 0.8457327050347601
Epoch: 4 | Epoch Time: 2m 49:s 
0.5018985830466537 0.8641986655643563 0.525136128684083 0.853390716964814
Epoch: 5 | Epoch Time: 2m 49:s 
0.5010503985986016 0.8650054466101079 0.501704175554278 0.8589810041415767
Epoch: 6 | Epoch Time: 2m 49:s 
0.5033462204282579 0.8656522331284542 0.48441467355997037 0.8629314279727714
Epoch: 7 | Epoch Time: 2m 49:s 
0.5035752951222308 0.8692572167885849 0.46688827204780264 0.8672641265877401
Epoch: 8 | Epoch Time: 2m 49:s 
0.5064624795385825 0.868634259272245 0.4506235413739135 0.8713451567808969
Epoch: 9 | Epoch Time: 2m 49:s 
0.5119308485728659 0.86620370371

In [37]:
# NOTE(review): exploratory leftover. `kf` is not defined anywhere in the
# visible notebook, and this cell's execution count (37) belongs to an earlier
# session state; the recorded output is a pair of Dataset objects. Confirm
# whether this cell is still needed before relying on it.
train_data.kf

(<torchtext.data.dataset.Dataset at 0x7f56f50ce320>,
 <torchtext.data.dataset.Dataset at 0x7f56f50ce668>)

In [None]:
# Close the TensorBoard writer so any buffered events are flushed to disk;
# the notebook otherwise never closes it.
writer.close()