In [34]:
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
import torch.nn.functional as F

In [2]:
import pickle
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [3]:
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [21]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
with open('zebal_version2.pickle', 'rb') as f:
    file = pickle.load(f)

In [5]:
token = file['morphs']
label = file['type']

In [6]:
token_list = token.to_list()

In [7]:
label_list = label.to_list()

In [8]:
label_dict = {type : i for i, type in enumerate(label.unique())}

In [9]:
def make_dict(sentences):
    word_dict = dict()
    cnt = 0
    for sentence in sentences:
        for word in sentence:
            if word not in word_dict:
                word_dict[word] = cnt
                cnt += 1
    print('Total number of unique tokens :', len(word_dict))
    return word_dict

In [10]:
word_dict = make_dict(token_list)

Total number of unique tokens : 81580


In [11]:
class CustomDataset(Dataset):
    def __init__(self, word_dict, label_dict, sentences, labels):
        super(CustomDataset, self).__init__()
        self.sentences = sentences
        self.labels = labels
        self.word_dict = word_dict
        self.label_dict = label_dict
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, ix):
        sent = self.sentences[ix]
        sent_ix = []
        for word in sent:
            sent_ix.append(self.word_dict[word])
        sent_ix = torch.LongTensor(sent_ix)
        label = torch.tensor(self.label_dict[self.labels[ix]])
        return sent_ix, label

In [12]:
class Classifier(nn.Module):
    def __init__(self, num_token, num_label, emb_dim):
        super(Classifier, self).__init__()
        self.embedding = nn.Embedding(num_token, emb_dim)
        self.lstm = nn.LSTM(emb_dim, 300, batch_first=True)
        self.fc = nn.Linear(300, num_label)
        
    def forward(self, batch):
        lengths = []
        for bat in batch:
            lengths.append(len(bat[0]))
        max_len = int(np.max(lengths))
        sent = torch.zeros((len(batch), max_len), dtype=torch.long)
        for i, bat in enumerate(batch):
            sent[i, :lengths[i]] = bat[0]
        
        lengths = torch.LongTensor(lengths)
        sent = self.embedding(sent)
        packed_sent = pack_padded_sequence(sent, lengths, batch_first=True, enforce_sorted=False)
        
        packed_out, _ = self.lstm(packed_sent)
        out, out_len = pad_packed_sequence(packed_out, batch_first=True)
        logits = self.fc(out[:,-1])
        return logits

In [14]:
trainset = CustomDataset(word_dict, label_dict, token_list, label_list)

In [15]:
trainloader = DataLoader(trainset, collate_fn=lambda x: x, batch_size=2, shuffle=True)

In [16]:
classifier = Classifier(len(word_dict), 16, 100)

In [38]:
def train(model, dataloader, num_epoch=5, print_every=10):
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(num_epoch):
        
        training_loss = 0.0
        
        for i, batch in enumerate(dataloader):
            sent_ix, label = batch
                        
            logits = model(sent_ix)
            loss = criterion(logits, label)
            
            loss.backward()
            optimizer.zero_grad()
            optimizer.step()
            
            if (i+1) % print_every == 0:
                print('Epoch %d | Step %d | Loss %0.4f' %(epoch+1, i+1, training_loss / print_every))
                training_loss = 0.0
                
    return model

In [39]:
train(classifier, trainloader)

TypeError: len() of a 0-d tensor