In [1]:
import torch, torchdata, torchtext
import time

from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### 1. ETL

In [4]:
from torchtext.datasets import AG_NEWS

In [5]:
train, test = AG_NEWS()

In [6]:
print(len(list(iter(train))))
print(len(list(iter(test))))

120000
7600


### 2. EDA

In [7]:
next(iter(train))

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [8]:
set([y for y,x in list(iter(train))])



{1, 2, 3, 4}

In [9]:
train_size = len(list(iter(train)))
train_size ## dataset too big need to reduce

120000

In [10]:
_, train, valid = train.random_split(total_length=train_size,
                                     weights = {'too much': 0.7,"train": 0.2, "val": 0.1},
                                     seed = SEED )
                                    # random_split returns tuple

### 3. Preprocessing

In [11]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_md')

In [12]:
tokens = tokenizer('We are currently learning LSTM in youtube!')
tokens

['We', 'are', 'currently', 'learning', 'LSTM', 'in', 'youtube', '!']

#### Text to integers

In [13]:
from torchtext.vocab import build_vocab_from_iterator 

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

In [14]:
train_iter = yield_tokens(train)
print(next(train_iter))

['Safety', 'Net', '(', 'Forbes.com', ')', 'Forbes.com', '-', 'After', 'earning', 'a', 'PH.D.', 'in', 'Sociology', ',', 'Danny', 'Bazil', 'Riley', 'started', 'to', 'work', 'as', 'the', 'general', 'manager', 'at', 'a', 'commercial', 'real', 'estate', 'firm', 'at', 'an', 'annual', 'base', 'salary', 'of', ' ', '#', '36;70,000', '.', 'Soon', 'after', ',', 'a', 'financial', 'planner', 'stopped', 'by', 'his', 'desk', 'to', 'drop', 'off', 'brochures', 'about', 'insurance', 'benefits', 'available', 'through', 'his', 'employer', '.', 'But', ',', 'at', '32', ',', '"', 'buying', 'insurance', 'was', 'the', 'furthest', 'thing', 'from', 'my', 'mind', ',', '"', 'says', 'Riley', '.']


In [15]:
vocab = build_vocab_from_iterator(yield_tokens(train), specials = ['<unk>', '<pad>', '<bos>', '<eos>'])
vocab.set_default_index(vocab['<unk>'])



In [16]:
# integers to text

mapping = vocab.get_itos()

mapping[0]

'<unk>'

In [17]:
len(vocab)

52686

### 4. FastText embeddings

In [18]:
from torchtext.vocab import FastText
fast_vectors = FastText(language='simple')

In [19]:
fast_embeddings = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [20]:
len(fast_embeddings[1])

300

In [21]:
fast_embeddings.shape # vocab, emb_dim

torch.Size([52686, 300])

In [22]:
type(fast_embeddings)

torch.Tensor

### 5. Preparing the dataloader

In [23]:
text_pipeline = lambda x: vocab(tokenizer(x)) # tokenized then numericalized

In [24]:
text_pipeline('I am currently learning LSTM')

[278, 3198, 3626, 7685, 0]

In [25]:
label_pipeline = lambda x: int(x)-1

In [26]:
# collate fn 

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence #[278, 3198, 3626, 11974, 0]
                                            #[4, 2, 1, <pad>, <pad>]

pad_dix = vocab['<pad>']

In [27]:
pad_idx = vocab['<pad>']


def collate_batch(batch):

    label_list, text_list, lenght_list = [], [], []

    for (label, text) in batch:

        label_list.append(label_pipeline(label))

        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)

        lenght_list.append(processed_text.size(0)) # needed for padded sequence

    return torch.tensor(label_list, dtype=torch.int64), pad_sequence(text_list, padding_value=pad_idx, batch_first=True), torch.tensor(lenght_list, dtype=torch.int64)
    # batch will be always first ==> batch = True

In [28]:
batch_size = 64

train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader   = DataLoader(valid, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_loader  = DataLoader(test, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [29]:
for label, text, length in train_loader:
    print(len(label))
    print(len(text))
    print(len(length))
    break

64
64
64


In [30]:
print(label.shape)  #batch_size, 
print(text.shape)   #batch_size, seq_len
print(length.shape) #batch_size,

torch.Size([64])
torch.Size([64, 111])
torch.Size([64])


### 6. Model(LSTM + CNN)

In [31]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, num_layers,
                        bidirectional, dropout, output_dim):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_dix)
        self.lstm      = nn.LSTM(
                            emb_dim,
                            hid_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout=dropout,
                            batch_first=True
                        )   
        self.fc         = nn.Linear(hid_dim * 2, output_dim)


# output_dim = prediction head
    def forward(self, text, text_length): # label would be include in training
        # text = [batch_size, seq_len]
        embedded = self.embedding(text)
        print('embedded: ', embedded.shape)
        # text = [batch_size, seq_len, emb_dim]

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'),
                                                            enforce_sorted = False, batch_first = True)
        
        # print('packed_embedded: ', packed_embedded)
        
        packed_output, (hn,cn) = self.lstm(packed_embedded)

        # print('packed_output: ', packed_output.shape)
        print('hn: ', hn.shape)
        print('cn: ', cn.shape)


        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)


        print('output: ', output.shape)
        print('output_length: ', output_lengths.shape)
        
        #output = [batch_size, seq len, hidden_dim * num directions]
        #hn     = [num_layers * num_directions, batch_size,  hid_dim]  #3 layers birectional - hn1f, hn1b, hn2f, hn2b, hn3f, hn3b
        #cn     = [num_layers * num_directions, batch_size,  hid_dim]
        
        hn      = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim = 1)
        #hn     = [batch_size, hidden_dim * num_directions]

        print('hn: ', hn.shape)
        
        return self.fc(hn)

In [32]:
import torch.nn as nn
import torch.nn.functional as F

filter_sizes = [3, 4, 5]

class CNN(nn.Module):

    def __init__(self, input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes):

        super(CNN, self).__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)

        self.conv_0    = nn.Conv2d(in_channels=1, out_channels=n_filters, 
                                   kernel_size=(filter_sizes[0], emb_dim))

        self.conv_1    = nn.Conv2d(in_channels=1, out_channels=n_filters, 
                                kernel_size=(filter_sizes[1], emb_dim))
        
        self.conv_2    = nn.Conv2d(in_channels=1, out_channels=n_filters, 
                                kernel_size=(filter_sizes[2], emb_dim))
        
        self.fc        = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout   = nn.Dropout(dropout)

    def forward(self, text):
        # 1. embed

        #text = [batch_size, seq len]

        embedded = self.embedding(text) #embedded = [batch_size, seq_len, emb_dim]
        embedded = embedded.unsqueeze(1) #embedded = [batch_size, 1, seq_len, emb_dim]

        # 2. convolute
        conved_0  = F.relu(self.conv_0(embedded). squeeze(3))
        conved_1  = F.relu(self.conv_1(embedded). squeeze(3))
        conved_2  = F.relu(self.conv_2(embedded). squeeze(3))

        #before squeeze = conved0_n = [batch_size, n_filters, seq_len - filter_size[n] + 1, 1]
        #after squeeze = conved0_n = [batch_size, n_filters, seq_len - filter_size[n] + 1]

        # 3. maxpool
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)

        #before squeeze = [batch_size, n_filters, 1]
        #after  squeeze = [batch_size, n_filters]

        # 4. dropout and fc
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))

        return self.fc(cat)

In [33]:
class LSTMCNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, bidirectional, dropout, output_dim, n_filters, filter_sizes):
        super(LSTMCNN, self).__init__()
        self.lstm = LSTM(input_dim, emb_dim, hid_dim, num_layers, bidirectional, dropout, output_dim)
        self.cnn = CNN(input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes)

    def forward(self, text, text_length):
        lstm_output = self.lstm(text, text_length)
        return self.cnn(lstm_output)

### 7. Initializing

In [34]:
def initialize_weight(m): # randomly initialize weights and may reach convergence much faster

    if isinstance(m, nn.Linear):

        nn.init.xavier_normal_(m.weight) # try to make all the weights within normal distribution
        nn.init.zeros_(m.bias)           

    elif isinstance(m, nn.LSTM):

        for name, param in m.named_parameters():

            if 'bias' in name:
                nn.init.zeros_(param)

            elif 'weight' in name:
                nn.init.orthogonal_(param)

In [35]:
input_dim  = len(vocab)
hid_dim    = 256
emb_dim    = 300 #fasttext
output_dim = 4 #four types of news
num_layers = 2
bidirectional = True
dropout    = 0.5
n_filters  = 100
filter_sizes = [3, 4, 5]

model = LSTMCNN(input_dim, emb_dim, hid_dim, num_layers, bidirectional, dropout, output_dim, n_filters, filter_sizes)

model.apply(initialize_weight)
model.embedding.weight.data = fast_embeddings

model.to(device)

AttributeError: 'LSTMCNN' object has no attribute 'embedding'

In [36]:
def count_parameters(model):
    params = [p.numel() for p in model.parameters() if p.requires_grad]
    print(sum(params))    

In [37]:
count_parameters(model)

34694900


In [38]:
import torch.optim as optim

lr        = 1e-3
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [39]:
def accuracy(preds, y):
    predicted  = torch.max(preds.data, 1)[1] 
    batch_corr = (predicted == y).sum() # batch_correctness
    acc        = batch_corr / len(y)
    return acc

In [40]:
def train(model, loader, optimizer, criterion, loader_length):

    epoch_loss  = 0
    epoch_acc   = 0
    model.train()

    for i, (label, text, text_length) in enumerate(loader):

        label = label.to(device)
        text  = text.to(device)

        predictions = model(text, text_length).squeeze(1)
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)

        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc  += acc.item()

    return epoch_loss / loader_length, epoch_acc / loader_length

In [41]:
def evaluate(model, loader, criterion, loader_length):

    epoch_loss  = 0
    epoch_acc   = 0
    model.eval()

    with torch.no_grad():

        for i, (label, text, text_length) in enumerate(loader):

            label = label.to(device)
            text  = text.to(device)

            predictions = model(text, text_length).squeeze(1)
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc  += acc.item()

    return epoch_loss / loader_length, epoch_acc / loader_length

### Actual training

In [42]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(val_loader)))
test_loader_length  = len(list(iter(test_loader)))



In [43]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [44]:
import time

best_valid_loss = float('inf')
num_epochs      = 10

save_path       = f'models/{model.__class__.__name__}.pt'

train_losses, train_accs, val_losses, val_accs = [],[],[],[]

for epoch in range(num_epochs):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
    valid_loss, valid_acc = train(model, val_loader,   optimizer, criterion, val_loader_length)
    
    #for plotting
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    val_losses.append(valid_loss)
    val_accs.append(valid_acc)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #early stopping
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), save_path)
    
    print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal.  Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')
        

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)