In [None]:
import torch
from torchtext import data
from torchtext import datasets
import torchtext
import random
import torch.optim as optim
import time
import torch.nn as nn

SEED = 1234

# Seed every RNG this notebook draws from, not just torch: `random` is
# imported above but was never seeded.
# NOTE(review): legacy torchtext iterators appear to shuffle via Python's
# `random` module, so batch order would otherwise differ between runs — confirm
# against the installed torchtext version.
random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
# Load the Colab Drive helper and mount Google Drive at /content/drive.
# Later cells copy the dataset from the mounted drive and save the trained
# checkpoint back to it.
from google.colab import drive

# Interactive step: this prompts for an authorization code before mounting.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Copy the train/test CSVs from the mounted Drive folder into the current
# working directory, where the dataset-loading cell reads them (path './').
!cp drive/'My Drive'/EmotionRecognition/train_text.csv .
!cp drive/'My Drive'/EmotionRecognition/test_text.csv .

In [10]:
# Install the Russian spaCy tokenizer straight from GitHub, plus pymorphy2
# pinned to 0.8.
# NOTE(review): the git install is not pinned to a tag or commit, so a re-run
# may pick up a different revision of the tokenizer.
!pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
!pip install pymorphy2==0.8

Collecting git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
  Cloning https://github.com/aatimofeev/spacy_russian_tokenizer.git to /tmp/pip-req-build-hji4ao6v
  Running command git clone -q https://github.com/aatimofeev/spacy_russian_tokenizer.git /tmp/pip-req-build-hji4ao6v
Building wheels for collected packages: spacy-russian-tokenizer
  Building wheel for spacy-russian-tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for spacy-russian-tokenizer: filename=spacy_russian_tokenizer-0.1.1-cp36-none-any.whl size=12675 sha256=350dec57d0c0bcbabab664bb4a3e17ffd59ea1d7d9e91dc69afbb62881dee5e3
  Stored in directory: /tmp/pip-ephem-wheel-cache-wvt50pac/wheels/37/3b/bb/cfe712f7c0b78cd08f4a2ef122d17748baf9d4bebecf2e5a54
Successfully built spacy-russian-tokenizer
Collecting pymorphy2==0.8
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |████████████

In [None]:
from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS
# Create a Russian spaCy pipeline and register the RussianTokenizer component
# (configured with MERGE_PATTERNS) under the name 'russian_tokenizer'.
# NOTE(review): passing a component object to add_pipe looks like the spaCy 2.x
# calling convention — confirm against the spaCy version installed in the runtime.
nlp = Russian()
russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
def tokenize_ru(sentence):
    """Tokenize `sentence` with the module-level `nlp` pipeline.

    Returns a list containing the `.text` of every token the pipeline produces,
    in order.
    """
    doc = nlp(sentence)
    return [token.text for token in doc]

In [None]:
# Field for the 'file' column; default torchtext processing.
FILE = data.Field()
# Text column, tokenized with tokenize_ru. include_lengths=True is why
# train()/evaluate() unpack batch.text into (text, text_lengths).
TEXT = data.Field(tokenize = tokenize_ru, include_lengths = True)
# Class-label column, stored as torch.long.
LABEL = data.LabelField(dtype = torch.long)

In [None]:
# (attribute name, Field) pairs describing the CSV columns: file, text, label.
fields = [('file', FILE), ('text', TEXT), ('label', LABEL)]

# Load the train and test CSVs from the working directory, skipping each
# file's header row. No validation split is loaded (that line is commented out).
train_data, test_data = data.TabularDataset.splits(
                                        path = './',
                                        train = 'train_text.csv',
                                        # validation = 'valid.csv',
                                        test = 'test_text.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

In [None]:

# print(LABEL.vocab.stoi)

In [None]:
# train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
# train_data, test_data = train_data.split(random_state = random.seed(SEED))

In [186]:
# Sanity check: report how many examples each split contains.
print(f'Number of training examples: {len(train_data)}')
# print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 7158
Number of testing examples: 3069


In [None]:
import os
# Pre-trained fastText vectors for Russian (wiki.ru.vec).
# `name` is the file name component of the URL.
# NOTE(review): torchtext presumably downloads and caches the file from `url`
# when `name` is not already available locally — confirm cache location/size.
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.vec'
name = os.path.basename(url)
vec = torchtext.vocab.Vectors(name, url=url)

In [188]:
# Upper bound on the TEXT vocabulary size.
MAX_VOCAB_SIZE = 25_000

# vec = torchtext.vocab.FastText(language='en')

# Build the TEXT vocabulary from the training split only and attach the
# pre-trained vectors; unk_init (normal_) is the initializer passed for tokens
# that have no pre-trained vector.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = vec, 
                 unk_init = torch.Tensor.normal_)

# Label and file vocabularies are likewise built from the training split.
LABEL.build_vocab(train_data)
FILE.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 3614
Unique tokens in LABEL vocabulary: 3


In [None]:
# len(LABEL.vocab)

In [190]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Batch iterators for both splits, placing batches on `device`.
# sort=False together with the model's enforce_sorted=False packing means
# batches do not need to be ordered by length.
# NOTE(review): shuffle=True is passed for both splits, so the test iterator
# is presumably shuffled as well — harmless for aggregate metrics, but confirm.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size = BATCH_SIZE,
    shuffle = True,
    sort = False,
    device = device)

cuda


In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """(Bi)LSTM sequence classifier over packed, padded token batches.

    Tokens are embedded, run through a multi-layer LSTM, and the final hidden
    state of the top layer is projected to `output_dim` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits (classes).
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability for embeddings, between LSTM layers and
            on the final hidden state.
        pad_idx: index of the padding token in the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # Inter-layer dropout only exists when there is more than one layer;
        # passing it to a single-layer LSTM has no effect and raises a warning.
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # The classifier input width depends on the number of directions
        # instead of being hard-coded to two.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """Return logits of shape [batch size, output_dim].

        Args:
            text: token indices, [sent len, batch size].
            text_lengths: unpadded length of each sequence in the batch
                (tensor or list); does not need to be sorted.
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        # pack_padded_sequence expects the lengths on the CPU even when the
        # batch itself lives on the GPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        
        #pack sequence so the LSTM skips padding positions
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, enforce_sorted=False)
        
        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.rnn.bidirectional:
            #concat the top layer's final forward (hidden[-2]) and backward (hidden[-1]) states
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #unidirectional: the top layer's final state is the last entry
            hidden = hidden[-1,:,:]
        
        hidden = self.dropout(hidden)
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [None]:
# Model hyperparameters. INPUT_DIM and OUTPUT_DIM come from the built
# vocabularies; EMBEDDING_DIM must match the width of the pre-trained vectors
# copied into the embedding layer in a later cell.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Index of the padding token, passed to the embedding layer as padding_idx.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [193]:
# Vectors attached to the TEXT vocabulary by build_vocab (one row per token).
pretrained_embeddings = TEXT.vocab.vectors

# Expect [vocabulary size, EMBEDDING_DIM].
print(pretrained_embeddings.shape)

torch.Size([3614, 300])


In [194]:
# Overwrite the embedding weights in place with the pre-trained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0438,  0.1425,  0.1500,  ...,  0.0027, -0.0461,  0.0745],
        [ 0.5091, -0.2560, -0.6233,  ...,  0.0328, -0.0662,  0.0265],
        [ 0.3036, -0.6419, -0.7598,  ...,  0.2153, -0.3162,  0.2115]])

In [195]:

# Index of the unknown-word token in the TEXT vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the <unk> and <pad> embedding rows to all zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0438,  0.1425,  0.1500,  ...,  0.0027, -0.0461,  0.0745],
        [ 0.5091, -0.2560, -0.6233,  ...,  0.0328, -0.0662,  0.0265],
        [ 0.3036, -0.6419, -0.7598,  ...,  0.2153, -0.3162,  0.2115]])


In [None]:
# def binary_accuracy(preds, y):
#     """
#     Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
#     """

#     #round predictions to the closest integer
#     rounded_preds = torch.round(torch.sigmoid(preds))
#     correct = (rounded_preds == y).float() #convert into float for division 
#     acc = correct.sum() / len(correct)
#     return acc

def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: class scores, [batch size, n classes].
        y: target class indices, [batch size].

    Returns:
        A float tensor of shape [1] holding the fraction of rows whose argmax
        equals the target. It lives on the same device as the inputs.
    """
    predicted_classes = preds.argmax(dim = 1) # index of the highest score per row
    correct = predicted_classes.eq(y)
    # Build the divisor on the inputs' device rather than as a CPU-only legacy
    # FloatTensor, so the division never mixes devices.
    batch_size = torch.tensor([y.shape[0]], dtype = torch.float, device = correct.device)
    return correct.sum() / batch_size

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch's `text` attribute is unpacked into (tokens, lengths) and fed to
    the model; `criterion` is applied to the predictions and `batch.label`,
    followed by a backward pass and an optimizer step.

    Returns:
        (loss, accuracy): the per-batch loss and per-batch categorical
        accuracy, each summed over the batches and divided by len(iterator).
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        tokens, lengths = batch.text
        targets = batch.label

        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_acc = categorical_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating weights.

    The model is put in eval mode and gradient tracking is disabled for the
    whole pass.

    Returns:
        (loss, accuracy): the per-batch loss and per-batch categorical
        accuracy, each summed over the batches and divided by len(iterator).
    """
    model.eval()

    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            targets = batch.label

            logits = model(tokens, lengths).squeeze(1)
            batch_loss = criterion(logits, targets)
            batch_acc = categorical_accuracy(logits, targets)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        (minutes, seconds) as ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

In [None]:

# Adam over all model parameters, with cross-entropy loss on the class logits.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)
# Alternative considered: optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
# Multiply the learning rate by 0.1 every 50 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)

In [None]:
N_EPOCHS = 500

# Lowest evaluation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float('inf')
# Per-epoch history of losses and accuracies.
train_losses = []
test_losses = []
train_acces = []
test_acces = []

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # NOTE(review): there is no separate validation split, so the "valid"
    # metrics (and therefore checkpoint selection below) are computed on the
    # test set. The final test score is optimistically biased as a result.
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()
    train_losses.append(train_loss)
    test_losses.append(valid_loss)
    train_acces.append(train_acc)
    test_acces.append(valid_acc)
    # StepLR is driven by the epoch count and takes no metric. Passing
    # train_loss here was interpreted as the (deprecated) `epoch` argument,
    # which kept the schedule from ever advancing.
    scheduler.step()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights from the epoch with the lowest evaluation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'text-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print('LR:', scheduler.get_last_lr())



Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.516 | Train Acc: 74.43%
	 Val. Loss: 1.677 |  Val. Acc: 58.80%
LR: [0.001]
Epoch: 02 | Epoch Time: 0m 4s
	Train Loss: 0.515 | Train Acc: 74.41%
	 Val. Loss: 1.810 |  Val. Acc: 58.88%
LR: [0.001]
Epoch: 03 | Epoch Time: 0m 4s
	Train Loss: 0.516 | Train Acc: 74.45%
	 Val. Loss: 1.786 |  Val. Acc: 58.12%
LR: [0.001]
Epoch: 04 | Epoch Time: 0m 4s
	Train Loss: 0.516 | Train Acc: 74.41%
	 Val. Loss: 1.740 |  Val. Acc: 59.11%
LR: [0.001]
Epoch: 05 | Epoch Time: 0m 4s
	Train Loss: 0.519 | Train Acc: 74.26%
	 Val. Loss: 1.763 |  Val. Acc: 59.20%
LR: [0.001]
Epoch: 06 | Epoch Time: 0m 4s
	Train Loss: 0.519 | Train Acc: 74.25%
	 Val. Loss: 1.633 |  Val. Acc: 58.93%
LR: [0.001]
Epoch: 07 | Epoch Time: 0m 4s
	Train Loss: 0.517 | Train Acc: 74.14%
	 Val. Loss: 1.649 |  Val. Acc: 58.54%
LR: [0.001]
Epoch: 08 | Epoch Time: 0m 4s
	Train Loss: 0.514 | Train Acc: 74.40%
	 Val. Loss: 1.676 |  Val. Acc: 58.96%
LR: [0.001]
Epoch: 09 | Epoch Time: 0m 4s
	Train Los

In [None]:
# Back up the saved checkpoint to the mounted Drive folder.
!cp text-model.pt drive/'My Drive'/EmotionRecognition/

In [None]:
# Restore the checkpoint written by the training loop (lowest evaluation loss).
# map_location lets a checkpoint saved on a GPU runtime load on whichever
# device this session selected, including CPU-only runtimes.
model.load_state_dict(torch.load('text-model.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')