In [0]:
# NOTE(review): this loop never terminates and grows `d` without bound, so
# running the cell exhausts the kernel's memory. Presumably a deliberate way
# to crash the Colab runtime (e.g. to be offered a larger-RAM instance) —
# confirm, and skip this cell on a normal "Run All".
d=[]
while(1):
  d.append('1')

In [0]:
from google.colab import drive
# Mount Google Drive under /content/drive (interactive authorisation).
# The '/content/drive/My Drive/...' paths used by later cells rely on it.
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# path to the embeddings file on the mounted Drive;
# it is handed to fasttext.load_model in the fastText cell
embed_path = '/content/drive/My Drive/cc.ru.300.bin'

# Word embeddings

## Fasttext

In [0]:
!pip install fasttext

import fasttext.util
import fasttext

# Load the fastText model once; `ft` is passed to train_model and is read as
# a global by test_sentences / tag_sentence to look up word vectors.
ft = fasttext.load_model(embed_path)

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/10/61/2e01f1397ec533756c1d893c22d9d5ed3fce3a6e4af1976e0d86bb13ea97/fasttext-0.9.1.tar.gz (57kB)
[K     |█████▊                          | 10kB 26.9MB/s eta 0:00:01[K     |███████████▍                    | 20kB 1.5MB/s eta 0:00:01[K     |█████████████████               | 30kB 1.7MB/s eta 0:00:01[K     |██████████████████████▊         | 40kB 2.0MB/s eta 0:00:01[K     |████████████████████████████▍   | 51kB 1.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 1.7MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.1-cp36-cp36m-linux_x86_64.whl size=2385945 sha256=c855ad0aa29cacb6486e80354f10cd74cd8e8b9799ffb41816befb1083fb5cd9
  Stored in directory: /root/.cache/pip/wheels/9f/f0/04/caa82c912aee89ce76358ff954f3f0729b7577c8ff23a292e3
Successfully built fasttext
Installing c




# Model

In [0]:
import os
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import shutil

from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

In [0]:
use_cuda = True
# torch.device(...) only builds a device handle and has no side effect, so
# the result has to be kept in a variable to be useful. Falls back to the CPU
# when CUDA is disabled or unavailable. Modules and tensors still have to be
# moved explicitly with `.to(device)`.
device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')

In [0]:
# constants
EMBED_DIM = 300    # number of columns in the embedding weight matrix
BATCH_SIZE = 512   # sentences per DataLoader batch during training
EPOCHS = 10        # number of passes over the data in train_model
PAD_WORD = '`'     # extra vocabulary entry used to pad short sentences
PAD_TAG = -1       # tag index of padded positions; given to the loss as ignore_index

In [0]:
def save_model(state, is_best, checkpoint_dir, best_model_dir):
    '''
        writes a checkpoint to disk and optionally copies it as the best model

        state          - object handed to torch.save
        is_best        - when true the checkpoint is also copied to
                         <best_model_dir>/best_model.pt
        checkpoint_dir - despite its name this is a file-name prefix: the
                         checkpoint is written to models/<checkpoint_dir>_model.pt
        best_model_dir - directory receiving the copy of the best model
    '''
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation
    os.makedirs('models', exist_ok=True)
    f_path = 'models/' + checkpoint_dir + '_model.pt'
    torch.save(state, f_path)

    if is_best:
        os.makedirs(best_model_dir, exist_ok=True)
        shutil.copyfile(f_path, best_model_dir + '/best_model.pt')


def load_model(checkpoint_fpath, model, optimizer=None, map_location=None):
    '''
        restores model (and optionally optimizer) state from a checkpoint

        checkpoint_fpath - file with keys 'state_dict', 'optimizer', 'epoch'
        model            - module whose weights are overwritten in place
        optimizer        - optimizer to restore; pass None (e.g. for
                           inference) to skip the optimizer state
        map_location     - forwarded to torch.load, e.g. 'cpu' to open a
                           checkpoint on a machine without the original device

        returns (model, optimizer, epoch stored in the checkpoint)
    '''
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch']

In [0]:
class Corpus(Dataset):
    '''
        Dataset wrapper around a list of (words, tags) sentence pairs

        every item is returned as two space-joined strings so that the
        DataLoader can batch sentences of different lengths
    '''
    def __init__(self, word_to_ix, tag_to_ix, data):
        self.data = data
        self.tag_to_ix = tag_to_ix
        self.word_to_ix = word_to_ix

    def __len__(self):
        '''
            number of sentences
        '''
        return len(self.data)

    def __getitem__(self, idx):
        '''
            returns ("w1 w2 ...", "t1 t2 ...") for sentence number idx
        '''
        words, labels = self.data[idx]
        joined_words = ' '.join(words)
        joined_labels = ' '.join(labels)
        return joined_words, joined_labels

In [0]:
class Event_tagger(nn.Module):
    '''
        sequence tagger: word embeddings -> (Bi-)LSTM -> linear layer that
        produces one score per tag for every token
    '''
    def __init__(self,
                 word_to_ix,
                 tag_to_ix,
                 ix_to_tag,
                 weights_matrix,
                 batch_size,
                 word_embed_dim=300,
                 lstm_num_layers=2,
                 bidirectional=True,
                 dropout=.5):
        '''
            initialize model
            word_to_ix      - word -> index dictionary (stored only)
            tag_to_ix       - tag -> index dictionary; its size is the
                              number of outputs of the linear layer
            ix_to_tag       - index -> tag dictionary (stored only)
            weights_matrix  - embedding weights, handed to create_embed_layer
            batch_size      - batch size used for the initial hidden state
            word_embed_dim  - dimension of word embeddings; also used as the
                              LSTM hidden size
            lstm_num_layers - number of LSTM layers
            bidirectional   - run the LSTM in both directions
            dropout         - dropout rate between LSTM layers
        '''
        super(Event_tagger, self).__init__()

        # dictionaries
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.ix_to_tag = ix_to_tag

        # parameters
        self.batch_size = batch_size
        self.word_embed_dim = word_embed_dim
        self.lstm_hidden_dim = self.word_embed_dim
        self.lstm_num_layers = lstm_num_layers
        self.dropout = dropout

        self.num_directions = 2 if bidirectional else 1

        # layers are registered in the order embeddings -> lstm -> dense;
        # keep it, saved optimizer states refer to parameters by position
        self.word_embeds, _ = create_embed_layer(weights_matrix, True)
        self.lstm_hidden_embeds = self.init_hidden_embeddings(self.batch_size)

        self.lstm = nn.LSTM(self.word_embed_dim,
                            self.lstm_hidden_dim,
                            dropout=self.dropout,
                            num_layers=self.lstm_num_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        self.dense = nn.Linear(self.lstm_hidden_dim * self.num_directions,
                               len(self.tag_to_ix))

    def init_hidden_embeddings(self, batch_size):
        '''
            returns a fresh (h_0, c_0) pair of zero tensors with shape
            (num_layers * num_directions, batch_size, hidden_dim)
        '''
        state_shape = (self.lstm_num_layers * self.num_directions,
                       batch_size, self.lstm_hidden_dim)
        return (torch.zeros(*state_shape), torch.zeros(*state_shape))

    def forward(self, words_batch):
        '''
            words_batch - contains indices of words in current batch with shape
                        (batch_size, num_words_in_sentence)

            looks up the word embeddings, runs the LSTM over them starting
            from self.lstm_hidden_embeds and feeds the LSTM output to the
            linear layer

            returns unnormalised tag scores (logits) with shape
            (batch_size, num_words_in_sentence, number_of_tags); no softmax
            is applied here because nn.CrossEntropyLoss expects raw scores
            and argmax is unaffected by it

            the final LSTM state is stored back in self.lstm_hidden_embeds
        '''
        batch_size = words_batch.shape[0]

        # create word-level word embeddings, (batch, words, embed_dim)
        word_embeddings = self.word_embeds(words_batch)
        batch_sent_embed = word_embeddings.view(batch_size, -1,
                                                word_embeddings.shape[-1])

        # a stored state sized for another batch would make the LSTM raise,
        # so start from zeros in that case
        hidden = self.lstm_hidden_embeds
        if hidden is None or hidden[0].shape[1] != batch_size:
            hidden = self.init_hidden_embeddings(batch_size)
        hidden = tuple(state.to(batch_sent_embed.device) for state in hidden)

        # create final word representation from LSTM
        lstm_out, hidden = self.lstm(batch_sent_embed, hidden)

        # keep only the values: holding on to the autograd graph would leak
        # memory and break backward() when the state is reused by a later call
        self.lstm_hidden_embeds = tuple(state.detach() for state in hidden)

        # scores for every event tag
        return self.dense(lstm_out)

In [0]:
def create_vocabulary(df):
    '''
        creates vocabulary from dataset

        df - frame with the columns `word` and `type` (plus whatever
             create_data reads)

        returns word_to_ix, (tag_to_ix, ix_to_tag), data
            indices follow the order of first appearance in df, so the same
            frame always yields the same dictionaries (iterating a set does
            not guarantee that between interpreter runs, which makes saved
            checkpoints unusable with a rebuilt vocabulary)
    '''
    word_to_ix = {word: ix for ix, word in enumerate(dict.fromkeys(df.word))}

    tag_to_ix = {tag: ix for ix, tag in enumerate(dict.fromkeys(df.type))}
    ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

    # the padding word gets the next free index; if the corpus already
    # contains it, its index is kept so that all indices stay within
    # range(len(word_to_ix))
    word_to_ix.setdefault(PAD_WORD, len(word_to_ix))
    data = create_data(df)

    return word_to_ix, (tag_to_ix, ix_to_tag), data

In [0]:
def create_data(df):
    '''
        splits the token table into sentences

        df - frame with the columns `doc`, `sentence`, `word` and `type`

        returns [(words, tags), ...] with one entry per (doc, sentence)
        pair, ordered by doc and sentence id; words and tags keep the row
        order of df

        a single groupby replaces re-filtering the whole frame for every
        document and every sentence
    '''
    sentences = []
    for _, df_sent in df.groupby(['doc', 'sentence']):
        sentences.append((list(df_sent.word), list(df_sent.type)))
    return sentences

In [0]:
def create_embed_weights(target_vocab, ft):
    '''
        builds the embedding weight matrix for the dataset vocabulary

        target_vocab - word -> row index dictionary
        ft           - object with a get_word_vector(word) method

        row `ix` holds ft.get_word_vector(word); when that lookup raises
        KeyError the row is filled with a random normal vector (scale 0.6)

        returns a tensor of shape (len(target_vocab), EMBED_DIM)
    '''
    vocab_size = len(target_vocab)
    embeddings = np.zeros((vocab_size, EMBED_DIM))

    for token, row in target_vocab.items():
        try:
            vector = ft.get_word_vector(token)
        except KeyError:
            vector = np.random.normal(scale=0.6, size=(EMBED_DIM, ))
        embeddings[row] = vector

    return torch.as_tensor(embeddings)

In [0]:
def create_embed_layer(weights_matrix, non_trainable=True):
    '''
        wraps a weight matrix in an nn.Embedding layer

        weights_matrix - 2-D tensor (number of embeddings, embedding dim)
        non_trainable  - when true the weights are excluded from gradient
                         computation

        returns (embedding layer, number of embeddings)
    '''
    vocab_size, vector_dim = weights_matrix.size()

    layer = nn.Embedding(vocab_size, vector_dim)
    # load_state_dict copies the values into the layer's own parameter
    layer.load_state_dict({'weight': weights_matrix})

    if non_trainable:
        layer.weight.requires_grad_(False)

    return layer, vocab_size

In [0]:
def get_indices(sentence, word_to_ix):
    '''
        maps every token of sentence to its index in word_to_ix

        raises KeyError for a token missing from the vocabulary
    '''
    return [word_to_ix[token] for token in sentence]


def get_batch(batch, word_to_ix, tag_to_ix):
    '''
        turns a batch of joined strings into lists of indices

        batch - (sentences, tags): two parallel sequences of strings as
                produced by Corpus.__getitem__

        returns ((word index lists, tag index lists), longest sentence
        length in words), the length being what the batch is padded to
    '''
    sentences_batch, tag_batch = batch

    words_batch, tags_batch = [], []
    max_sent_len = 0

    for position, raw_sentence in enumerate(sentences_batch):
        # words are split on single spaces, tags on any whitespace
        tokens = raw_sentence.split(' ')
        labels = tag_batch[position].split()

        words_batch.append(get_indices(tokens, word_to_ix))
        tags_batch.append([tag_to_ix[label] for label in labels])

        if len(tokens) > max_sent_len:
            max_sent_len = len(tokens)

    return (words_batch, tags_batch), max_sent_len

In [0]:
def pad_indices(indices, word_to_ix, length):
    '''
        performs padding of word and tag batches

        indices:
                indices[0] - word_indices, one list per sentence
                indices[1] - tag_indices, one list per sentence
        word_to_ix - vocabulary; must contain PAD_WORD when padding is needed
        length     - number of words every sentence is padded to

        returns (words, tags): two long tensors built with torch.stack,
        one row per sentence

        the lists passed in are left untouched; padded copies are built
        instead of appending to and overwriting the caller's data
    '''
    padded_words, padded_tags = [], []

    for word_row, tag_row in zip(indices[0], indices[1]):
        # both rows receive as many pad entries as the word row is short of
        missing = length - len(word_row)
        if missing > 0:
            word_row = list(word_row) + [word_to_ix[PAD_WORD]] * missing
            tag_row = list(tag_row) + [PAD_TAG] * missing
        padded_words.append(torch.tensor(word_row, dtype=torch.long))
        padded_tags.append(torch.tensor(tag_row, dtype=torch.long))

    return (torch.stack(padded_words), torch.stack(padded_tags))

In [0]:
def train_model(df, ft, doc_id=0, continue_training=False):
    '''
        trains an Event_tagger on df and saves a checkpoint

        df                - token table handed to create_vocabulary
        ft                - embedding model handed to create_embed_weights
        doc_id            - counter used to name the checkpoint
                            (models/<doc_id>_model.pt)
        continue_training - when true, model and optimizer state are first
                            restored from
                            '/content/drive/My Drive/<doc_id - 1>_model-2.pt'

        returns (model, doc_id + 1, word_to_ix, tag_to_ix, ix_to_tag)
    '''
    start = datetime.now()
    # reads vocabulary and creates corpus
    # :tag_dicts: - (tag_to_ix, ix_to_tag)
    # :data: - [([token, token, ...], [tag, tag, ...])]
    word_to_ix, tag_dicts, data = create_vocabulary(df)
    tag_to_ix, ix_to_tag = tag_dicts

    # initialize weight matrix from fasttext
    print('create weight matrix')
    weights_matrix = create_embed_weights(word_to_ix, ft)
    # initialize model
    # layers - Embeddings -> Bi-LSTM -> Dense
    model = Event_tagger(word_to_ix, tag_to_ix, ix_to_tag,
                         weights_matrix, BATCH_SIZE)

    # loss function and optimizer; padded positions are ignored by the loss
    loss_function = nn.CrossEntropyLoss(ignore_index=PAD_TAG)
    optimizer = optim.Adam(model.parameters())

    if continue_training:
        model_path = '/content/drive/My Drive/' + str(doc_id - 1) + '_model-2.pt'
        # the epoch stored in the checkpoint is not used: training always
        # runs EPOCHS further epochs
        model, optimizer, _ = load_model(model_path, model, optimizer)

    print('start...')
    # Dataset wrapper so that the DataLoader can create the batches
    corpus = Corpus(word_to_ix, tag_to_ix, data)
    dataloader = DataLoader(dataset=corpus,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            num_workers=8)
    total_batches = math.ceil(len(data) / BATCH_SIZE)

    model.train()
    # remembered separately so that the checkpoint below can be written
    # even when the loop body never runs (EPOCHS == 0)
    last_epoch = 0
    for epoch in range(EPOCHS):
        last_epoch = epoch
        for batch_index, batch in enumerate(dataloader):
            # Clear gradients and hidden layer
            model.zero_grad()
            model.lstm_hidden_embeds = model.init_hidden_embeddings(
                len(batch[0]))

            # batch_indices - (words_batch, tags_batch)
            batch_indices, length = get_batch(batch, word_to_ix, tag_to_ix)
            words, tags = pad_indices(batch_indices, word_to_ix, length)

            tag_scores_batch = model(words)
            # flatten to (tokens, tags) / (tokens,) for the loss
            target = tags.view(-1)
            output = tag_scores_batch.view(-1, tag_scores_batch.size()[-1])

            loss = loss_function(output, target)
            loss.backward()
            optimizer.step()

            # progress report on the first batch, every 10th batch and the
            # last batch of an epoch
            batch_number = batch_index + 1
            if (batch_index == 0 or batch_number % 10 == 0
                    or batch_number == total_batches):
                # F1 of the current batch only, padded positions excluded;
                # computed on whole tensors instead of token by token
                with torch.no_grad():
                    real_tokens = target != PAD_TAG
                    y_true = target[real_tokens].tolist()
                    predicted_tags = output.argmax(dim=1)[real_tokens].tolist()
                score = f1_score(y_true, predicted_tags, average='weighted')
                print(
                    "Epoch {}/{} | Batch {}/{} | Loss {:.3f} | F1 score {:.3f}"
                    .format(epoch + 1, EPOCHS, batch_number,
                            total_batches, loss.item(), score))
    checkpoint = {
        'epoch': doc_id * last_epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    save_model(checkpoint, False, str(doc_id), 'best_model')
    print('model saved')
    doc_id += 1
    end = datetime.now()
    print('Finished... {}'.format(end - start))

    return model, doc_id, word_to_ix, tag_to_ix, ix_to_tag

In [0]:
# Token-level corpus; the functions in this notebook read its columns
# `doc`, `sentence`, `word` and `type`.
df = pd.read_csv('/content/drive/My Drive/output.csv')

# Test

In [0]:
def test_train_split(df, ratio=.2):
    max_doc = df.iloc[-1].doc
    # :ratio: - percentage of data left to testing
    split_doc = max_doc * (1 - ratio)

    df_train = df[df.doc <= split_doc]
    df_test = df[df.doc > split_doc]

    return df_train, df_test

In [0]:
def test_sentences(df, word_to_ix, tag_to_ix, ix_to_tag, doc_id):
    '''
        evaluates a saved checkpoint and prints the weighted F1 score of
        every document in df

        df         - token table with the columns read by create_data
        word_to_ix - vocabulary the checkpoint was trained with
        tag_to_ix  - tag -> index dictionary the checkpoint was trained with
        ix_to_tag  - index -> tag dictionary
        doc_id     - selects the checkpoint: models/<doc_id - 2>_model.pt

        uses the global `ft` to rebuild the embedding matrix
        raises KeyError when df contains a tag missing from tag_to_ix
    '''
    batch_size = 1

    print('create weight matrix')
    weights_matrix = create_embed_weights(word_to_ix, ft)
    model = Event_tagger(word_to_ix, tag_to_ix, ix_to_tag,
                         weights_matrix, BATCH_SIZE)

    # the optimizer is created only because load_model restores it as well
    optimizer = optim.Adam(model.parameters())

    # load model
    model_path = 'models/' + str(doc_id - 2) + '_model.pt'
    model, optimizer, _ = load_model(model_path, model, optimizer)
    model.lstm.flatten_parameters()
    # evaluation mode switches the LSTM dropout off
    model.eval()

    last_doc = df.doc.max()
    with torch.no_grad():
        # create_data returns a flat list of sentences, so the documents
        # are separated here and scored one at a time
        for current_doc, df_doc in df.groupby('doc'):
            sentences = create_data(df_doc)
            word_indices = sentences_to_indices(sentences, word_to_ix)

            # predictions and gold tags of the whole document
            predicted_tags = []
            y_true = []
            for sent_indices, sentence in zip(word_indices, sentences):
                if sent_indices.numel() == 0:
                    continue
                # every sentence starts from a zero hidden state
                model.lstm_hidden_embeds = model.init_hidden_embeddings(
                    batch_size)
                tag_scores = model(sent_indices.unsqueeze(0))
                # tag_scores[0] is (words, tags) even for one-word sentences
                predicted_tags.extend(tag_scores[0].argmax(dim=-1).tolist())
                y_true.extend(tag_to_ix[tag] for tag in sentence[1])

            # more than two tags: an explicit averaging mode is required
            score = f1_score(y_true, predicted_tags, average='weighted')
            print("Testing... Doc id {}/{} | F1 score {:.3f}".format(current_doc,
                                                                     last_doc,
                                                                     score))

# Check

In [0]:
def sentences_to_indices(sentences, word_to_ix):
    '''
        sentences - sequence of items whose first element is the token list,
                    e.g. [([token, ...], ...), ...]
        converts the tokens of every sentence to word indices with
        get_indices_train and returns the results as a list
    '''
    return [get_indices_train(sentence[0], word_to_ix)
            for sentence in sentences]

In [0]:
def get_indices_train(sentence, word_to_ix):
    '''
        converts the tokens of sentence to a long tensor of word indices

        tokens missing from word_to_ix are replaced by PAD_WORD before the
        lookup, i.e. unknown words share the index of the padding word
    '''
    known_or_pad = [token if token in word_to_ix else PAD_WORD
                    for token in sentence]
    return torch.tensor([word_to_ix[token] for token in known_or_pad],
                        dtype=torch.long)

In [0]:
def add_tags_to_tokens(tokens, tags):
    '''
        tokens - pair of parallel sequences; tokens[0][i] is put in front of
                 the tag and tokens[1][i] behind it
        tags   - [tag, tag, ...], one per entry of tokens[0]

        builds "first/tag/second" for every position and joins the pieces
        with single spaces
    '''
    pieces = []
    for position, first in enumerate(tokens[0]):
        pieces.append('/'.join((first, tags[position], tokens[1][position])))
    return ' '.join(pieces)

In [0]:
def save_answer(out_file, output):
    '''
        output - [string, string, ...]
        writes output to out_file, one item per line

        the file is written as UTF-8 explicitly so that non-ASCII tokens are
        stored the same way regardless of the platform's default encoding
    '''
    with open(out_file, 'w', encoding='utf-8') as f:
        for item in output:
            f.write("%s\n" % item)

In [0]:
def tag_sentence(df, word_to_ix, tag_to_ix, ix_to_tag, doc_id):
    '''
        tags every sentence of one document and writes the result to
        <doc_id>.txt, one sentence per line

        df         - token table with the columns read by create_data
        word_to_ix - vocabulary the checkpoint was trained with
        tag_to_ix  - tag -> index dictionary the checkpoint was trained with
        ix_to_tag  - index -> tag dictionary used to name the predictions
        doc_id     - document to tag; also selects the checkpoint
                     <doc_id>/model.pt and the output file name

        uses the global `ft` to rebuild the embedding matrix
    '''
    batch_size = 1

    print('create weight matrix')
    weights_matrix = create_embed_weights(word_to_ix, ft)
    model = Event_tagger(word_to_ix, tag_to_ix, ix_to_tag,
                         weights_matrix, BATCH_SIZE)

    # the optimizer is created only because load_model restores it as well
    optimizer = optim.Adam(model.parameters())

    # load model
    # NOTE(review): save_model writes to models/<id>_model.pt, this path
    # has a different layout - confirm where the checkpoint really lives
    model_path = str(doc_id) + '/model.pt'
    model, optimizer, _ = load_model(model_path, model, optimizer)
    model.lstm.flatten_parameters()
    # evaluation mode switches the LSTM dropout off
    model.eval()

    # create_data returns a flat list of sentences, so the rows of the
    # requested document are selected before it is called
    sentences = create_data(df[df.doc == doc_id])
    word_indices = sentences_to_indices(sentences, word_to_ix)

    # run the document through the model
    output = []
    with torch.no_grad():
        for sentence, sent_indices in zip(sentences, word_indices):
            predicted_tags = []
            if sent_indices.numel() > 0:
                # every sentence starts from a zero hidden state
                model.lstm_hidden_embeds = model.init_hidden_embeddings(
                    batch_size)
                tag_scores = model(sent_indices.unsqueeze(0))
                # tag_scores[0] is (words, tags) even for one-word sentences
                predicted_tags = [ix_to_tag[ix] for ix in
                                  tag_scores[0].argmax(dim=-1).tolist()]
            # sentence is (words, tags from df): word/predicted/original
            output.append(add_tags_to_tokens(sentence, predicted_tags))

    # Save answer
    out_file = str(doc_id) + '.txt'
    save_answer(out_file, output)

# Main

In [0]:
# Rows with the highest document ids are held out for testing; the model is
# trained on the rest.
df_train, df_test = test_train_split(df)

# train_model also returns the dictionaries needed to use the model later
model, doc_id, word_to_ix, tag_to_ix, ix_to_tag = train_model(df_train, ft)
# evaluation on the held-out documents (currently disabled)
# test_sentences(df_test, word_to_ix, tag_to_ix, ix_to_tag, doc_id + 1)

# tagging of a single document (currently disabled)
# print(doc_id)
# tag_sentence(df, word_to_ix, tag_to_ix, ix_to_tag, doc_id)

create weight matrix
start...
Epoch 1/50 | Batch 1/32 | Loss 3.996 | F1 score 0.000
Epoch 1/50 | Batch 10/32 | Loss 1.677 | F1 score 0.541
Epoch 1/50 | Batch 20/32 | Loss 1.269 | F1 score 0.611
Epoch 1/50 | Batch 30/32 | Loss 1.595 | F1 score 0.514
Epoch 1/50 | Batch 32/32 | Loss 1.301 | F1 score 0.622
Epoch 2/50 | Batch 1/32 | Loss 1.596 | F1 score 0.516
Epoch 2/50 | Batch 10/32 | Loss 1.425 | F1 score 0.541
Epoch 2/50 | Batch 20/32 | Loss 1.205 | F1 score 0.611
Epoch 2/50 | Batch 30/32 | Loss 1.474 | F1 score 0.518
Epoch 2/50 | Batch 32/32 | Loss 1.183 | F1 score 0.635
Epoch 3/50 | Batch 1/32 | Loss 1.451 | F1 score 0.523
Epoch 3/50 | Batch 10/32 | Loss 1.255 | F1 score 0.561
Epoch 3/50 | Batch 20/32 | Loss 1.020 | F1 score 0.650
Epoch 3/50 | Batch 30/32 | Loss 1.278 | F1 score 0.574
Epoch 3/50 | Batch 32/32 | Loss 0.974 | F1 score 0.675
Epoch 4/50 | Batch 1/32 | Loss 1.249 | F1 score 0.575
Epoch 4/50 | Batch 10/32 | Loss 1.038 | F1 score 0.638
Epoch 4/50 | Batch 20/32 | Loss 0.861 |

In [0]:
# Event_tagger.__init__ already stores ix_to_tag; it is set again here right
# before the model object is written out.
# NOTE(review): torch.save(model, ...) pickles the whole module object, so
# loading 'model.pt' needs the Event_tagger class to be available; saving
# model.state_dict() is the more portable format - confirm what the
# consumers of this file expect.
model.ix_to_tag = ix_to_tag
torch.save(model, 'model.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
doc_id

1

In [0]:
df_test.doc

303705     889
303706     889
303707     889
303708     889
303709     889
          ... 
370740    1110
370741    1110
370742    1110
370743    1110
370744    1110
Name: doc, Length: 67040, dtype: int64

In [0]:
import os, shutil
# Housekeeping: removes every entry of the `models` directory except the
# checkpoint '541_model.pt'. Destructive - files and symlinks are unlinked,
# sub-directories are deleted recursively.
folder = 'models'
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    if filename != '541_model.pt':
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # best effort: report the failure and go on with the next entry
            print('Failed to delete %s. Reason: %s' % (file_path, e))

# Some stuff

In [0]:
'''
    creating word embeddings for each word in a sentence
    
    word_embeddings - dictionary {doc_id: [sentence_num - [word_num, ...], ...]}
'''

# NOTE(review): `index` is not defined by any cell shown in this notebook, so
# this cell fails with NameError on a fresh kernel. From the loops below it
# is expected to map a doc id to a list of sentences, each sentence being an
# iterable of words - define it first or drop the cell.
word_embeddings = {}
for doc in index:
    for sentence in index[doc]:

        # first sentence of a document: start its list
        if doc not in word_embeddings:
            word_embeddings[doc] = []

        # one fastText vector per word of the sentence
        word_embeddings[doc].append([ft.get_word_vector(word) for word in sentence])

NameError: ignored