In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

# load dataset
!cp /content/gdrive/My\ Drive/abbyy-nlp/train.jsonl /content
!cp /content/gdrive/My\ Drive/abbyy-nlp/dev.jsonl /content

!git clone https://github.com/Muhamob/BiDAF-pytorch.git
%cd BiDAF-pytorch
!git fetch
!git checkout boolq
!pip install -r requirements.txt

import nltk
nltk.download('punkt')

!mkdir .data/boolq
!mv /content/*.jsonl .data/boolq
!ls -la .data/boolq

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
Cloning into 'BiDAF-pytorch'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 146 (delta 30), reused 31 (delta 13), pack-reused 91[K
Receiving objects: 100% (146/146), 8.67 MiB | 16.90 MiB/s, done.
Resolving deltas: 100% (71/71), done.
/content/BiDAF-pytorch
Branch 'boolq' set up to track remote branch 'boolq' from 'origin'.
Switched to a new branch

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


total 8572
drwxr-xr-x 2 root root    4096 May 11 14:31 .
drwxr-xr-x 4 root root    4096 May 11 14:31 ..
-rw------- 1 root root 2238726 May 11 14:29 dev.jsonl
-rw------- 1 root root 6525813 May 11 14:29 train.jsonl


# Dataset

In [2]:
!head .data/boolq/dev.jsonl -n 1

{"question": "does ethanol take more energy make that produces", "title": "Ethanol fuel", "answer": false, "passage": "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradicto

In [None]:
import json
import os
import nltk
import torch

from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe


def word_tokenize(tokens):
    """Split text with ``nltk.word_tokenize`` and normalise quote markers.

    Args:
        tokens: the text handed to ``nltk.word_tokenize``.

    Returns:
        A list with one string per NLTK token, in which every occurrence of
        ``''`` or of two backticks has been replaced by a double quote.
    """
    normalized = []
    for piece in nltk.word_tokenize(tokens):
        piece = piece.replace("''", '"')
        normalized.append(piece.replace("``", '"'))
    return normalized


class BoolQ():
    """Prepare the BoolQ train/dev splits as torchtext datasets and iterators.

    Construction reads ``train.jsonl`` / ``dev.jsonl`` from ``.data/boolq``
    (or the cached examples under ``.data/boolq/torchtext/``), builds the
    character and word vocabularies and creates the batch iterators.

    Attributes set by ``__init__``:
        RAW, CHAR_NESTING, CHAR, WORD, LABEL: the torchtext field objects.
        train, dev: datasets with the fields ``answer``, ``c_word``,
            ``c_char``, ``q_word`` and ``q_char`` (``c_*`` come from the JSON
            ``passage`` key, ``q_*`` from ``question``).
        train_iter, dev_iter: ``BucketIterator`` objects over ``train``/``dev``.
    """

    def __init__(self,
                 word_dim: int,
                 train_batch_size: int,
                 dev_batch_size: int,
                 context_threshold: int = 0,
                 gpu: int = 0):
        """Load (or build and cache) the splits, then build vocab and iterators.

        Args:
            word_dim: dimension passed to ``GloVe(name='6B', dim=word_dim)``.
            train_batch_size: batch size of ``train_iter``.
            dev_batch_size: batch size of ``dev_iter``.
            context_threshold: when positive, training examples whose
                ``c_word`` has more than this many tokens are dropped. The dev
                split is never filtered.
            gpu: index of the CUDA device used when CUDA is available;
                otherwise the iterators are placed on the CPU.
        """
        path = '.data/boolq'
        dataset_path = path + '/torchtext/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        self.RAW = data.RawField()
        # explicit declaration for torchtext compatibility
        self.RAW.is_target = False
        self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True)
        self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False)

        # Mapping used when parsing the raw JSON lines: JSON key -> field(s).
        dict_fields = {'answer': ('answer', self.LABEL),
                       'passage': [('c_word', self.WORD), ('c_char', self.CHAR)],
                       'question': [('q_word', self.WORD), ('q_char', self.CHAR)]}

        # Flat (name, field) list used when re-wrapping cached examples.
        list_fields = [('answer', self.LABEL),
                       ('c_word', self.WORD), ('c_char', self.CHAR),
                       ('q_word', self.WORD), ('q_char', self.CHAR)]

        # Trust the cache only when BOTH example files exist. Checking just the
        # directory would make every later run crash in torch.load after an
        # interrupted first run left the directory without its files.
        cache_ready = (os.path.isfile(train_examples_path)
                       and os.path.isfile(dev_examples_path))

        if cache_ready:
            print("loading splits...")
            # NOTE: torch.load unpickles the file; only load caches that were
            # written by this class.
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)

            self.train = data.Dataset(examples=train_examples, fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.train, self.dev = data.TabularDataset.splits(
                path=path,
                train='train.jsonl',
                validation='dev.jsonl',
                format='json',
                fields=dict_fields)

            # The directory may already exist from an earlier, incomplete run.
            os.makedirs(dataset_path, exist_ok=True)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        # Drop training examples with an overly long context, for efficiency.
        # This happens after caching, so the cache always holds every example.
        if context_threshold > 0:
            self.train.examples = [e for e in self.train.examples if len(e.c_word) <= context_threshold]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=word_dim))

        print("building iterators...")
        device = torch.device("cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")
        print(device)
        self.train_iter = data.BucketIterator(
            self.train,
            batch_size=train_batch_size,
            device=device,
            repeat=True,
            shuffle=True,
            sort_key=lambda x: len(x.c_word)
        )

        self.dev_iter = data.BucketIterator(
            self.dev,
            batch_size=dev_batch_size,
            device=device,
            repeat=False,
            sort_key=lambda x: len(x.c_word)
        )

    def preprocess_file(self, path):
        """Placeholder that currently does nothing and returns ``None``."""
        pass

In [None]:
!git pull
!rm -rf runs/
from time import gmtime, strftime
from importlib import reload 
import run
reload(run)

dataset = BoolQ(word_dim=100, train_batch_size=32, dev_batch_size=32)

config = {
    'char_dim': 8,
    'char_channel_width': 5,
    'char_channel_size': 100,
    'context_threshold': 0,
    'dev_batch_size': 100,
    'dropout': 0.1,
    'epoch': 25,
    'exp_decay_rate': 0.999,
    'gpu': 0,
    'hidden_size': 100,
    'learning_rate': 0.5,
    'grad_clipping': 3,
    'weight_decay': 0,
    'print_freq': 250,
    'train_batch_size': 32,
    'word_dim': 100,
    'char_vocab_size': len(dataset.CHAR.vocab),
    'word_vocab_size': len(dataset.WORD.vocab),
    'model_time': strftime('%H:%M:%S', gmtime())
}

class Config:
    """Attribute-style view of a settings dictionary.

    Every key of the mapping passed to the constructor becomes an attribute of
    the instance holding the corresponding value.
    """

    def __init__(self, d):
        """Copy each ``key: value`` pair of ``d`` onto ``self`` via ``setattr``."""
        for name in d:
            setattr(self, name, d[name])

# Wrap the hyperparameter dict so its entries are readable as attributes.
args = Config(config)
# Hand the settings and the prepared dataset to the `train` function of the
# `run` module imported above.
run.train(args, dataset)

Already up to date.
loading splits...
building vocab...
building iterators...
cuda:0
epoch: 1
Количество ответов с меткой True 3270
train loss: 168.161 / dev loss: 68.923 /  dev accuracy: 0.622
epoch: 2
Количество ответов с меткой True 3270
train loss: 166.135 / dev loss: 68.770 /  dev accuracy: 0.622
epoch: 3
Количество ответов с меткой True 3270
train loss: 166.089 / dev loss: 68.670 /  dev accuracy: 0.622
epoch: 4
Количество ответов с меткой True 3270
train loss: 163.489 / dev loss: 68.286 /  dev accuracy: 0.622
epoch: 5
Количество ответов с меткой True 3270
train loss: 160.253 / dev loss: 67.914 /  dev accuracy: 0.622
epoch: 6
Количество ответов с меткой True 3270
train loss: 157.381 / dev loss: 67.420 /  dev accuracy: 0.622
Количество ответов с меткой True 3265
train loss: 155.485 / dev loss: 66.742 /  dev accuracy: 0.621
epoch: 7
Количество ответов с меткой True 3222
train loss: 150.048 / dev loss: 66.381 /  dev accuracy: 0.626
epoch: 8
Количество ответов с меткой True 3062
train