In [1]:

!pip install transformers

[0m

In [5]:

# coding=utf-8

"""BERT finetuning runner."""

from __future__ import absolute_import, division, print_function

import argparse
from collections import OrderedDict
import csv
import logging
import os
import random
import sys
import pandas as pd

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset) 
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from torch.nn import CrossEntropyLoss, MSELoss
import sys
# /content/GlossBERT-master/modeling.py
sys.path.append("../input/gloss-bert/") 
from optimization import BertAdam, warmup_linear
# from file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from modeling import BertForSequenceClassification, BertConfig
sys.path.append("../input/tokenisation") 
from tokenization import BertTokenizer


from transformers import BertForSequenceClassification
from transformers import AutoTokenizer

WEIGHTS_NAME = "pytorch_model.bin"
CONFIG_NAME = "config.json"
# PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',Path.home() / '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()
    
    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()
    
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines

class WSD_sent_Processor(DataProcessor):
    """Processor for the WSD data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        train_data = pd.read_csv(data_dir, sep="\t", na_filter=False).values
        return self._create_examples(train_data, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        dev_data = pd.read_csv(data_dir, sep="\t", na_filter=False).values
        return self._create_examples(dev_data, "dev")

    def get_labels(self):
        """See base class."""
        return ["0","1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = str(line[2])
            text_b = str(line[3])
            label = str(line[1])
            if i%1000==0:
                print(i)
                print("guid=",guid)
                print("text_a=",text_a)
                print("text_b=",text_b)
                print("label=",label)
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(tqdm(examples)):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if output_mode == "classification":
            label_id = label_map[example.label]
        elif output_mode == "regression":
            label_id = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [6]:
# !pip install transformers

In [12]:
!unzip /content/uncased_L-12_H-768_A-12.zip

Archive:  /content/uncased_L-12_H-768_A-12.zip

  End-of-central-directory signature not found.  Either this file is not

  a zipfile, or it constitutes one disk of a multi-part archive.  In the

  latter case the central directory and zipfile comment will be found on

  the last disk(s) of this archive.

unzip:  cannot find zipfile directory in one of /content/uncased_L-12_H-768_A-12.zip or

        /content/uncased_L-12_H-768_A-12.zip.zip, and cannot find /content/uncased_L-12_H-768_A-12.zip.ZIP, period.


In [7]:

device = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
n_gpu = torch.cuda.device_count()

In [8]:

local_rank=-1
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO if local_rank in [-1, 0] else logging.WARN)

In [9]:

logger.info("device: {} n_gpu: {}".format(
    device, n_gpu))

In [10]:


do_train=True 
gradient_accumulation_steps=1
if gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                        gradient_accumulation_steps))

In [12]:
do_eval=True
do_test=True
seed=42
task_name='WSD'
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
train_data_dir="../input/train-data/semcor_train_sent_cls.csv"
# eval_data_dir="./Evaluation_Datasets/semeval2007/semeval2007_test_sent_cls_ws.csv"
output_dir='./results'
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)
    assert train_data_dir != None,"train_data_dir can not be None"

In [13]:

if not do_train and not do_test:
    raise ValueError("At least one of `do_train` or `do_test` must be True.")
if do_train:
    assert train_data_dir != None, "train_data_dir can not be None"
if do_eval:
    assert eval_data_dir != None, "eval_data_dir can not be None"

In [14]:


if os.path.exists(output_dir) and os.listdir(output_dir) and do_train:
    raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
os.makedirs(output_dir, exist_ok=True)


In [15]:

processors = {
    "WSD":WSD_sent_Processor
}

output_modes = {
    "WSD": "classification"
}


In [16]:

processor = processors[task_name]()
output_mode = output_modes[task_name]
label_list = processor.get_labels()
num_labels = len(label_list)


In [17]:


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

100%|██████████| 231508/231508 [00:00<00:00, 2515915.13B/s]


In [54]:

# training set
train_examples = None
num_train_optimization_steps = None
num_train_epochs=3
train_batch_size=32
train_data_dir='../input/train-data/semcor_train_sent_cls.csv'
if do_train:
    train_examples = processor.get_train_examples(train_data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / train_batch_size /gradient_accumulation_steps) * num_train_epochs
    if local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

0
guid= train-0
text_a= How long has it been since you reviewed the objectives of your benefit and service program ?
text_b= desire strongly or persistently
label= 0
1000
guid= train-1000
text_a= Have you publicized the cents per hour value of the company 's share of insurance premiums ?
text_b= have, give, or receive a share of
label= 0
2000
guid= train-2000
text_a= Can staggered lunch periods relieve the capacity strain on your feeding facilities ?
text_b= take the midday meal
label= 0
3000
guid= train-3000
text_a= Do you insist that unneeded salary employees take their vacations during plant shutdowns ?
text_b= experience or feel or submit to
label= 0
4000
guid= train-4000
text_a= But even if that other plant employs the same number of workers and makes the same product , there are other facts to consider .
text_b= an actor situated in the audience whose acting is rehearsed but seems spontaneous to the audience
label= 0
5000
guid= train-5000
text_a= M + R Dietetic Laboratories , Inc

In [55]:
cache_dir='./cache'
cache_dir = cache_dir if cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(local_rank))
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
          cache_dir=cache_dir,
          num_labels=num_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [56]:

# if fp16:
#     model.half()
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [57]:

if local_rank != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [58]:

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

In [59]:


# if args.fp16:
#     try:
#         from apex.optimizers import FP16_Optimizer
#         from apex.optimizers import FusedAdam
#     except ImportError:
#         raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

#     optimizer = FusedAdam(optimizer_grouped_parameters,
#                           lr=args.learning_rate,
#                           bias_correction=False,
#                           max_grad_norm=1.0)
#     if args.loss_scale == 0:
#         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
#     else:
#         optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

# else:
learning_rate=5e-5
warmup_proportion=0.1
optimizer = BertAdam(optimizer_grouped_parameters,
                      lr=learning_rate,
                      warmup=warmup_proportion,
                      t_total=num_train_optimization_steps)

In [24]:

max_seq_length=128

In [60]:

len(train_examples)

2021762

In [65]:

max_seq_length=128 
train_features = convert_examples_to_features(
    train_examples[:10000], label_list, max_seq_length, tokenizer, "classification")

100%|██████████| 10000/10000 [00:05<00:00, 1971.02it/s]


In [66]:
train_examples=train_examples[:10000]
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

In [67]:

# if args.do_train:



if output_mode == "classification":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif output_mode == "regression":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if local_rank == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

In [68]:
# eval_batch_size=8
# # if args.do_eval:
# eval_examples = processor.get_dev_examples(eval_data_dir)
# eval_features = convert_examples_to_features(
#     eval_examples[:1000], label_list, max_seq_length, tokenizer, output_mode)
# logger.info("***** Running evaluation *****")
# logger.info("  Num examples = %d", len(eval_examples))
# logger.info("  Batch size = %d", eval_batch_size)
# all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
# all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
# all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

# if output_mode == "classification":
#     all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
# elif output_mode == "regression":
#     all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

# eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# eval_dataloader = DataLoader(eval_data, batch_size=eval_batch_size, shuffle=False)

In [None]:

global_step = 0
nb_tr_steps = 0
tr_loss = 0
num_train_epochs=3
# if args.do_train:
model.train()

epoch = 0
for _ in tqdm(trange(int(num_train_epochs), desc="Epoch")):
  epoch += 1
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  for step, batch in enumerate(train_dataloader):
      batch = tuple(t.to(device) for t in batch)
      input_ids, input_mask, segment_ids, label_ids = batch

      logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

      if output_mode == "classification":
          loss_fct = CrossEntropyLoss()
          loss = loss_fct(logits.logits.view(-1, num_labels), label_ids.view(-1))
      elif output_mode == "regression":
          loss_fct = MSELoss()
          loss = loss_fct(logits.view(-1), label_ids.view(-1))

      if n_gpu > 1:
          loss = loss.mean() # mean() to average on multi-gpu.
      if gradient_accumulation_steps > 1:
          loss = loss / gradient_accumulation_steps

      # if args.fp16:
      #     optimizer.backward(loss)
      # else:
      loss.backward()

      tr_loss += loss.item()
      nb_tr_examples += input_ids.size(0)
      nb_tr_steps += 1
      if (step + 1) % gradient_accumulation_steps == 0:
          
          # modify learning rate with special warm up BERT uses
          # if args.fp16 is False, BertAdam is used that handles this automatically
          lr_this_step = learning_rate * warmup_linear(global_step/num_train_optimization_steps, warmup_proportion)
          for param_group in optimizer.param_groups:
              param_group['lr'] = lr_this_step
      optimizer.step()
      optimizer.zero_grad()
      global_step += 1
      

  # Save a trained model, configuration and tokenizer
  model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

  # If we save using the predefined names, we can load using `from_pretrained`
  model_output_dir = os.path.join(output_dir, str(epoch))
  if not os.path.exists(model_output_dir):
      os.makedirs(model_output_dir)
  output_model_file = os.path.join(model_output_dir, WEIGHTS_NAME)
  output_config_file = os.path.join(model_output_dir, CONFIG_NAME)

  torch.save(model_to_save.state_dict(), output_model_file) 
  model_to_save.config.to_json_file(output_config_file)
  tokenizer.save_vocabulary(model_output_dir)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A
Epoch:  33%|███▎      | 1/3 [02:30<05:01, 150.94s/it]

In [47]:

# logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

In [37]:

class InputFeatures2(object):
    """A single set of features of data."""
    
    def __init__(self, input_ids, input_mask, segment_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids

In [38]:

def convert_to_features(candidate, tokenizer, max_seq_length=512):
    '''This function tokenises the data i.e converts the text into Model(bert) friendly tokens '''
    candidate_results = []
    features = []
    for item in candidate:
        text_a = item[0] # sentence
        text_b = item[1] # gloss
        candidate_results.append((item[-2], item[-1])) # (sense_key, gloss)

        #Tokenising with bert_tokeniser for better results
        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = tokenizer.tokenize(text_b)
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        # Start and end of sentence tokens
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        # Checking if all the ids and masks match up to same length of 512 
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        # Appending into one list 
        features.append(
            InputFeatures2(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids))


    return features, candidate_results


In [42]:



def construct_context_gloss_pairs(input, target_start_id, target_end_id, lemma):
    """
    construct context gloss pairs like sent_cls_ws
    :param input: str, a sentence
    :param target_start_id: int
    :param target_end_id: int
    :param lemma: lemma of the target word
    :return: candidate lists
    """
    # making the input into the list of words
    sent = input.split(" ")
    # Checking if the target index and the ending index is valid 
    assert 0 <= target_start_id and target_start_id < target_end_id  and target_end_id <= len(sent)
    # Finding the word( target) using the indices given
    target = " ".join(sent[target_start_id:target_end_id])
    # Tagging the target sentence with '''' to create weak reference gloss pairs which have performed much superior to traditional models
    if len(sent) > target_end_id:
        sent = sent[:target_start_id] + ['"'] + sent[target_start_id:target_end_id] + ['"'] + sent[target_end_id:]
    else:
        sent = sent[:target_start_id] + ['"'] + sent[target_start_id:target_end_id] + ['"']
    # joining all the elements in the list to create one string 
    sent = " ".join(sent)
    # target word
    lemma = lemma

    # using wordnet gloss pairs where the sense has been tagged 
    sense_data = pd.read_csv("../input/sense-gloss/index.sense.gloss",sep="\t",header=None).values
    # creating and finding all the key words in the wordnet data 
    d = dict()
    for i in range(len(sense_data)):
        s = sense_data[i][0]
        # array(["'hood%1:15:00::", 8641944, 1, 0, '(slang) a neighborhood'],dtype=object)

        pos = s.find("%")
        try:
            d[s[:pos + 2]].append((sense_data[i][0],sense_data[i][-1]))
        except:
            d[s[:pos + 2]]=[(sense_data[i][0], sense_data[i][-1])]

    # print(len(d))
    # print(len(d["happy%3"]))
    # print(d["happy%3"])
    # Finding all the senses of our input word
    candidate = []
    for category in ["%1", "%2", "%3", "%4", "%5"]:
        query = lemma + category
        try:
            sents = d[query]
            for sense_key, gloss in sents:
                candidate.append((sent, f"{target} : {gloss}", target, lemma, sense_key, gloss))
        except:
            pass
    # If we dont have any sense in the wordnet dictionary
    assert len(candidate) != 0, f'there is no candidate sense of "{lemma}" in WordNet, please check'
    print(f'there are {len(candidate)} candidate senses of "{lemma}"')


    return candidate


In [43]:


def infer(input, target_start_id, target_end_id, lemma ):


  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  label_list = ["0", "1"]
  num_labels = len(label_list)
  # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
  # tokenizer=BertTokenizer
  # model = model
  # model.to(device)


  print(f"input: {input}\nlemma: {lemma}")
  examples = construct_context_gloss_pairs(input, target_start_id, target_end_id, lemma)
  eval_features, candidate_results = convert_to_features(examples, tokenizer)
  input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
  input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
  segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)



  model.eval()
  input_ids = input_ids.to(device)
  input_mask = input_mask.to(device)
  segment_ids = segment_ids.to(device)
  with torch.no_grad():
      logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

  logits_= F.softmax(list(logits.to_tuple())[0].to(torch.float64), dim=-1)
  logits_ = logits_.detach().cpu().numpy()
  output = np.argmax(logits_, axis=0)[1]
  print(f"results:\nsense_key: {candidate_results[output][0]}\ngloss: {candidate_results[output][1]}")

In [44]:


input = "bill wanted money to pay"
target_start_id = 2
target_end_id = 3
lemma = "money"

infer(input,target_start_id,target_end_id,lemma)

input: bill wanted money to pay
lemma: money
there are 3 candidate senses of "money"
results:
sense_key: money%1:21:01::
gloss: the official currency issued by a government or national bank


In [45]:

input = "He wanted to play soccer"
target_start_id = 3
target_end_id = 4
lemma = "play"

infer(input,target_start_id,target_end_id,lemma)

input: He wanted to play soccer
lemma: play
there are 52 candidate senses of "play"
results:
sense_key: play%2:33:02::
gloss: contend against an opponent in a sport, game, or battle


In [47]:

input = "He wanted to play a movie"
target_start_id = 3
target_end_id = 4
lemma = "play"

infer(input,target_start_id,target_end_id,lemma)

input: He wanted to play a movie
lemma: play
there are 52 candidate senses of "play"
results:
sense_key: play%1:04:00::
gloss: gay or light-hearted recreational activity for diversion or amusement


In [48]:
input = "He wanted to play guitar"
target_start_id = 3
target_end_id = 4
lemma = "play"

infer(input,target_start_id,target_end_id,lemma)

input: He wanted to play guitar
lemma: play
there are 52 candidate senses of "play"
results:
sense_key: play%2:36:05::
gloss: cause to emit recorded audio or video


In [51]:
import torch
torch.save(model, './model_f')

In [52]:
saved_model = torch.load('./model_f')

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [49]:

# model.eval()
# eval_loss, eval_accuracy = 0, 0
# nb_eval_steps, nb_eval_examples = 0, 0

# with open(os.path.join(args.output_dir, "results_"+str(epoch)+".txt"),"w") as f:
#     for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
#         input_ids = input_ids.to(device)
#         input_mask = input_mask.to(device)
#         segment_ids = segment_ids.to(device)
#         label_ids = label_ids.to(device)

#         with torch.no_grad():
#             logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

#         logits_ = F.softmax(logits, dim=-1)
#         logits_ = logits_.detach().cpu().numpy()
#         label_ids_ = label_ids.to('cpu').numpy()
#         outputs = np.argmax(logits_, axis=1)
#         for output_i in range(len(outputs)):
#             f.write(str(outputs[output_i]))
#             for ou in logits_[output_i]:
#                 f.write(" " + str(ou))
#             f.write("\n")
#         tmp_eval_accuracy = np.sum(outputs == label_ids_)

#         # create eval loss and other metric required by the task
#         if output_mode == "classification":
#             loss_fct = CrossEntropyLoss()
#             tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
#         elif output_mode == "regression":
#             loss_fct = MSELoss()
#             tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
        
#         eval_loss += tmp_eval_loss.mean().item()
#         eval_accuracy += tmp_eval_accuracy
#         nb_eval_examples += input_ids.size(0)
#         nb_eval_steps += 1

# eval_loss = eval_loss / nb_eval_steps
# eval_accuracy = eval_accuracy / nb_eval_examples
# loss = tr_loss/nb_tr_steps if args.do_train else None

# result = OrderedDict()
# result['eval_loss'] = eval_loss
# result['eval_accuracy'] = eval_accuracy
# result['global_step'] = global_step
# result['loss'] = loss

# output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
# with open(output_eval_file, "a+") as writer:
#     writer.write("epoch=%s\n"%str(epoch))
#     logger.info("***** Eval results *****")
#     for key in result.keys():
#         logger.info("  %s = %s", key, str(result[key]))
#         writer.write("%s = %s\n" % (key, str(result[key])))

In [50]:

# !unzip /content/GlossBERT-master.zip