In [27]:
from settings import model_classes

import pytorch_transformers
from transformers import BertTokenizer # Use the new BertTokenizer for batch_encode_plus!

from torch.utils.data import DataLoader
import os
import torchtext
from utils import dynamic_collate_fn, prepare_inputs, read_relations, read_rel_data, \
                    get_relation_embedding, prepare_rel_datasets, rel_encode, replicate_rel_data, \
                    get_relation_index, create_relation_clusters

In [2]:
# Show the installed pytorch_transformers version (bare expression so the notebook renders it).
pytorch_transformers.__version__

'1.2.0'

In [28]:
# Both tokenizers are loaded from the same pretrained checkpoint.
PRETRAINED_NAME = "bert-base-uncased"

# The project registry (settings.model_classes) supplies the config/model/tokenizer
# classes for BERT.
config_class, model_class, tokenizer_class = model_classes["bert"]

# `tokenizer`  -> instance of the registry's tokenizer class
# `tokenizer2` -> instance of transformers.BertTokenizer, used later for batch_encode_plus
tokenizer, tokenizer2 = (
    cls.from_pretrained(PRETRAINED_NAME) for cls in (tokenizer_class, BertTokenizer)
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Configuration
# Dataset root. Defaults to the original location; set the LIFELONG_FEWREL_DIR
# environment variable to run the notebook on a machine where the data lives elsewhere.
data_dir = os.environ.get('LIFELONG_FEWREL_DIR', '/data/omler_data/LifelongFewRel')
# Number of relation clusters. Clustering and dataset preparation must agree on this
# value, so it is defined once instead of being repeated as a literal.
num_clusters = 10

# Load Dataset
relation_file = os.path.join(data_dir, 'relation_name.txt')
training_file = os.path.join(data_dir, 'training_data.txt')
validation_file = os.path.join(data_dir, 'val_data.txt')
relation_names = read_relations(relation_file)  # List of relation names (converted to 1-based index later)
train_data = read_rel_data(training_file)
val_data = read_rel_data(validation_file)
print('Finished loading the dataset')

# Load GloVe vectors (300-dimensional, '6B' release)
print('Loading GloVe vectors')
glove = torchtext.vocab.GloVe(name='6B', dim=300)
print('Finished loading GloVe vectors')

# Get relation embeddings for clustering
relation_embeddings = get_relation_embedding(relation_names, glove)
print(relation_embeddings.shape)

# Generate clusters
# Author's note: get_relation_index goes through all of train_data and collects the label
# set, i.e. a list of the labels 1-80 such as [80, 25, 75, 15, 62, 74, 5, 10, ...].
relation_index = get_relation_index(train_data)
# Author's note: create_relation_clusters uses KMeans to divide the labels into disjoint
# clusters, e.g. {80: 1, 25: 5, 75: 3, 15: 1, 62: 1, 74: 1, 5: 1, 10: 2, ...}.
# The returned relation_embeddings is a dictionary mapping relation index -> GloVe
# embedding, e.g. {80: embedding, 25: embedding, ...}.
# NOTE(review): this rebinds `relation_embeddings` from the value printed above to that
# dictionary; later cells see the dictionary.
cluster_labels, relation_embeddings = create_relation_clusters(num_clusters, relation_embeddings, relation_index)
train_datasets, shuffle_index = prepare_rel_datasets(train_data, relation_names, cluster_labels, num_clusters)

Finished loading the dataset
Loading GloVe vectors
Finished loading GloVe vectors
torch.Size([81, 300])




In [16]:
# Take the first of the prepared training datasets and wrap it in a small,
# unshuffled loader; rel_encode is the project's collate function for these samples.
train_dataset = train_datasets[0]
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=1,
    collate_fn=rel_encode,
)

In [10]:
# Peek at one sample. In the recorded output it is a 3-tuple of word lists:
# (sentence tokens, relation tokens, list of candidate relations as token lists).
train_dataset[0]

(['in',
  '2004',
  'the',
  'catalan',
  'government',
  'gave',
  'him',
  'the',
  'george',
  'cross',
  '.'],
 ['applies', 'to', 'jurisdiction'],
 [['military', 'branch']])

In [47]:
# Inspect how the first batch is expanded into (sentence, relation) pairs and how
# the transformers tokenizer encodes those pairs.
for step, batch in enumerate(train_dataloader):
    text, labels, candidates = batch
    replicated_text, replicated_relations, ranking_label = replicate_rel_data(text, labels, candidates)
    print("replicated_text", replicated_text)
    print("replicated_relations", replicated_relations)
    print("ranking_label", ranking_label)

    # The sentences are lists of words, not raw strings (the relations presumably are
    # too -- confirm against replicate_rel_data). Without is_split_into_words=True the
    # slow BertTokenizer treats every word as a finished vocabulary token, so any word
    # missing from the vocabulary becomes [UNK] instead of being split into WordPieces.
    output = tokenizer2.batch_encode_plus(
        list(zip(replicated_text, replicated_relations)),
        is_split_into_words=True,
        return_token_type_ids=False,
        padding='longest',
        return_tensors='pt',
    )
    print(output)
    print(len(output['input_ids']))     # number of encoded pairs
    print(len(output['input_ids'][0]))  # padded sequence length
    # Decode the first encoded pair back to tokens to eyeball the result.
    print(tokenizer2.convert_ids_to_tokens(output['input_ids'][0].tolist()))

    # Only the first batch is needed for this inspection.
    break

replicated_text [['in', '2004', 'the', 'catalan', 'government', 'gave', 'him', 'the', 'george', 'cross', '.'], ['in', '2004', 'the', 'catalan', 'government', 'gave', 'him', 'the', 'george', 'cross', '.'], ['"', 'a', 'twentieth', 'century', 'history', 'of', 'berrien', 'county', 'michigan', '"', ',', 'p.', '262', '.'], ['"', 'a', 'twentieth', 'century', 'history', 'of', 'berrien', 'county', 'michigan', '"', ',', 'p.', '262', '.'], ['this', 'bridge', 'is', 'located', 'in', 'the', 'himalayan', 'mountains', 'between', 'the', 'dras', 'river', 'and', 'suru', 'river', 'in', 'the', 'ladakh', 'valley', 'in', 'the', 'indian', 'state', 'of', 'jammu', 'and', 'kashmir', '.'], ['this', 'bridge', 'is', 'located', 'in', 'the', 'himalayan', 'mountains', 'between', 'the', 'dras', 'river', 'and', 'suru', 'river', 'in', 'the', 'ladakh', 'valley', 'in', 'the', 'indian', 'state', 'of', 'jammu', 'and', 'kashmir', '.'], ['1997', ':', 'wales', 'voted', 'in', 'favour', 'of', 'a', 'welsh', 'assembly', 'in', 'a', 

In [5]:
# Sanity-check the registry tokenizer on a single word; the recorded output is a
# one-element id list, i.e. no special tokens were added by this call.
tokenizer.encode("test")

[3231]

In [7]:
# Demonstrate that the registry tokenizer has no batch_encode_plus (this is why
# transformers.BertTokenizer is loaded separately). The error is caught and printed
# so the cell documents the limitation without aborting a full "Run All".
try:
    # batch_encode_plus takes a batch (list) of texts, not a bare string.
    tokenizer.batch_encode_plus(["test"])
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'BertTokenizer' object has no attribute 'batch_encode_plus'

In [15]:
import numpy as np

In [17]:
# Scratch check: dtype=object keeps the strings as Python objects rather than a
# fixed-width unicode array.
np.array(["Test", "aspok"], dtype=object)

array(['Test', 'aspok'], dtype=object)