In [1]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
import copy
from typing import Dict, List, Tuple
import nltk
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (MODEL_WITH_LM_HEAD_MAPPING, WEIGHTS_NAME, AdamW, AutoConfig, AutoModelWithLMHead, AutoTokenizer, 
                          PreTrainedModel, PreTrainedTokenizer, get_linear_schedule_with_warmup, BertModel, DistilBertModel, AlbertModel)

try:
    # Prefer the TensorBoard writer that ships with PyTorch.
    # (The module is `tensorboard`; the previous spelling `tensorbord` could
    # never be imported, so this branch always failed.)
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    # Fall back to the standalone tensorboardX package when the PyTorch
    # integration (or its `tensorboard` dependency) is unavailable.
    from tensorboardX import SummaryWriter

In [5]:
class Args:
    """Static run configuration shared by the cells of this notebook."""

    # --- model selection -------------------------------------------------
    model_type = 'bert'                       # 'bert' or 'dbert' (the two values the notebook branches on)
    model_name_or_path = "bert-base-uncased"  # passed to the `from_pretrained` calls
    cache_dir = None                          # forwarded as `cache_dir` to `from_pretrained`

    # --- run settings ----------------------------------------------------
    run_id = 1        # copied into `rand_seed` by the setup cell
    device = -1       # -1 selects the CPU; any other value is used as a CUDA index
    batch_size = 32   # number of texts encoded per forward pass

    # --- input data (base names, without directory or ".pickle") ----------
    pickle_file_train = 'ethnicities_phys_data_for_inlp_train'
    pickle_file_dev = 'ethnicities_phys_data_for_inlp_dev'


args = Args()

In [6]:
def load_pickle(pickle_file):
    """Return the ``"text"`` field of every record stored in a pickle file.

    Args:
        pickle_file: Base name of the file (no directory, no extension). It is
            resolved to ``./data_for_inlp/<args.model_type>/<pickle_file>.pickle``
            using the notebook-level ``args`` object.

    Returns:
        A list holding ``record["text"]`` for each record, in stored order.
    """
    pickle_path = "./data_for_inlp/{}/{}.pickle".format(args.model_type, pickle_file)

    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(pickle_path, "rb") as handle:
        records = pickle.load(handle)

    return [record["text"] for record in records]

In [7]:
# Resolve the torch device string: -1 selects the CPU, any other value is a CUDA index.
device="cpu" if args.device==-1 else "cuda:{}".format(args.device)

config=AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
# Use a real boolean: the previous string 'true' only worked because any
# non-empty string is truthy (the string 'false' would have enabled it too).
config.output_hidden_states=True

pretrained_weights=args.model_name_or_path

# Instantiate the encoder that matches the configured model family.
if args.model_type=='bert':
    model=BertModel.from_pretrained(pretrained_weights, config=config, cache_dir=args.cache_dir)
elif args.model_type=='dbert':
    model=DistilBertModel.from_pretrained(pretrained_weights, config=config, cache_dir=args.cache_dir)
else:
    # Fail here with a clear message instead of a NameError on `model` further down.
    raise ValueError("Unsupported model_type {!r}; expected 'bert' or 'dbert'".format(args.model_type))

tokenizer=AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)

# Maximum sequence length reported by the tokenizer.
block_size=tokenizer.model_max_length

#model=BertModel.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir)

# Move the weights to the selected device and switch to inference mode.
model.to(device)
model.eval()

# NOTE(review): the seed is only stored here; no RNG is seeded with it in this cell.
rand_seed=args.run_id

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Make sure the output root exists. `exist_ok=True` replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs("encodings", exist_ok=True)

In [9]:
# Read the text lists for the training and development splits.
data_file_train = load_pickle(pickle_file=args.pickle_file_train)
data_file_dev = load_pickle(pickle_file=args.pickle_file_dev)

In [10]:
def _encode_batch(nlp_model, nlp_tokenizer, batch_texts, model_type, target_device, max_length=512):
    """Tokenize one batch of texts and return one vector per text as a 2-D tensor."""
    batch_encoding = nlp_tokenizer.batch_encode_plus(
        batch_texts, padding=True, max_length=max_length, truncation=True
    )
    input_ids = torch.tensor(batch_encoding["input_ids"]).to(target_device)
    attention_mask = torch.tensor(batch_encoding["attention_mask"]).to(target_device)

    if model_type == 'bert':
        token_type_ids = torch.tensor(batch_encoding["token_type_ids"]).to(target_device)
        H = nlp_model(
            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
        ).pooler_output
    elif model_type == 'dbert':
        # For DistilBERT the hidden state at sequence position 0 is used in place of
        # BERT's pooler_output as the fixed-size representation of the whole input.
        hiddens = nlp_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        H = hiddens[:, 0, :]
    else:
        raise ValueError("Unsupported model_type {!r}; expected 'bert' or 'dbert'".format(model_type))

    assert len(H.shape) == 2
    return H


def encode(nlp_model, nlp_tokenizer, texts, args, max_length=512):
    """Encode every text into a fixed-size vector, processing `args.batch_size` texts at a time.

    Args:
        nlp_model: Encoder called with `input_ids` / `attention_mask` (and
            `token_type_ids` for 'bert'). It is switched to eval mode.
        nlp_tokenizer: Tokenizer providing `batch_encode_plus`.
        texts: Sequence of input strings.
        args: Object exposing `batch_size` (positive int) and `model_type`
            ('bert' or 'dbert').
        max_length: Truncation length handed to the tokenizer (default 512).

    Returns:
        A 2-D NumPy array with exactly one row per entry of `texts`, in input order.
        For empty input, an array with zero rows is returned.

    Raises:
        ValueError: If `args.batch_size` is not positive or `args.model_type`
            is not supported.
    """
    batch_size = args.batch_size
    if batch_size <= 0:
        raise ValueError("args.batch_size must be a positive integer, got {}".format(batch_size))

    # Put the inputs on the device the model actually lives on, rather than
    # depending on a notebook-level `device` variable being in sync with it.
    try:
        target_device = next(nlp_model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = torch.device("cpu")

    nlp_model.eval()
    print("Encoding...")

    if len(texts) == 0:
        # np.concatenate rejects an empty list; return an explicit zero-row array.
        hidden_size = getattr(getattr(nlp_model, "config", None), "hidden_size", 0)
        return np.zeros((0, hidden_size), dtype=np.float32)

    all_H = []
    # The context manager closes the bar, so it always finishes at 100%.
    with torch.no_grad(), tqdm(total=len(texts), ascii=True) as pbar:
        # Stepping over the full range covers every batch, including a shorter
        # final one. The previous bound `len(texts)-batch_size` skipped the last
        # full batch whenever len(texts) was an exact multiple of batch_size.
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            H = _encode_batch(
                nlp_model, nlp_tokenizer, batch_texts, args.model_type, target_device, max_length
            )
            all_H.append(H.detach().cpu().numpy())
            pbar.update(len(batch_texts))

    H_np = np.concatenate(all_H)
    assert len(H_np.shape) == 2
    assert len(H_np) == len(texts)
    return H_np

In [11]:
# Encode the training texts: one row per text.
H_train = encode(nlp_model=model, nlp_tokenizer=tokenizer, texts=data_file_train, args=args)

Encoding...


 94%|#########4| 64/68 [00:09<00:00,  6.44it/s]


In [12]:
# Encode the development texts: one row per text.
H_dev = encode(nlp_model=model, nlp_tokenizer=tokenizer, texts=data_file_dev, args=args)

Encoding...


  0%|          | 0/8 [00:00<?, ?it/s]


In [13]:
# Show the training encodings as the cell output (NumPy abbreviates large arrays).
H_train

array([[-0.80399084, -0.60692686, -0.89353573, ..., -0.656607  ,
        -0.6405367 ,  0.63855124],
       [-0.8549681 , -0.30960402, -0.16254   , ..., -0.37514246,
        -0.5749633 ,  0.8057628 ],
       [-0.8629364 , -0.39871433, -0.9563958 , ..., -0.9303482 ,
        -0.70047444,  0.735096  ],
       ...,
       [-0.7412005 , -0.31360584, -0.77346873, ..., -0.62608105,
        -0.5615992 ,  0.76570374],
       [-0.77566355, -0.40765655, -0.8068566 , ..., -0.13706821,
        -0.6080266 ,  0.5931715 ],
       [-0.8787884 , -0.5682052 , -0.9553962 , ..., -0.73718697,
        -0.6067792 ,  0.6446022 ]], dtype=float32)

In [98]:
# Create the per-model output directory (and any missing parents) in a single
# call; `exist_ok=True` removes the gap between checking and creating.
os.makedirs("encodings/{}".format(args.model_type), exist_ok=True)

# Store each split's encodings under the same base name as its source pickle.
path_train="encodings/{}/{}.npy".format(args.model_type, args.pickle_file_train)
np.save(path_train, H_train)

path_dev="encodings/{}/{}.npy".format(args.model_type, args.pickle_file_dev)
np.save(path_dev, H_dev)