In [37]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
import copy
from typing import Dict, List, Tuple
import nltk
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (MODEL_WITH_LM_HEAD_MAPPING, WEIGHTS_NAME, AdamW, AutoConfig, AutoModelWithLMHead, AutoTokenizer, 
                          PreTrainedModel, PreTrainedTokenizer, get_linear_schedule_with_warmup, BertModel, DistilBertModel, AlbertModel)

# Prefer the TensorBoard writer that ships with PyTorch and fall back to the
# standalone tensorboardX package only when that import is unavailable.
# (The module name was previously misspelled, so the fallback was always taken.)
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

In [28]:
class Args:
    """Static run configuration for this notebook.

    Every setting is a plain class attribute; the rest of the notebook reads
    them through the module-level ``args`` instance created right below.
    """

    # Short model key: used for branching on the encoder family and as a
    # directory component of the input/output paths.
    model_type = "albert"
    # Identifier handed to the ``from_pretrained`` loaders.
    model_name_or_path = "albert-base-v2"
    # Forwarded as ``cache_dir`` to the ``from_pretrained`` loaders.
    cache_dir = None
    # Copied into ``rand_seed`` by the setup cell.
    run_id = 1
    # -1 selects the CPU; any other value is formatted into "cuda:<n>".
    device = -1
    # Number of texts encoded per forward pass.
    batch_size = 32

    # Pickle base names (no extension) of the train / dev splits.
    pickle_file_train = "gender_stereo_data_for_inlp_train"
    pickle_file_dev = "gender_stereo_data_for_inlp_dev"


args = Args()

In [29]:
def load_pickle(pickle_file):
    """Return the ``"text"`` field of every record stored in a pickled dataset.

    The file is read from ``./data_for_inlp/<args.model_type>/<pickle_file>.pickle``,
    where ``args`` is the notebook-level configuration object.

    Args:
        pickle_file: Base name of the pickle file, without the extension.

    Returns:
        A list with the ``"text"`` value of each record, in stored order.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on trusted files.
    """
    pickle_path = "./data_for_inlp/{}/{}.pickle".format(args.model_type, pickle_file)
    with open(pickle_path, "rb") as handle:
        records = pickle.load(handle)

    texts = []
    for record in records:
        texts.append(record["text"])
    return texts

In [30]:
# Resolve the torch device string: -1 selects the CPU, any other value is
# formatted as a CUDA device index.
device = "cpu" if args.device == -1 else "cuda:{}".format(args.device)

config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
# Use a real boolean; the previous string 'true' only worked because any
# non-empty string is truthy.
config.output_hidden_states = True

pretrained_weights = args.model_name_or_path

# Encoder class for each supported ``args.model_type``. 'albert' was missing
# before, so the default configuration left ``model`` undefined.
model_classes = {
    'bert': BertModel,
    'dbert': DistilBertModel,
    'albert': AlbertModel,
}
if args.model_type not in model_classes:
    raise ValueError(
        "Unsupported model_type {!r}; expected one of {}".format(args.model_type, sorted(model_classes))
    )
model = model_classes[args.model_type].from_pretrained(pretrained_weights, config=config, cache_dir=args.cache_dir)

tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)

block_size = tokenizer.model_max_length

#model=BertModel.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir)

# Inference only: move the weights to the target device and switch to eval mode.
model.to(device)
model.eval()

rand_seed = args.run_id

In [31]:
# exist_ok=True removes the check-then-create race and keeps the cell safe to re-run.
os.makedirs("encodings", exist_ok=True)

In [32]:
# Load both splits; each variable holds the list of "text" values returned by
# load_pickle (despite the ``data_file_`` prefix, these are not file handles).
data_file_train = load_pickle(pickle_file=args.pickle_file_train)
data_file_dev = load_pickle(pickle_file=args.pickle_file_dev)

  0%|          | 0/173 [02:08<?, ?it/s]


In [33]:
def _encode_batch(nlp_model, nlp_tokenizer, batch_texts, model_type, target_device):
    """Tokenize one batch of texts and return one fixed-size vector per text.

    Args:
        nlp_model: Encoder to call with the tokenized batch.
        nlp_tokenizer: Tokenizer exposing ``batch_encode_plus``.
        batch_texts: Texts of this batch.
        model_type: ``'bert'``, ``'albert'`` or ``'dbert'``.
        target_device: Device the input tensors are moved to.

    Returns:
        A 2-D numpy array with one row per text in ``batch_texts``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported keys.
    """
    batch_encoding = nlp_tokenizer.batch_encode_plus(batch_texts, padding=True, max_length=512, truncation=True)
    input_ids = torch.tensor(batch_encoding["input_ids"]).to(target_device)
    attention_mask = torch.tensor(batch_encoding["attention_mask"]).to(target_device)

    if model_type in ('bert', 'albert'):
        token_type_ids = torch.tensor(batch_encoding["token_type_ids"]).to(target_device)
        H = nlp_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask).pooler_output
    elif model_type == 'dbert':
        # This branch does not read pooler_output; the first-position vector of
        # the model's first output is used as the fixed-size sentence
        # representation instead.
        hiddens = nlp_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        H = hiddens[:, 0, :]
    else:
        raise ValueError("Unsupported model_type {!r}; expected 'bert', 'albert' or 'dbert'".format(model_type))

    assert len(H.shape) == 2
    return H.detach().cpu().numpy()


def encode(nlp_model, nlp_tokenizer, texts, args):
    """Encode every text into a fixed-size vector, batch by batch.

    Args:
        nlp_model: Encoder module; it is put in eval mode and run without
            gradient tracking.
        nlp_tokenizer: Tokenizer exposing ``batch_encode_plus``.
        texts: Sequence of input strings.
        args: Object providing ``batch_size`` and ``model_type``.

    Returns:
        A 2-D numpy array with exactly one row per element of ``texts``, in
        input order.

    Raises:
        ValueError: If ``texts`` is empty or ``args.model_type`` is unsupported.
    """
    if len(texts) == 0:
        raise ValueError("encode() needs at least one text")

    batch_size = args.batch_size

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level ``device`` global; fall back to the CPU for a
    # parameter-free model.
    first_param = next(nlp_model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    all_H = []
    nlp_model.eval()
    with torch.no_grad():

        print("Encoding...")
        with tqdm(total=len(texts), ascii=True) as pbar:
            # A single stepped range covers every full batch AND the trailing
            # partial one. The old ``len(texts)-batch_size`` bound skipped the
            # last full batch whenever len(texts) was an exact multiple of
            # batch_size.
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]
                all_H.append(_encode_batch(nlp_model, nlp_tokenizer, batch_texts, args.model_type, target_device))
                # Advance by the real batch length so the bar ends at 100%.
                pbar.update(len(batch_texts))

    H_np = np.concatenate(all_H)
    assert len(H_np.shape) == 2
    assert len(H_np) == len(texts)
    return H_np

In [34]:
# Encode the training split: one row per training text.
H_train = encode(nlp_model=model, nlp_tokenizer=tokenizer, texts=data_file_train, args=args)

Encoding...


100%|#########9| 8672/8682 [14:20<00:00, 10.08it/s]


In [35]:
# Encode the dev split: one row per dev text.
H_dev = encode(nlp_model=model, nlp_tokenizer=tokenizer, texts=data_file_dev, args=args)

Encoding...


 99%|#########9| 960/965 [01:27<00:00, 11.03it/s]


In [36]:
# Per-model output directory. makedirs also creates the parent "encodings"
# folder, and exist_ok=True removes the check-then-create race so the cell can
# be re-run safely.
os.makedirs("encodings/{}".format(args.model_type), exist_ok=True)

# Each split is stored as <pickle base name>.npy next to the other.
path_train = "encodings/{}/{}.npy".format(args.model_type, args.pickle_file_train)
np.save(path_train, H_train)

path_dev = "encodings/{}/{}.npy".format(args.model_type, args.pickle_file_dev)
np.save(path_dev, H_dev)