In [1]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install tensorboardX

Note: you may need to restart the kernel to use updated packages.


In [15]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
import copy
from typing import Dict, List, Tuple
import nltk
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (MODEL_WITH_LM_HEAD_MAPPING, WEIGHTS_NAME, AdamW, AutoConfig, AutoModelWithLMHead, AutoTokenizer, 
                          PreTrainedModel, PreTrainedTokenizer, get_linear_schedule_with_warmup, BertModel)

# Prefer the TensorBoard writer that ships with PyTorch and fall back to the
# standalone tensorboardX package only when that import is unavailable.
# (The module name was previously misspelled as "tensorbord", so the first
# branch could never succeed and tensorboardX was always required.)
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

In [16]:
class Args:
    """Static settings for the encoding run, exposed as plain class attributes."""

    # Which pretrained model to load and where to cache its files.
    model_type = "bert"
    model_name_or_path = "bert-base-uncased"
    cache_dir = None

    # Run settings.
    run_id = 1
    device = -1  # -1 is mapped to "cpu" by the setup cell; other values to "cuda:<n>"
    batch_size = 32

    # Pickled input data for the two splits.
    pickle_file_train = "gender_data_for_inlp_train.pickle"
    pickle_file_dev = "gender_data_for_inlp_dev.pickle"


args = Args()

In [17]:
def load_pickle(pickle_file):
    """Read a pickled collection of records and return their "text" entries.

    Args:
        pickle_file: Path of the pickle file; it is opened in binary mode.

    Returns:
        A list holding the ``"text"`` value of every unpickled record, in
        the order the records were stored.

    Note:
        ``pickle.load`` can execute arbitrary code while unpickling, so this
        should only be pointed at trusted files.
    """
    with open(pickle_file, "rb") as handle:
        records = pickle.load(handle)
    return [record["text"] for record in records]

In [18]:
# Resolve the torch device string: -1 means CPU, any other value is a CUDA index.
device = "cpu" if args.device == -1 else "cuda:{}".format(args.device)

config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
# Use a real boolean. The previous value was the string 'true', which only
# enabled the flag because any non-empty string is truthy ('false' would have
# enabled it too).
config.output_hidden_states = True

tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)

block_size = tokenizer.model_max_length

model = BertModel.from_pretrained(
    args.model_name_or_path,
    from_tf=bool(".ckpt" in args.model_name_or_path),
    config=config,
    cache_dir=args.cache_dir,
)

# Inference only: move the model to the chosen device and switch to eval mode.
model.to(device)
model.eval()

rand_seed = args.run_id

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Create the output directory; exist_ok makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists test.
os.makedirs("encodings", exist_ok=True)

In [20]:
# Read the raw texts of the train and dev splits from their pickle files.
data_file_train = load_pickle(args.pickle_file_train)
data_file_dev = load_pickle(args.pickle_file_dev)

In [21]:
def encode(nlp_model, nlp_tokenizer, texts, args):
    """Encode texts into pooled sentence vectors, one batch at a time.

    Args:
        nlp_model: Model that is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword tensors and whose output exposes a
            2-D ``pooler_output`` tensor.
        nlp_tokenizer: Tokenizer providing ``batch_encode_plus``.
        texts: Sequence of strings to encode.
        args: Object whose ``batch_size`` attribute sets the batch size.

    Returns:
        A 2-D numpy array with one row per input text, in input order.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    num_texts = len(texts)
    if num_texts == 0:
        # Fail with a clear message instead of np.concatenate's opaque
        # "need at least one array to concatenate".
        raise ValueError("encode() needs at least one text to encode")

    batch_size = args.batch_size

    # Place inputs on the device the model lives on instead of relying on a
    # notebook-level global. Fall back to the first parameter's device, then
    # to the CPU, for objects that expose neither.
    model_device = getattr(nlp_model, "device", None)
    if model_device is None:
        parameters = getattr(nlp_model, "parameters", None)
        first_param = next(iter(parameters()), None) if callable(parameters) else None
        model_device = first_param.device if first_param is not None else "cpu"

    all_H = []
    nlp_model.eval()
    print("Encoding...")
    # The context manager closes the bar, so repeated calls do not leave
    # stale, nested progress bars behind.
    with torch.no_grad(), tqdm(total=num_texts, ascii=True) as pbar:
        # Stepping up to len(texts) (rather than len(texts) - batch_size)
        # handles the final batch, full or partial, in the same loop. The
        # previous bound dropped the last full batch whenever len(texts) was
        # an exact multiple of batch_size.
        for start in range(0, num_texts, batch_size):
            batch_texts = texts[start:start + batch_size]

            batch_encoding = nlp_tokenizer.batch_encode_plus(
                batch_texts, padding=True, max_length=512, truncation=True
            )
            input_ids = torch.tensor(batch_encoding["input_ids"], device=model_device)
            token_type_ids = torch.tensor(batch_encoding["token_type_ids"], device=model_device)
            attention_mask = torch.tensor(batch_encoding["attention_mask"], device=model_device)

            H = nlp_model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
            ).pooler_output

            assert len(H.shape) == 2
            all_H.append(H.detach().cpu().numpy())

            # Advance by the real batch length so the bar ends at 100%.
            pbar.update(len(batch_texts))

    H_np = np.concatenate(all_H)
    assert len(H_np.shape) == 2
    assert len(H_np) == num_texts
    return H_np

In [22]:
# Encode every training text into its pooled model output (one row per text).
H_train = encode(model, tokenizer, data_file_train, args)

Encoding...





  0%|          | 0/8977 [00:00<?, ?it/s][A[A[A

  3%|2         | 256/8977 [00:28<07:16, 19.99it/s][A[A


  0%|          | 32/8977 [00:02<12:30, 11.92it/s][A[A[A


  1%|          | 64/8977 [00:04<09:07, 16.29it/s][A[A[A


  1%|1         | 96/8977 [00:05<07:32, 19.63it/s][A[A[A


  1%|1         | 128/8977 [00:07<07:45, 19.01it/s][A[A[A


  2%|1         | 160/8977 [00:08<07:52, 18.65it/s][A[A[A


  2%|2         | 192/8977 [00:10<07:08, 20.51it/s][A[A[A


  2%|2         | 224/8977 [00:11<07:22, 19.78it/s][A[A[A


  3%|2         | 256/8977 [00:13<06:49, 21.30it/s][A[A[A


  3%|3         | 288/8977 [00:14<06:22, 22.69it/s][A[A[A


  4%|3         | 320/8977 [00:16<07:46, 18.54it/s][A[A[A


  4%|3         | 352/8977 [00:18<07:37, 18.86it/s][A[A[A


  4%|4         | 384/8977 [00:20<07:36, 18.81it/s][A[A[A


  5%|4         | 416/8977 [00:21<07:31, 18.95it/s][A[A[A


  5%|4         | 448/8977 [00:24<09:06, 15.60it/s][A[A[A


  5%|5         | 480/8

In [23]:
# Encode every dev text into its pooled model output (one row per text).
H_dev = encode(model, tokenizer, data_file_dev, args)

Encoding...





  0%|          | 0/997 [00:00<?, ?it/s][A[A[A


  3%|3         | 32/997 [00:01<00:50, 19.17it/s][A[A[A


  6%|6         | 64/997 [00:03<00:53, 17.59it/s][A[A[A


 10%|9         | 96/997 [00:05<00:47, 18.92it/s][A[A[A


 13%|#2        | 128/997 [00:07<00:48, 18.05it/s][A[A[A


 16%|#6        | 160/997 [00:09<00:49, 17.00it/s][A[A[A


 19%|#9        | 192/997 [00:10<00:45, 17.72it/s][A[A[A


 22%|##2       | 224/997 [00:13<00:54, 14.21it/s][A[A[A


 26%|##5       | 256/997 [00:17<00:58, 12.58it/s][A[A[A


 29%|##8       | 288/997 [00:19<00:57, 12.23it/s][A[A[A


 32%|###2      | 320/997 [00:21<00:49, 13.65it/s][A[A[A


 35%|###5      | 352/997 [00:23<00:46, 13.98it/s][A[A[A


 39%|###8      | 384/997 [00:25<00:38, 15.80it/s][A[A[A


 42%|####1     | 416/997 [00:26<00:32, 17.83it/s][A[A[A


 45%|####4     | 448/997 [00:28<00:31, 17.52it/s][A[A[A


 48%|####8     | 480/997 [00:29<00:28, 18.44it/s][A[A[A


 51%|#####1    | 512/997 [00:31<00

In [25]:
# Output files live under encodings/ and are named after the model type and
# the pickle file the texts came from.
path_train = "encodings/{}_{}.npy".format(args.model_type, args.pickle_file_train)
path_dev = "encodings/{}_{}.npy".format(args.model_type, args.pickle_file_dev)

# Persist both encoding matrices in NumPy's .npy format.
np.save(path_train, H_train)
np.save(path_dev, H_dev)

