In [1]:
import os
import pickle
import time
from scipy import stats
import numpy as np

import torch
torch.cuda.set_device(0) # won't use cuda:0 to initialize
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

from transformers import (AlbertConfig, AlbertTokenizer, AlbertPreTrainedModel, AdamW, AlbertModel,
                          get_linear_schedule_with_warmup)

In [2]:
# Load the base ALBERT tokenizer and encoder used for the smoke test below.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
# Wrapping the encoded id list in an outer list builds the (1, seq_len) batch directly.
input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)])  # Batch size 1
input_ids

tensor([[    2, 10975,    15,    51,  1952,    25, 10901,     3]])

In [2]:
# Rebinds the notebook-global `model` to the xlarge checkpoint (downloads it on first use).
# NOTE(review): any later cell that calls `model(...)` sees whichever checkpoint was loaded
# last in the kernel, so results depend on cell execution order.
model = AlbertModel.from_pretrained('albert-xlarge-v2')

HBox(children=(IntProgress(value=0, description='Downloading', max=535, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=236197176, style=ProgressStyle(description_…




In [3]:
# Run the encoder on the single example built above.
outputs = model(input_ids)
pooled_output = outputs[1]  # Index 1 is the pooled output (index 0 would be the last hidden state)
pooled_output.size()

torch.Size([1, 768])

In [3]:
# Show the ids of the [CLS], [SEP] and padding tokens; the padding helpers below assume pad id 0.
tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id

(2, 3, 0)

In [4]:
class AlBertForClassification(AlbertPreTrainedModel):
    """ALBERT encoder followed by dropout and a linear layer producing raw logits.

    The encoder weights always come from the 'albert-base-v2' checkpoint,
    regardless of `config`; `config` only supplies the dropout probability and
    the shape of the linear head (`hidden_size` -> `num_labels`).
    NOTE(review): `config.hidden_size` therefore has to match the checkpoint's
    hidden size, otherwise the head cannot consume the pooled output.
    """

    def __init__(self, config):
        super().__init__(config)
        # Pretrained encoder (not a freshly initialised AlbertModel(config)).
        self.albert = AlbertModel.from_pretrained('albert-base-v2')
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # init_weights() is deliberately not called here; the original author
        # noted that it changes the initial weights.

    def forward(self, input_ids=None, attention_mask=None):
        """Return logits of shape (batch_size, num_labels) for the given token ids."""
        encoder_out = self.albert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the pooled representation: (bs, hidden_size).
        pooled = self.dropout(encoder_out[1])
        return self.classifier(pooled)

In [5]:
# Head/config sizes for the classifier. hidden_size is 768 to match the
# 'albert-base-v2' encoder that AlBertForClassification loads internally;
# num_labels=133 is the width of the multi-hot label vectors.
configuration = AlbertConfig(hidden_size=768,
    num_attention_heads=12,
    intermediate_size=3072,
    num_labels=133)
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Rebinds the notebook-global `model` to the classification wrapper.
model = AlBertForClassification(configuration)
# .to() returns the module, so the cell output is the module's repr.
model.to(device)

AlBertForClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=Tr

In [4]:
# Inspect a single parameter: skip the first 9 named parameters, print the 10th, then stop.
params = list(model.named_parameters())
for p in params[9:]:
    # p is a (name, tensor) pair; p[1][0, :10] is the first 10 values of row 0,
    # so this only works when that parameter has at least two dimensions.
    print(p[0], p[1].size(), p[1].requires_grad, p[1][0, :10])
    break

encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight torch.Size([768, 768]) True tensor([ 0.0042,  0.0179, -0.0147,  0.0340,  0.0254, -0.0347,  0.0232, -0.0201,
        -0.0471, -0.0234], grad_fn=<SliceBackward>)


In [5]:
# List every named parameter with its shape and whether it will receive gradients.
# Relies on `params` from the previous cell.
for p in params[:]:
    print(p[0], p[1].size(), p[1].requires_grad)

embeddings.word_embeddings.weight torch.Size([30000, 128]) True
embeddings.position_embeddings.weight torch.Size([512, 128]) True
embeddings.token_type_embeddings.weight torch.Size([2, 128]) True
embeddings.LayerNorm.weight torch.Size([128]) True
embeddings.LayerNorm.bias torch.Size([128]) True
encoder.embedding_hidden_mapping_in.weight torch.Size([768, 128]) True
encoder.embedding_hidden_mapping_in.bias torch.Size([768]) True
encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight torch.Size([768]) True
encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias torch.Size([768]) True
encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight torch.Size([768, 768]) True
encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias torch.Size([768]) True
encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight torch.Size([768, 768]) True
encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias torch.Size([768]) True
encod

In [7]:
def train_epoch(model, device, epoch, train_dataloader, validation_dataloader,
                criterion, optimizer, scheduler, clip=5., log_every=10,
                max_train_steps=None, max_eval_steps=None):
    """Run one training pass over `train_dataloader`, then one validation pass.

    Each batch is indexed as (input_ids, attention_mask, labels). The model is
    called as `model(input_ids, attention_mask=mask)` and its output is passed
    to `criterion` together with the labels.

    Args:
        model: module to train; switched to train mode, then to eval mode.
        device: device every batch tensor is moved to.
        epoch: epoch number, used only in the progress message.
        train_dataloader: iterable of training batches.
        validation_dataloader: iterable of validation batches.
        criterion: callable `(outputs, labels) -> loss tensor`.
        optimizer: optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per training batch.
        clip: max gradient norm passed to `clip_grad_norm_`.
        log_every: print a progress line every this many training steps
            (falsy disables logging).
        max_train_steps: optional cap on training batches, for smoke tests.
            `None` (default) consumes the whole loader.
        max_eval_steps: optional cap on validation batches. `None` (default)
            consumes the whole loader.

    Returns:
        (model, optimizer, train_loss, eval_loss) where both losses are the
        mean per-batch loss over the batches actually processed (0.0 when no
        batch was processed).
    """
    # Local import keeps this cell self-contained; islice(it, None) yields everything.
    from itertools import islice

    model.train()
    running_loss = 0.0
    train_steps = 0
    t0 = time.time()
    for step, batch in enumerate(islice(train_dataloader, max_train_steps), 1):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        loss = criterion(outputs, b_labels)
        running_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
        train_steps = step

        if log_every and step % log_every == 0:
            # Report the running mean loss and the time since the last report.
            log_str = "Epoch : {} , Iteration : {} , Time : {:.2f} , TrainLoss : {:.9f}".format \
                        (epoch, step, time.time()-t0, running_loss/step)
            print(log_str)
            t0 = time.time()

    # Average once, after the loop, over the batches that were really seen.
    train_loss = running_loss / train_steps if train_steps else 0.0

    model.eval()
    eval_total = 0.0
    eval_steps = 0
    with torch.no_grad():
        for eval_steps, batch in enumerate(islice(validation_dataloader, max_eval_steps), 1):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            loss = criterion(outputs, b_labels)
            eval_total += loss.item()
    eval_loss = eval_total / eval_steps if eval_steps else 0.0

    return model, optimizer, train_loss, eval_loss

In [None]:
# Training driver. NOTE(review): `train_dataloader` / `validation_dataloader` are
# created in the DataLoader cell further down the notebook, so that cell has to
# be run before this one.
n_epochs = 1

# Targets are float multi-hot vectors, so use binary cross-entropy on raw logits.
criterion = nn.BCEWithLogitsLoss()

# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix'
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * n_epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


# Max gradient norm handed to train_epoch (overrides its default).
clip = 2.0
for epoch in range(1, 1+n_epochs):
    # train_loss / eval_loss are rebound every epoch and are not printed here.
    model, optimizer, train_loss, eval_loss = train_epoch(model, device, epoch, 
                                                          train_dataloader, validation_dataloader, 
                                                          criterion, optimizer, scheduler, clip=clip)

# albert-xlarge-v2

In [3]:
## Disabled: the albert-xlarge-v2 variant below ran out of CUDA memory, so it is kept commented out.
# class AlBertForClassification(AlbertPreTrainedModel):
#     def __init__(self, config):
#         super().__init__(config)
#         # self.albert = AlbertModel(config)
#         self.albert = AlbertModel.from_pretrained('albert-xlarge-v2')
#         # self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
#         self.dropout = nn.Dropout(config.classifier_dropout_prob)
#         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
#         # self.init_weights() # change initial weights

#     def forward(self, input_ids=None, attention_mask=None):
#         outputs = self.albert(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = outputs[1] # (bs, hidden_size)
#         # pooled_output = F.relu(self.pre_classifier(pooled_output))  # (bs, hidden_size)
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)
#         return logits

In [4]:
# configuration = AlbertConfig(hidden_size=2048,
#     num_attention_heads=16,
#     num_hidden_layers=24, 
#     intermediate_size=8192,
#     num_labels=133)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = AlBertForClassification(configuration)
# model.to(device)

AlBertForClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=2048, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=2048, out_features=2048, bias=True)
                (key): Linear(in_features=2048, out_features=2048, bias=True)
                (value): Linear(in_features=2048, out_features=2048,

# prepare data

In [5]:
# Load the raw training records from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
train_path = './model_data.pkl'
with open(train_path, 'rb') as f:
    traindata = pickle.load(f)
# Number of records loaded.
len(traindata)

362560

In [7]:
def prepare_data(tempdata, phrase_rule_133, max_sent_num=6):
    """Turn raw records into parallel lists of ids, input texts and multi-hot labels.

    Args:
        tempdata: dict mapping a record id (`lni`) to a dict with
            'phrases' (iterable of phrase keys) and 'pred_sents'
            (list of 2-item entries whose second item is the sentence text).
        phrase_rule_133: dict mapping a phrase key to its class index;
            its length is the number of classes.
        max_sent_num: maximum number of sentences kept per record. Longer
            records keep the first ceil(max/2) and the last floor(max/2)
            sentences. For even values this matches the previous behaviour;
            odd values now keep exactly `max_sent_num` sentences, and 0 or 1
            no longer falls through to keeping every sentence.

    Returns:
        (lnis, inputs, labels): record ids, space-joined sentence text, and
        multi-hot label lists, all of equal length. Records with no known
        phrase or with empty text are skipped.

    Raises:
        ValueError: if `max_sent_num` is negative.
    """
    if max_sent_num < 0:
        raise ValueError('max_sent_num must be >= 0, got {}'.format(max_sent_num))

    tail_n = max_sent_num // 2
    head_n = max_sent_num - tail_n
    num_classes = len(phrase_rule_133)

    lnis = []
    labels = []
    inputs = []
    for lni, items in tempdata.items():
        # Distinct class indices of the phrases this record carries.
        class_ids = {phrase_rule_133[p] for p in items['phrases'] if p in phrase_rule_133}
        if not class_ids:
            continue

        sents = items['pred_sents']
        n_sents = len(sents)
        if n_sents > max_sent_num:
            # Slice the tail from an explicit start index: sents[-0:] would be the whole list.
            sents = sents[:head_n] + sents[n_sents - tail_n:]
        sent = ' '.join(s for _, s in sents)
        if not sent:
            continue

        label = [0] * num_classes
        for i in class_ids:
            label[i] = 1

        inputs.append(sent)
        labels.append(label)
        lnis.append(lni)

    print(len(labels), len(inputs), len(lnis))
    return lnis, inputs, labels

In [8]:
# Build training texts and multi-hot labels; record ids are discarded here.
# NOTE(review): `phrase_rule_133` is not defined in any visible cell — it must already exist in the kernel.
_, inputs, labels = prepare_data(traindata, phrase_rule_133)
# Spot-check one prepared input text.
inputs[10]

320635 320635 320635


"denise wilson has filed a motion for an extension of time to file objections to the magistrate judge's proposed findings and recommended disposition . that motion is granted . document ."

In [9]:
# Load the held-out records and prepare them the same way as the training set.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
test_path = './model_data_ftc.pkl'
with open(test_path, 'rb') as f:
    testdata = pickle.load(f)
print(len(testdata))

# Record ids are kept for the test set (unlike the training call, which discards them).
test_lnis, test_inputs, test_labels = prepare_data(testdata, phrase_rule_133)
# Spot-check one prepared input text.
test_inputs[10]

112089
93581 93581 93581


"extension of time granted to april , to file the reply to the informal response to the petition for writ of habeas corpus . extension is granted based upon deputy state public defender debra s . sabah press's representation that she anticipates filing that brief by ."

In [11]:
def sent2tokenids(inputs, tokenizer):
    """Encode each text in `inputs` into a list of token ids.

    `tokenizer.encode` is called once per text with `add_special_tokens=True`
    (so '[CLS]' and '[SEP]' are included). No truncation or padding happens
    here. Returns a List[List[int]] in the same order as `inputs`.
    """
    return [tokenizer.encode(text, add_special_tokens=True) for text in inputs]

# NOTE(review): the tokenization call is commented out, so `test_input_ids` must already
# exist in the kernel; on a fresh kernel the len() below raises NameError until it is re-enabled.
# test_input_ids = sent2tokenids(test_inputs, tokenizer)
len(test_input_ids)

93581

In [15]:
# NOTE(review): the tokenization call is commented out, so `train_input_ids` must already
# exist in the kernel; re-enable the next line for a fresh-kernel run.
# train_input_ids = sent2tokenids(inputs, tokenizer)

print(train_input_ids[10])
# Distribution of token counts per example; used to judge a truncation length.
lengths = [len(x) for x in train_input_ids]
stats.describe(lengths)

[2, 17871, 2571, 63, 5263, 21, 2422, 26, 40, 3896, 16, 85, 20, 3893, 20045, 20, 14, 12393, 1878, 22, 18, 2097, 10172, 17, 5773, 22157, 13, 9, 30, 2422, 25, 2743, 13, 9, 4492, 13, 9, 3]


DescribeResult(nobs=320635, minmax=(3, 13206), mean=141.37591965942582, variance=15473.276440663463, skewness=4.906607916199845, kurtosis=384.89478880556277)

In [16]:
def prepare_attention_mask(new_inp_ids, pad_id):
    """Build 0/1 attention masks for already padded id sequences.

    A position gets 1 when its token id is greater than `pad_id` and 0
    otherwise. Only `pad_id == 0` is accepted (enforced by an assert).
    Prints the number of masks built and returns them as a list of lists,
    one mask per input sequence.
    """
    assert pad_id == 0, 'bug of pad id'
    attention_masks = [
        [1 if token_id > pad_id else 0 for token_id in row]
        for row in new_inp_ids
    ]
    print('Length of Attention mask : {}'.format(len(attention_masks)))
    return attention_masks


def truncate_pad_id(input_ids, tokenizer, max_len=256):
    """Force every id sequence to exactly `max_len` tokens and build attention masks.

    Sequences longer than `max_len` keep their first ceil(max_len/2) and last
    floor(max_len/2) ids (so the leading and trailing special tokens survive);
    shorter sequences are right-padded with `tokenizer.pad_token_id`. For even
    `max_len` this is identical to the previous behaviour; odd values used to
    produce `max_len - 1` ids and trip the length assertion.

    Args:
        input_ids: iterable of lists of token ids.
        tokenizer: object exposing `pad_token_id`.
        max_len: target length of every returned sequence.

    Returns:
        (new_inp_ids, attention_masks): padded/truncated id lists (always new
        list objects) and the masks built by `prepare_attention_mask`.

    Raises:
        ValueError: if `max_len` is negative.
    """
    if max_len < 0:
        raise ValueError('max_len must be >= 0, got {}'.format(max_len))

    tail_len = max_len // 2
    head_len = max_len - tail_len

    # truncation and padding
    pad_id = tokenizer.pad_token_id # 0

    new_inp_ids = []
    for x in input_ids:
        ll = len(x)
        if ll > max_len:
            # Explicit start index for the tail: x[-0:] would return the whole list.
            row = x[:head_len] + x[ll - tail_len:]
        else:
            row = x + [pad_id] * (max_len - ll)
        assert len(row) == max_len
        new_inp_ids.append(row)

    print('Length of inputs : {}'.format(len(new_inp_ids)))
    attention_masks = prepare_attention_mask(new_inp_ids, pad_id)
    return new_inp_ids, attention_masks

In [17]:
# Truncate/pad the training ids to the default max_len and build their attention masks.
new_train_inp_ids, train_attention_masks = truncate_pad_id(train_input_ids, tokenizer)

Length of inputs : 320635
Length of Attention mask : 320635


In [18]:
# Same truncation/padding for the test ids.
new_test_inp_ids, test_attention_masks = truncate_pad_id(test_input_ids, tokenizer)

Length of inputs : 93581
Length of Attention mask : 93581


In [20]:
# Split ids, masks and labels in a single call so all three are partitioned
# with the same shuffled indices. (Previously ids and masks were split in two
# separate calls that only stayed aligned because they shared a random_state.)
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    new_train_inp_ids, train_attention_masks, labels,
    random_state=2020, test_size=0.15)

# Sanity check: the three parallel collections must stay the same length.
assert len(train_inputs) == len(train_masks) == len(train_labels)
assert len(val_inputs) == len(val_masks) == len(val_labels)

len(train_inputs), len(train_masks), len(train_labels), len(val_labels)

(272539, 272539, 272539, 48096)

In [21]:
# Convert the Python lists to tensors, choosing the dtype at construction time:
# int64 for token ids and attention masks, float32 for the multi-hot labels.
# NOTE: each name is rebound from a list to a tensor (test_inputs previously held raw text).
train_inputs = torch.tensor(train_inputs, dtype=torch.long)
val_inputs = torch.tensor(val_inputs, dtype=torch.long)
test_inputs = torch.tensor(new_test_inp_ids, dtype=torch.long)

train_masks = torch.tensor(train_masks, dtype=torch.long)
val_masks = torch.tensor(val_masks, dtype=torch.long)
test_masks = torch.tensor(test_attention_masks, dtype=torch.long)

train_labels = torch.tensor(train_labels, dtype=torch.float)
val_labels = torch.tensor(val_labels, dtype=torch.float)
test_labels = torch.tensor(test_labels, dtype=torch.float)

In [22]:
# _process_data = {'train_inputs': train_inputs, 'val_inputs': val_inputs, 'test_inputs': test_inputs, 
#                  'train_labels': train_labels, 'val_labels': val_labels, 'test_labels': test_labels, 
#                  'train_masks': train_masks, 'val_masks': val_masks, 'test_masks': test_masks, 
#                  'test_lnis': test_lnis}
# with open('/data/charley/half_data/e3_albert_process_data.pkl', 'wb') as f:
#     pickle.dump(_process_data, f)

In [5]:
# loading data
# Restore the preprocessed tensors from a cached pickle instead of recomputing them.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# NOTE(review): the (commented-out) dump cell writes to a different path than the one
# read here — confirm which file is current.
with open('./process_data.pkl', 'rb') as f:
    _process_data = pickle.load(f)
    
train_inputs = _process_data['train_inputs']
train_labels = _process_data['train_labels']
train_masks = _process_data['train_masks']

val_inputs = _process_data['val_inputs']
val_labels = _process_data['val_labels']
val_masks = _process_data['val_masks']

# Test tensors are also stored in the pickle but are not restored here.
# test_inputs, test_masks = _process_data['test_inputs'], _process_data['test_masks']
# test_labels, test_lnis = _process_data['test_labels'], _process_data['test_lnis']

In [6]:
# Batches are (input_ids, attention_mask, labels) in that order — the order train_epoch indexes.
batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# Random order for training batches.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(val_inputs, val_masks, val_labels)
# Fixed order for validation batches.
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# test_data = TensorDataset(test_inputs, test_masks, test_labels)
# test_sampler = SequentialSampler(test_data)
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
# len(test_dataloader)