In [1]:
import os
import pickle
import time
from scipy import stats
import numpy as np

import torch
torch.cuda.set_device(0) # won't use cuda:0 to initialize
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertModel, AdamW, DistilBertPreTrainedModel


In [2]:
class DistilBertForClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.distilbert = DistilBertModel(config)
        # self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        # self.init_weights() # change initial weights

    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, hidden_size)
        pooled_output = hidden_state[:, 0]  # (bs, hidden_size)
        pooled_output = F.relu(self.pre_classifier(pooled_output))  # (bs, hidden_size)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
model_path = './distilbert_models/'
config = DistilBertConfig.from_json_file(os.path.join(model_path, 'config.json'))
model = DistilBertForClassification(config)
# state_dict = torch.load(os.path.join(model_path, 'pytorch_model.bin'), map_location='cpu')
state_dict = torch.load(os.path.join(model_path, 'pytorch_model.bin'))
model.load_state_dict(state_dict)
tokenizer = DistilBertTokenizer(os.path.join(model_path, 'vocab.txt'), do_lower_case=True)

In [2]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
tokenizer.save_pretrained('./tok/')

('./tok/vocab.txt', './tok/special_tokens_map.json', './tok/added_tokens.json')

In [4]:
tokenizer.save_vocabulary('./tok/')

('./tok/vocab.txt',)

In [5]:
output_dir = "./distilbert_models/"
tokenizer1 = DistilBertTokenizer.from_pretrained(output_dir, do_lower_case=True)
tokenizer1.save_pretrained('./tok/')

('./tok/vocab.txt', './tok/special_tokens_map.json', './tok/added_tokens.json')

In [3]:
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # , add_special_tokens=True
input_ids

tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]])

In [5]:
output_dir = "./distilbert_models/"
tokenizer1 = DistilBertTokenizer.from_pretrained(output_dir, do_lower_case=True)
input_ids1 = torch.tensor(tokenizer1.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # , add_special_tokens=True
input_ids1

tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]])

In [3]:
# loading data
with open('./e3_distilbert_process_data.pkl', 'rb') as f:
    bert_process_data = pickle.load(f)
    
# train_inputs = bert_process_data['train_inputs']
# train_labels = bert_process_data['train_labels']
# train_masks = bert_process_data['train_masks']

val_inputs = bert_process_data['val_inputs']
val_labels = bert_process_data['val_labels']
val_masks = bert_process_data['val_masks']

test_inputs, test_masks = bert_process_data['test_inputs'], bert_process_data['test_masks']
test_labels, test_lnis = bert_process_data['test_labels'], bert_process_data['test_lnis']

In [4]:
batch_size = 16

# Create the DataLoader for our training set.
# train_data = TensorDataset(train_inputs, train_masks, train_labels)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(val_inputs, val_masks, val_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
len(test_dataloader)

5849

In [27]:
class DistilBertForClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # self.distilbert = DistilBertModel(config)
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        # self.init_weights() # change initial weights

    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, hidden_size)
        pooled_output = hidden_state[:, 0]  # (bs, hidden_size)
        pooled_output = F.relu(self.pre_classifier(pooled_output))  # (bs, hidden_size)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
#     def load_pretrain_wights(self):
#         self.distilbert.from_pretrained('distilbert-base-uncased')

configuration = DistilBertConfig(num_labels=133)
model = DistilBertForClassification(configuration)
# model.from_pretrained('distilbert-base-uncased')
params = list(model.named_parameters())


In [28]:
for p in params[5:21]:
    print(p[0], p[1].size(), p[1].requires_grad, p[1][:10])
    break

distilbert.transformer.layer.0.attention.q_lin.bias torch.Size([768]) True tensor([ 0.5412, -0.2970, -0.4073,  0.3460, -0.2974,  0.4035,  0.0176,  0.2931,
         0.2445, -0.1061], grad_fn=<SliceBackward>)


In [None]:
def check_params(model):
    params = list(model.named_parameters())
    p = params[5]
    print(p[0], p[1].size(), p[1].requires_grad, p[1][:10])
    p = params[6]
    print(p[0], p[1].size(), p[1].requires_grad, p[1][0, :10])

In [6]:
# Initializing a DistilBERT configuration
# configuration = DistilBertConfig()
configuration = DistilBertConfig(num_labels=133)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
model = DistilBertForClassification(configuration)
model.load_pretrain_wights()
model = nn.DataParallel(model, device_ids=[0, 1], dim=0)
model.to(device)

DataParallel(
  (module): DistilBertForClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (dropout): Dropout(p=0.1, inplace=False)
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, 

In [4]:
class DistilBertForClassification(DistilBertPreTrainedModel):
    def __init__(self, config, hidden_size=768, num_labels=133, dropout_prob=0.5):
        super().__init__(config)
        # self.distilbert = DistilBertModel(config)
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout_prob)
        # self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, hidden_size)
        pooled_output = hidden_state[:, 0]  # (bs, hidden_size)
        pooled_output = F.relu(self.pre_classifier(pooled_output))  # (bs, hidden_size)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [5]:
# model.config
# configuration = DistilBertConfig()
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = DistilBertForClassification(configuration)
# model.to(device)

In [2]:
class DistilBertForClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # self.distilbert = DistilBertModel(config)
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        # self.init_weights() # change initial weights

    def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, hidden_size)
        pooled_output = hidden_state[:, 0]  # (bs, hidden_size)
        pooled_output = F.relu(self.pre_classifier(pooled_output))  # (bs, hidden_size)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

output_dir = "./distilbert_models/"
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = DistilBertTokenizer.from_pretrained(output_dir, do_lower_case=True, add_special_tokens=True)
device = torch.device('cpu')
model = DistilBertForClassification.from_pretrained(output_dir)
print(model.config)
model.to(device)

DistilBertConfig {
  "activation": "gelu",
  "architectures": null,
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "dim": 768,
  "do_sample": false,
  "dropout": 0.1,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_31",
    "32": "LABEL_32",
    "33": "LABEL_33",
    "34": "LAB

DistilBertForClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1, inplace=False)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (drop

tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]])

In [7]:
criterion = nn.BCEWithLogitsLoss()

# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [8]:
def train_epoch(model, device, epoch, train_dataloader, validation_dataloader, 
                criterion, optimizer, clip=5.):
    model.train()
    train_loss = 0
    t0 = time.time()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader, 1):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        loss = criterion(outputs, b_labels)
        train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        # Update the learning rate.
        # scheduler.step()
        if step % 1000 == 0:
            # print loss info every 20 Iterations
            log_str = "Epoch : {} , Iteration : {} , Time : {:.2f} , TrainLoss : {:.4f}".format \
                        (epoch, step, time.time()-t0, train_loss/step)
            print(log_str)
            t0 = time.time()
            break
        train_loss /= len(train_dataloader)

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(validation_dataloader, 1):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            loss = criterion(outputs, b_labels)
            eval_loss += loss.item()
        eval_loss /= len(validation_dataloader)

    return model, optimizer, train_loss, eval_loss

In [11]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os

def save_transformers(model, tokenizer):
    output_dir = "./models/"

    # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
    # If we have a distributed model, save only the encapsulated model
    # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
    model_to_save = model.module if hasattr(model, 'module') else model

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(output_dir)

In [11]:
# print(hasattr(model, 'module'))
# model_to_save = model.module
# print(model_to_save.config)

In [12]:
n_epochs = 1
clip = 2.0
# save_path = './ft_distilbert_0219.pt'
best_eval_loss = float('inf')
for epoch in range(1, 1+n_epochs):
    model, optimizer, train_loss, eval_loss = train_epoch(model, device, epoch, 
                                                          train_dataloader, validation_dataloader, 
                                                          criterion, optimizer, clip=clip)

    print(">> Epoch : {} , TrainLoss : {:.4f} , EvalLoss : {:.4f}\n".format \
          (epoch, train_loss, eval_loss))

    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        save_transformers(model, tokenizer)
        # torch.save(model.state_dict(), save_path)

Epoch : 1 , Iteration : 1000 , Time : 99.68 , TrainLoss : 0.0000
>> Epoch : 1 , TrainLoss : 0.0435 , EvalLoss : 0.0334



In [5]:
# output_dir = "./distilbert_models/"
# model = DistilBertForClassification.from_pretrained(output_dir)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

In [5]:
def predict_res(model, data_loader, device):
    model.eval()
    y_true = None
    y_pred = None
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            
            if y_true is None:
                y_true = b_labels
            else:
                y_true = torch.cat((y_true, b_labels), 0)
            # outputs = model(seqs, seq_lens)

            if y_pred is None:
                y_pred = outputs
            else:
                y_pred = torch.cat((y_pred, outputs), 0)
            # break

    print(y_true.size(), y_pred.size())
    y_pred = torch.sigmoid(y_pred)
    return y_true.cpu().numpy(), y_pred.cpu().numpy()

In [6]:
def mean_column_wise_auc(y_true, y_pred):
    assert y_true.shape[1] == y_pred.shape[1],'Arrays must have the same dimension'
    list_of_aucs = []
    for column in range(y_true.shape[1]):
        #print(sum(y_true[:,column]), sum(y_pred[:,column]))
        if sum(y_true[:,column]) == 0:
            continue
        list_of_aucs.append(roc_auc_score(y_true[:,column],y_pred[:,column]))
    # print(list_of_aucs)
    return np.array(list_of_aucs).mean(), len((list_of_aucs))

def fit_active_value(s1, active_value):
    # return np.round(s1)
    return (s1 > active_value).astype(int)

def cut_max_num(s1, max_num):
    max_ids = np.argsort(s1)[-max_num:]
    s3 = np.zeros_like(s1)
    # for i in max_ids:
    #     s3[i] = 1
    s3[max_ids] = 1
    return s3

def cal_avg_p_r(arr_true, arr_pred, max_num=6, active_value=0.05):
    ps, rs = [], []
    for i in range(arr_true.shape[0]):
        t1, s1 = arr_true[i], arr_pred[i]
        if sum(t1) <= 0:
            continue
        s2 = fit_active_value(s1, active_value)
        if sum(s2) > max_num:
            s2 = cut_max_num(s1, max_num)
        # if sum(s2) > 4:
        #     s2 = get_maxf_adj(s1, s2, adj_phrase_map, phrase_prob, cannot_be_only)
        # s2 = fit_threshold(s1, thresholds)

        p, r = precision_score(t1, s2), recall_score(t1, s2)
        ps.append(p)
        rs.append(r)
    return np.average(ps), np.average(rs), len(ps), ps, rs

In [7]:
next(model.parameters()).is_cuda

False

In [9]:
device = torch.device('cuda')
model.to(device)
next(model.parameters()).is_cuda

True

In [10]:
val_true, val_pred = predict_res(model, validation_dataloader, device)

torch.Size([48096, 133]) torch.Size([48096, 133])


In [11]:
# test_dataloader
test_true, test_pred = predict_res(model, test_dataloader, device)

torch.Size([93581, 133]) torch.Size([93581, 133])


In [12]:
val_auc = roc_auc_score(val_true, val_pred)
test_auc = mean_column_wise_auc(test_true, test_pred)
val_auc, test_auc

(0.9758983430744022, (0.983780945344874, 129))

In [13]:
val_pr = cal_avg_p_r(val_true, val_pred, max_num=6, active_value=0.02)
val_pr[:3] # (0.5365099661787537, 0.9350487599536378, 48096)

(0.5365099661787537, 0.9350487599536378, 48096)

In [14]:
test_pr = cal_avg_p_r(test_true, test_pred, max_num=6, active_value=0.02)
test_pr[:3] # (0.5380684469425774, 0.9606396919874013, 93581)

(0.5380684469425774, 0.9606396919874013, 93581)

In [33]:
configuration = DistilBertConfig(num_labels=6)
configuration

DistilBertConfig {
  "activation": "gelu",
  "architectures": null,
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "dim": 768,
  "do_sample": false,
  "dropout": 0.1,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "num_beams": 1,
  "num_labels": 6,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "qa_dropout": 0.1,
  "repetition_penalty": 1.0,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_em

In [6]:
# Initializing a model from the configuration
model = DistilBertForClassification(configuration)

# Accessing the model configuration
configuration = model.config

In [11]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os
output_dir = "./models/"

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)



In [12]:
# torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
# tokenizer.save_vocabulary(output_dir)

In [31]:
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
params = list(model.named_parameters())
for p in params[6:21]:
    #print(p[0], p[1].size(), p[1][:10])
    print(p[0], p[1].size(), p[1][0, :10])
    
    break
    
# transformer.layer.0.attention.q_lin.bias torch.Size([768]) tensor([ 0.5412, -0.2970, -0.4073,  0.3460, -0.2974,  0.4035,  0.0176,  0.2931,
#          0.2445, -0.1061], grad_fn=<SliceBackward>)

# transformer.layer.0.attention.k_lin.weight torch.Size([768, 768]) tensor([ 0.0342, -0.0546, -0.0055,  0.0043,  0.0282, -0.0536,  0.0542,  0.0100,
#         -0.0263,  0.0193], grad_fn=<SliceBackward>)


transformer.layer.0.attention.k_lin.weight torch.Size([768, 768]) tensor([ 0.0342, -0.0546, -0.0055,  0.0043,  0.0282, -0.0536,  0.0542,  0.0100,
        -0.0263,  0.0193], grad_fn=<SliceBackward>)


In [28]:
for p in params[:21]:
    print(p[0], p[1].size(), p[1].requires_grad)
#     print(p[1])
#     break

embeddings.word_embeddings.weight torch.Size([30522, 768]) True
embeddings.position_embeddings.weight torch.Size([512, 768]) True
embeddings.LayerNorm.weight torch.Size([768]) True
embeddings.LayerNorm.bias torch.Size([768]) True
transformer.layer.0.attention.q_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.q_lin.bias torch.Size([768]) True
transformer.layer.0.attention.k_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.k_lin.bias torch.Size([768]) True
transformer.layer.0.attention.v_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.v_lin.bias torch.Size([768]) True
transformer.layer.0.attention.out_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.out_lin.bias torch.Size([768]) True
transformer.layer.0.sa_layer_norm.weight torch.Size([768]) True
transformer.layer.0.sa_layer_norm.bias torch.Size([768]) True
transformer.layer.0.ffn.lin1.weight torch.Size([3072, 768]) True
transformer.layer.0.ffn.lin1.bias t

In [7]:
class FtDistilBert(nn.Module):
    def __init__(self, hidden_size=768, num_labels=133, dropout_prob=0.5):
        super(FtDistilBert, self).__init__()
        self.distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout_prob)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs[0]  # (bs, seq_len, hidden_size)
        pooled_output = hidden_state[:, 0]  # (bs, hidden_size)
        pooled_output = F.relu(self.pre_classifier(pooled_output))  # (bs, hidden_size)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
model_path = './ft_distilbert_0219.pt'
pmodel = FtDistilBert()
pmodel.load_state_dict(torch.load(model_path, map_location='cpu'))

<All keys matched successfully>

In [8]:
ppparams = list(pmodel.named_parameters())

In [30]:
# for p in ppparams[5:21]:
#     print(p[0], p[1].size())
#     print(p[1])
#     break
for p in ppparams[:21]:
    print(p[0], p[1].size(), p[1].requires_grad)

distilbert_model.embeddings.word_embeddings.weight torch.Size([30522, 768]) True
distilbert_model.embeddings.position_embeddings.weight torch.Size([512, 768]) True
distilbert_model.embeddings.LayerNorm.weight torch.Size([768]) True
distilbert_model.embeddings.LayerNorm.bias torch.Size([768]) True
distilbert_model.transformer.layer.0.attention.q_lin.weight torch.Size([768, 768]) True
distilbert_model.transformer.layer.0.attention.q_lin.bias torch.Size([768]) True
distilbert_model.transformer.layer.0.attention.k_lin.weight torch.Size([768, 768]) True
distilbert_model.transformer.layer.0.attention.k_lin.bias torch.Size([768]) True
distilbert_model.transformer.layer.0.attention.v_lin.weight torch.Size([768, 768]) True
distilbert_model.transformer.layer.0.attention.v_lin.bias torch.Size([768]) True
distilbert_model.transformer.layer.0.attention.out_lin.weight torch.Size([768, 768]) True
distilbert_model.transformer.layer.0.attention.out_lin.bias torch.Size([768]) True
distilbert_model.trans

In [29]:
class DistilBertForClassification(DistilBertPreTrainedModel):
    def __init__(self, config, hidden_size=768, num_labels=133, dropout_prob=0.5):
        super().__init__(config)
        # self.distilbert = DistilBertModel(config)
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout_prob)
        # self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, hidden_size)
        pooled_output = hidden_state[:, 0]  # (bs, hidden_size)
        pooled_output = F.relu(self.pre_classifier(pooled_output))  # (bs, hidden_size)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
output_dir = "./distilbert_models/"
cmodel = DistilBertForClassification.from_pretrained(output_dir)
ccparams = list(cmodel.named_parameters())

In [35]:
[cmodel.get_output_embeddings()]

[None]

In [31]:
for p in params[:21]:
    print(p[0], p[1].size(), p[1].requires_grad)

embeddings.word_embeddings.weight torch.Size([30522, 768]) True
embeddings.position_embeddings.weight torch.Size([512, 768]) True
embeddings.LayerNorm.weight torch.Size([768]) True
embeddings.LayerNorm.bias torch.Size([768]) True
transformer.layer.0.attention.q_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.q_lin.bias torch.Size([768]) True
transformer.layer.0.attention.k_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.k_lin.bias torch.Size([768]) True
transformer.layer.0.attention.v_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.v_lin.bias torch.Size([768]) True
transformer.layer.0.attention.out_lin.weight torch.Size([768, 768]) True
transformer.layer.0.attention.out_lin.bias torch.Size([768]) True
transformer.layer.0.sa_layer_norm.weight torch.Size([768]) True
transformer.layer.0.sa_layer_norm.bias torch.Size([768]) True
transformer.layer.0.ffn.lin1.weight torch.Size([3072, 768]) True
transformer.layer.0.ffn.lin1.bias t