In [1]:
import torch
import io
import torch.nn.functional as F
import random
import numpy as np
import time
import math
import datetime
import torch.nn as nn
from transformers import *



In [2]:
# Seed every random number generator used by the notebook (Python's
# `random`, NumPy and PyTorch) with the same value so runs are repeatable.
seed_val = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed_val)
# Additionally seed the generators of all CUDA devices when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)

In [3]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060


In [4]:
class Config:
    """Hyper-parameters, data file locations and label inventory for the GAN-BERT run.

    Everything is stored as a plain class attribute; `cfg` below is the
    instance the rest of the notebook reads from.
    """

    # ---- Transformer encoder parameters ----
    # Checkpoint name; change it if you want to use a different encoder.
    encoder_model_name = "bert-base-cased"

    max_seq_length = 64  # tokenizer max length
    train_batch_size = 64
    val_batch_size = 64
    input_size = 768
    hidden_size = 768

    # ---- GAN-BERT specific parameters ----
    generator_noise_size = 100
    # Number of hidden layers in the generator,
    # each of the size of the output space.
    num_hidden_layers_g = 1
    # Number of hidden layers in the discriminator,
    # each of the size of the input space.
    num_hidden_layers_d = 1
    # Dropout to be applied to the discriminator's input vectors.
    out_dropout_rate = 0.2

    # Replicate labeled data to balance poorly represented datasets,
    # e.g., less than 1% of labeled material.
    apply_balance = True

    # ---- Optimization parameters ----
    learning_rate_discriminator = 5e-5
    learning_rate_generator = 5e-5
    epsilon = 1e-8
    num_train_epochs = 10

    # ---- Scheduler ----
    apply_scheduler = False
    warmup_proportion = 0.1

    # ---- Data files (tab-separated) ----
    labeled_file = "./data/labeled.tsv"
    unlabeled_file = "./data/unlabeled.tsv"
    test_filename = "./data/test.tsv"

    # Label inventory, grouped by coarse prefix. The order matters: a label's
    # position in this list is its class index in `label2class`.
    label_list = [
        "UNK_UNK",
        "ABBR_abb", "ABBR_exp",
        "DESC_def", "DESC_desc", "DESC_manner", "DESC_reason",
        "ENTY_animal", "ENTY_body", "ENTY_color", "ENTY_cremat", "ENTY_currency",
        "ENTY_dismed", "ENTY_event", "ENTY_food", "ENTY_instru", "ENTY_lang",
        "ENTY_letter", "ENTY_other", "ENTY_plant", "ENTY_product", "ENTY_religion",
        "ENTY_sport", "ENTY_substance", "ENTY_symbol", "ENTY_techmeth", "ENTY_termeq",
        "ENTY_veh", "ENTY_word",
        "HUM_desc", "HUM_gr", "HUM_ind", "HUM_title",
        "LOC_city", "LOC_country", "LOC_mount", "LOC_other", "LOC_state",
        "NUM_code", "NUM_count", "NUM_date", "NUM_dist", "NUM_money", "NUM_ord",
        "NUM_other", "NUM_perc", "NUM_period", "NUM_speed", "NUM_temp", "NUM_volsize",
        "NUM_weight",
    ]

    # Label name -> integer class index (position in `label_list`).
    label2class = dict(zip(label_list, range(len(label_list))))


# Shared configuration instance used by the cells that follow.
cfg = Config()

In [5]:
from training import Trainer
from modeling import Discriminator, Generator
from tensor_dataset import (
    process_all_data, get_qc_examples_from_file, get_tensor_dataset
)

In [6]:
# Load the tokenizer for the same checkpoint name (cfg.encoder_model_name)
# that is later used to load the encoder.
tokenizer = AutoTokenizer.from_pretrained(cfg.encoder_model_name)

loading configuration file config.json from cache at /home/jahid/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /home/jahid/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading f

In [7]:
# Read the labeled, unlabeled and test splits (in that order) from the
# file paths stored in the config.
labeled_examples, unlabeled_examples, test_examples = (
    get_qc_examples_from_file(split_path)
    for split_path in (cfg.labeled_file, cfg.unlabeled_file, cfg.test_filename)
)

# Preview the first two entries of each split as the cell output.
labeled_examples[:2], unlabeled_examples[:2], test_examples[:2]

([("How many pitchers occupy the shelf beside the crouching woman in Edgar Degas 's 1886 painting The Tub ?",
   'NUM_count'),
  ('Which side of the face do most artists tend to show more of in self-portraits ?',
   'ENTY_other')],
 [("What film ends with the line : `` This is Mrs. Norman Maine '' ?",
   'UNK_UNK'),
  ('What is the average salary of a paleontologist ?', 'UNK_UNK')],
 [('How far is it from Denver to Aspen ?', 'NUM_dist'),
  ('"What county is Modesto ', 'LOC_city')])

In [8]:
# Assemble the semi-supervised training set: labeled examples first, then
# (when present) the unlabeled ones. `train_label_masks` is a parallel
# boolean array that is True only for the labeled entries.
train_examples = labeled_examples
train_label_masks = np.ones(len(labeled_examples), dtype=bool)

if unlabeled_examples:
    train_examples = train_examples + unlabeled_examples
    unlabeled_masks = np.zeros(len(unlabeled_examples), dtype=bool)
    train_label_masks = np.concatenate([train_label_masks, unlabeled_masks])

# Take the balancing switch from the config rather than hard-coding True,
# so that changing cfg.apply_balance actually has an effect.
train_data = process_all_data(
    train_examples,
    train_label_masks,
    balance_label_examples=cfg.apply_balance,
)

# Tokenize and batch the training data.
train_dataloader = get_tensor_dataset(
    train_data,
    label_mapper=cfg.label2class,
    tokenizer=tokenizer,
    max_seq_length=cfg.max_seq_length,
    batch_size=cfg.train_batch_size,
)

Total Data: 5452, Labeled Data: 109, Masking Ratio: 0.01999266324284666


In [9]:
# Every test example carries a label, so the mask is all True and no
# label balancing is applied.
test_label_masks = np.ones(len(test_examples), dtype=bool)
test_data = process_all_data(test_examples, test_label_masks, balance_label_examples=False)

# Tokenize and batch the evaluation data. Use the validation batch size from
# the config (the training batch size was previously used here, leaving
# cfg.val_batch_size without any effect).
test_dataloader = get_tensor_dataset(
    test_data,
    label_mapper=cfg.label2class,
    tokenizer=tokenizer,
    max_seq_length=cfg.max_seq_length,
    batch_size=cfg.val_batch_size,
)

In [10]:
# Pretrained transformer encoder, loaded from the checkpoint named in the config.
encoder = AutoModel.from_pretrained(cfg.encoder_model_name)

# The number of hidden layers now follows cfg.num_hidden_layers_d /
# cfg.num_hidden_layers_g (one hidden size per layer) instead of being
# fixed at a single layer regardless of the config.
discriminator = Discriminator(
    input_size=cfg.input_size,
    hidden_sizes=[cfg.hidden_size] * cfg.num_hidden_layers_d,
    # One output more than the label list.
    # NOTE(review): presumably the extra GAN "fake" class — confirm in modeling.Discriminator.
    num_labels=len(cfg.label_list) + 1,
    dropout_rate=cfg.out_dropout_rate,
)

generator = Generator(
    noise_size=cfg.generator_noise_size,
    output_size=cfg.hidden_size,
    hidden_sizes=[cfg.hidden_size] * cfg.num_hidden_layers_g,
    dropout_rate=cfg.out_dropout_rate,
)

loading configuration file config.json from cache at /home/jahid/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /home/jahid/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/pyt

In [11]:
# Display both networks; the tuple is the cell's output, so their reprs are shown.
generator, discriminator

(Generator(
   (layers): Sequential(
     (0): Linear(in_features=100, out_features=768, bias=True)
     (1): LeakyReLU(negative_slope=0.2, inplace=True)
     (2): Dropout(p=0.2, inplace=False)
     (3): Linear(in_features=768, out_features=768, bias=True)
   )
 ),
 Discriminator(
   (input_dropout): Dropout(p=0.2, inplace=False)
   (layers): Sequential(
     (0): Linear(in_features=768, out_features=768, bias=True)
     (1): LeakyReLU(negative_slope=0.2, inplace=True)
     (2): Dropout(p=0.2, inplace=False)
   )
   (logit): Linear(in_features=768, out_features=52, bias=True)
   (softmax): Softmax(dim=-1)
 ))

In [12]:
# Wire the encoder, both GAN networks and the data loaders into the trainer.
# The device string comes from the `device` selected earlier in the notebook
# ("cuda" or "cpu") instead of a hard-coded "cuda", so the notebook also
# runs on machines without a GPU.
trainer = Trainer(
    config=cfg,
    encoder=encoder,
    discriminator=discriminator,
    generator=generator,
    train_loader=train_dataloader,
    val_loader=test_dataloader,
    device=device.type,
)
trainer.to_device()
trainer.configure_optimizer()


In [13]:
# Run training for 25 epochs.
# NOTE(review): cfg.num_train_epochs is set to 10 but is not used here —
# confirm which epoch count is intended.
trainer.train(epochs=25)

  0%|          | 0/25 [00:00<?, ?it/s]

Training Epoch: 0


92it [01:00,  1.51it/s]


Train Metric:  {'generator_loss': 0.5601518516955168, 'discriminator_loss': 4.763273303923399}


8it [00:01,  7.73it/s]


Val Metric:  {'validation_accuracy': 0.364, 'validation_loss': 2.9671733379364014}


  4%|▍         | 1/25 [01:03<25:14, 63.10s/it]

Training Epoch: 1


92it [01:00,  1.52it/s]


Train Metric:  {'generator_loss': 0.770343665195548, 'discriminator_loss': 2.768962702025538}


8it [00:00,  8.03it/s]


Val Metric:  {'validation_accuracy': 0.492, 'validation_loss': 2.3955118656158447}


  8%|▊         | 2/25 [02:05<24:03, 62.76s/it]

Training Epoch: 2


92it [01:00,  1.52it/s]


Train Metric:  {'generator_loss': 0.7469146186890809, 'discriminator_loss': 1.9927453554194907}


8it [00:01,  7.94it/s]


Val Metric:  {'validation_accuracy': 0.54, 'validation_loss': 2.2363500595092773}


 12%|█▏        | 3/25 [03:08<23:00, 62.74s/it]

Training Epoch: 3


92it [01:01,  1.51it/s]


Train Metric:  {'generator_loss': 0.7338307759036189, 'discriminator_loss': 1.5411683081284813}


8it [00:01,  7.72it/s]


Val Metric:  {'validation_accuracy': 0.584, 'validation_loss': 2.022867441177368}


 16%|█▌        | 4/25 [04:11<22:02, 62.96s/it]

Training Epoch: 4


92it [01:00,  1.51it/s]


Train Metric:  {'generator_loss': 0.7309363568606584, 'discriminator_loss': 1.2716243688179099}


8it [00:01,  7.79it/s]


Val Metric:  {'validation_accuracy': 0.594, 'validation_loss': 2.005647659301758}


 20%|██        | 5/25 [05:14<21:00, 63.04s/it]

Training Epoch: 5


92it [01:01,  1.51it/s]


Train Metric:  {'generator_loss': 0.7241552640562472, 'discriminator_loss': 1.0425708235605904}


8it [00:01,  7.87it/s]


Val Metric:  {'validation_accuracy': 0.628, 'validation_loss': 1.9262101650238037}


 24%|██▍       | 6/25 [06:18<19:59, 63.12s/it]

Training Epoch: 6


92it [01:00,  1.52it/s]


Train Metric:  {'generator_loss': 0.7212932517995005, 'discriminator_loss': 0.9118098396322002}


8it [00:01,  7.81it/s]


Val Metric:  {'validation_accuracy': 0.618, 'validation_loss': 2.0432677268981934}


 28%|██▊       | 7/25 [07:20<18:54, 63.00s/it]

Training Epoch: 7


92it [01:00,  1.52it/s]


Train Metric:  {'generator_loss': 0.7190649768580561, 'discriminator_loss': 0.857661516122196}


8it [00:01,  7.78it/s]


Val Metric:  {'validation_accuracy': 0.64, 'validation_loss': 2.0537848472595215}


 32%|███▏      | 8/25 [08:23<17:48, 62.87s/it]

Training Epoch: 8


