In [9]:
import json
from random import shuffle
from collections import Counter
import torch

import time
import logging
import argparse
import os
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import random
import numpy as np

from torch.nn.parameter import Parameter, UninitializedParameter
import torch

from torch import Tensor
from torch.nn import init
import math
from math import gcd
from math import sqrt
from torch.nn import functional as F

class PHMLayer(nn.Module):

  def __init__(self, in_features, out_features,n=2):
    super(PHMLayer, self).__init__()
    self.n = n
    self.in_features = in_features
    self.out_features = out_features

    self.bias = Parameter(torch.Tensor(out_features))

    self.a = torch.zeros((n, n, n))
    self.a = Parameter(torch.nn.init.xavier_uniform_(self.a))

    self.s = torch.zeros((n, self.out_features//n, self.in_features//n)) 
    self.s = Parameter(torch.nn.init.xavier_uniform_(self.s))

    self.weight = torch.zeros((self.out_features, self.in_features))

    fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
    bound = 1 / math.sqrt(fan_in)
    init.uniform_(self.bias, -bound, bound)

  def kronecker_product1(self, a, b):

    siz1 = torch.Size(torch.tensor(a.shape[-2:]) * torch.tensor(b.shape[-2:]))
    
    res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4)
    siz0 = res.shape[:-4]
    out = res.reshape(siz0 + siz1)

    return out

  def forward(self, input: Tensor) -> Tensor:
    self.weight = torch.sum(self.kronecker_product1(self.a, self.s), dim=0)

    input = input.type(dtype=self.weight.type())

      
    return F.linear(input, weight=self.weight, bias=self.bias)

  def extra_repr(self) -> str:
    return 'in_features={}, out_features={}, bias={}'.format(
      self.in_features, self.out_features, self.bias is not None)
    
  def reset_parameters(self) -> None:
    init.kaiming_uniform_(self.a, a=math.sqrt(5))
    init.kaiming_uniform_(self.s, a=math.sqrt(5))
    fan_in, _ = init._calculate_fan_in_and_fan_out(self.placeholder)
    bound = 1 / math.sqrt(fan_in)
    init.uniform_(self.bias, -bound, bound)

In [10]:
from torch import nn
from transformers import RobertaTokenizer,RobertaModel
class Model_Classifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_labels, dropout):
        super(Model_Classifier, self).__init__()
        # Instantiate BERT model
        self.bert = RobertaModel.from_pretrained('roberta-large')
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels
        self.dropout = dropout
        self.linear = nn.Linear(self.embedding_dim,self.num_labels)
       
        


    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state = outputs[0][:, 0, :]
        
        logits = self.linear(last_hidden_state)


        #logits = self.classifier(last_hidden_state_cls)

        return logits, last_hidden_state
class QModel_Classifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_labels, dropout,feature_remove_max= False):
        super(QModel_Classifier, self).__init__()
        # Instantiate BERT model
        self.bert = RobertaModel.from_pretrained('roberta-large')
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels
        self.dropout = dropout

        
        divisors = sorted(self.cf(embedding_dim,hidden_dim))
        divisors1 = sorted(self.cf(hidden_dim,num_labels))
        common_divisors = sorted(set(divisors1) & set(divisors))
        if(feature_remove_max == True):
            self.n = common_divisors[-1]
        else :
            self.n = common_divisors[0]
        
        self.linear = PHMLayer(self.embedding_dim, self.hidden_dim,self.n)


    def cf(self,num1,num2):
            n=[]
            g=gcd(num1, num2)
            for i in range(1, int(sqrt(g))+1):
                if g%i==0:
                    n.append(i)
                    if g!=i*i:
                        n.append(int(g/i))
            return n

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task

        last_hidden_state_cls = outputs[0][:, 0, :]
        #print(last_hidden_state_cls.shape)
        logits = self.linear(last_hidden_state_cls)

        

        #print(logits.shape)
        # Feed input to classifier to compute logits
        #logits = self.classifier(last_hidden_state_cls)
        
        return logits, last_hidden_state_cls

In [11]:
def initialize_model(model,embedding=1024,hidden = 16,num_classes = 2):
    """Initialize the Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    model_classifier = model(embedding, hidden, num_classes, dropout=0.1)

    # Tell PyTorch to run the model on GPU
    model_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(model_classifier.parameters(),
                      lr=1e-5,  # Default learning rate
                      eps=1e-8  # Default epsilon value
                      )

    return model_classifier, optimizer


In [12]:
from transformers import RobertaModel,RobertaTokenizer
class config():
    def __init__(self):
        """
        :param args:
        """
        super(config, self).__init__()
        self.data = None
        self.bert_model = 'roberta-large'
        self.num_labels = 0
        self.epoch = 20
        self.k_spt = 5
        self.k_qry = 2
        self.outer_batch_size = 2
        self.inner_batch_size = 12
        self.outer_update_lr = 5e-5
        self.inner_update_lr = 5e-5
        self.inner_update_step = 10
        self.inner_update_step_eval = 40
        self.num_task_train = 500
        self.num_task_test = 3


In [13]:
from torch import nn
from math import gcd
from math import sqrt
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from transformers import RobertaForSequenceClassification
from copy import deepcopy
import gc
import torch
from sklearn.metrics import accuracy_score
import numpy as np

class Learner(nn.Module):
    """
    Meta Learner
    """
    def __init__(self, args):
        """
        :param args:
        """
        super(Learner, self).__init__()
        
        self.num_labels = args.num_labels
        self.outer_batch_size = args.outer_batch_size
        self.inner_batch_size = args.inner_batch_size
        self.outer_update_lr  = args.outer_update_lr
        self.inner_update_lr  = args.inner_update_lr
        self.inner_update_step = args.inner_update_step
        self.inner_update_step_eval = args.inner_update_step_eval
        self.bert_model = args.bert_model
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.loss_fn = nn.CrossEntropyLoss()
        self.model , self.outer_optimizer = initialize_model(QModel_Classifier,1024,16,self.num_labels)

        
        self.model.train()

    def forward(self, batch_tasks, training = True):
        """
        batch = [(support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset)]
        
        # support = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        """
        task_accs = []
        task_loss = []
        sum_gradients = []
        num_task = len(batch_tasks)
        num_inner_update_step = self.inner_update_step if training else self.inner_update_step_eval

        for task_id, task in enumerate(batch_tasks):
            support = task[0]
            query   = task[1]
            
            fast_model = deepcopy(self.model)
            fast_model.to(self.device)
            support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                            batch_size=self.inner_batch_size)
            
            inner_optimizer = deepcopy(self.outer_optimizer)
            fast_model.train()
            
            print('----Task',task_id, '----')
            for i in range(0,num_inner_update_step):
                all_loss = []
                for inner_step, batch in enumerate(support_dataloader):
                    
                    batch = tuple(t.to(self.device) for t in batch)
                    input_ids, attention_mask, segment_ids, label_id = batch
     
                    
                    logits, hiden_state = fast_model(input_ids, attention_mask)


                    #print(logits)
                    #print(label_id)
                    
                    loss = self.loss_fn(logits, label_id)    
                    #loss = outputs[0]              
                    loss.backward()
                    inner_optimizer.step()
                    inner_optimizer.zero_grad()
                    
                    all_loss.append(loss.item())
                
                if i % 4 == 0:
                    print("Inner Loss: ", np.mean(all_loss))

            query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
            
            #iter and next seperate
            query_iter = iter(query_dataloader)
            query_batch = next(query_iter)
            query_batch = tuple(t.to(self.device) for t in query_batch)
            q_input_ids, q_attention_mask, q_segment_ids, q_label_id = query_batch
            logits, hiden_state = fast_model(q_input_ids, q_attention_mask)
            
            if training:
                q_loss = self.loss_fn(logits, q_label_id)

                q_loss.backward()
                fast_model.to(torch.device('cpu'))

                # None parameter layers are removed
                for i, params in enumerate(fast_model.parameters()):
                    if task_id == 0:
                        sum_gradients.append(deepcopy(params.grad))
                    else:
                        if(params.grad == None ):
                            pass
                        else:
                            sum_gradients[i] += deepcopy(params.grad)
            
            q_logits = F.softmax(logits,dim=1)
            pre_label_id = torch.argmax(q_logits,dim=1)
            pre_label_id = pre_label_id.detach().cpu().numpy().tolist()
            q_label_id = q_label_id.detach().cpu().numpy().tolist()
            
            acc = accuracy_score(pre_label_id,q_label_id)
            task_accs.append(acc)

            q_loss = self.loss_fn(logits, q_label_id)
            task_loss.append(q_loss)
            
            del fast_model, inner_optimizer
            torch.cuda.empty_cache()
        
        if training:
            # Average gradient across tasks
            for i in range(0,len(sum_gradients)):
                if(sum_gradients[i] == None ):
                            pass
                else:
                    sum_gradients[i] = sum_gradients[i] / float(num_task)

            #Assign gradient for original model, then using optimizer to update its weights
            for i, params in enumerate(self.model.parameters()):
                params.grad = sum_gradients[i]


            self.outer_optimizer.step()
            self.outer_optimizer.zero_grad()
            
            del sum_gradients
            gc.collect()
        
        return np.mean(task_accs) , np.mean(task_loss)

In [14]:

from torch.utils.data import Dataset
import collections
import random
import json, pickle

class MetaTask(Dataset):
    
    def __init__(self, examples, num_task, k_support, k_query, tokenizer):
        """
        :param samples: list of samples
        :param num_task: number of training tasks.
        :param k_support: number of support sample per task
        :param k_query: number of query sample per task
        """
        self.examples = examples
        random.shuffle(self.examples)
        
        self.num_task = num_task
        self.k_support = k_support
        self.k_query = k_query
        self.tokenizer = tokenizer
        self.max_seq_length = 256
        self.create_batch(self.num_task)
    
    def create_batch(self, num_task):
        self.supports = []  # support set
        self.queries = []  # query set
        
        for b in range(num_task):  # for each task
            # 1.select domain randomly
            domain = random.choice(self.examples)['label']
            domainExamples = [e for e in self.examples if e['label'] == domain]
            
            # 1.select k_support + k_query examples from domain randomly
            selected_examples = random.sample(domainExamples,self.k_support + self.k_query)
            random.shuffle(selected_examples)
            exam_train = selected_examples[:self.k_support]
            exam_test  = selected_examples[self.k_support:]
            
            self.supports.append(exam_train)
            self.queries.append(exam_test)

    def create_feature_set(self,examples):
        all_input_ids      = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_attention_mask = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_segment_ids    = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_label_ids      = torch.empty(len(examples), dtype = torch.long)

        for id_,example in enumerate(examples):
            input_ids = self.tokenizer.encode(example['sentence'])
            attention_mask = [1] * len(input_ids)
            segment_ids    = [0] * len(input_ids)

            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                attention_mask.append(0)
                segment_ids.append(0)

            label_id = example['label']

            
            all_input_ids[id_] = torch.Tensor(input_ids).to(torch.long)
            all_attention_mask[id_] = torch.Tensor(attention_mask).to(torch.long)
            all_segment_ids[id_] = torch.Tensor(segment_ids).to(torch.long)
            all_label_ids[id_] = torch.Tensor([label_id]).to(torch.long)

        tensor_set = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)  
        return tensor_set
    
    def __getitem__(self, index):
        support_set = self.create_feature_set(self.supports[index])
        query_set   = self.create_feature_set(self.queries[index])
        return support_set, query_set

    def __len__(self):
        # as we have built up to batchsz of sets, you can sample some small batch size of sets.
        return self.num_task

In [15]:
def random_seed(value):
    torch.backends.cudnn.deterministic=True
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)
    np.random.seed(value)
    random.seed(value)

def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
    idxs = list(range(0,len(taskset)))
    if is_shuffle:
        random.shuffle(idxs)
    for i in range(0,len(idxs), batch_size):
        yield [taskset[idxs[i]] for i in range(i, min(i + batch_size,len(taskset)))]


if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    with torch.no_grad():
            logits, _ = model(b_input_ids, b_attn_mask)

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        predict += preds.tolist()
        y_true += b_labels.tolist()

    # plot heatmap
    test_accuracy = np.mean(test_accuracy)
    cm = confusion_matrix(y_true, predict)
    plt.figure(figsize=(10, 7))
    sn.heatmap(cm, annot=True)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

    # Accuracy
    print(f'Accuracy: {accuracy_score(y_true, predict)}')

    # Recall
    print(f'Recall: {recall_score(y_true, predict, average=None)}')

    # Precision
    print(f'Precision: {precision_score(y_true, predict, average=None)}')

    # F1_score
    print(f'F1_score: {f1_score(y_true, predict, average=None)}')

    return accuracy_score(y_true, predict)

No GPU available, using the CPU instead.


In [None]:
from tqdm import tqdm
def test_evaluate(model,model_path, test_dataloader,hidden=16,num_labels=2,feature_remove_max=False):
    """After the completion of each training epoch, measure the model's performance
    on our vtest set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    if(model == QModel_Classifier):
        model = model(1024,hidden, num_labels=num_classes, dropout=0.1,feature_remove_max=feature_remove_max)
    else:
        model = model(1024,hidden, num_labels=num_classes, dropout=0.1)

    model.load_state_dict(torch.load(model_path))
    model.eval()

    # Tracking variables
    test_accuracy = []
    predict = []
    y_true = []

    # For each batch in our test set...
    for batch in tqdm(test_dataloader):
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, MAX_LEN=256):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,  # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,  # Max length to truncate/pad
            pad_to_max_length=True,  # Pad sentence to max length
            return_attention_mask=True  # Return attention mask
        )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks
def prepare_data(test_path, tsv = False):
    if(tsv == True) : 
        test_df = pd.read_csv(train_path, sep='\t')

    else:
        test_df = pd.read_csv(train_path)

    test_text = test_df["sentence"]
    test_label = test_df["label"]
    for i in range(len(test_text)):
        test_text[i] = text_preprocessing(test_text[i])
        
    test_inputs, test_masks = preprocessing_for_bert(test_text, MAX_LEN)
    test_labels = torch.tensor(test_label)
    
    batch_size = 64
    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    return test_dataloader

In [None]:
import warnings
import os
import pandas as pd
from transformers import AdamW
warnings.filterwarnings("ignore", category=UserWarning)
def main():
    args = config()
    tokenizer = RobertaTokenizer.from_pretrained(args.bert_model, do_lower_case = True)
    tsv = False
    best_validation_loss = float('inf')
    model_path = r"fewshot_qmodel.pt"
    
    train_path = "trec/train.tsv"
    valid_path = "trec/dev.tsv"
    test_path = 'trec/test.tsv'

    if(train_path[-3:]=='tsv'):
        tsv = True
    
    if(tsv == True) : 
        train_ds = pd.read_csv(train_path, sep='\t')
        val_ds = pd.read_csv(valid_path, sep='\t')
    else:
        train_ds = pd.read_csv(train_path)
        val_ds = pd.read_csv(valid_path)

    args.num_labels = len(val_ds.label.unique())

    #initialize model 
    learner = Learner(args)

    
    test_examples = []

    for index, row in list(val_ds.iterrows()):
        test_examples.append(dict(row))


    train_examples = []

    for index, row in list(train_ds.iterrows()):
        train_examples.append(dict(row))
        
    test = MetaTask(test_examples, num_task = args.num_task_test, k_support=args.k_spt, 
                    k_query=args.k_qry, tokenizer = tokenizer)
    
    global_step = 0
    for epoch in range(args.epoch):
    
        train = MetaTask(train_examples, num_task = args.num_task_train, k_support=args.k_spt, 
                         k_query=args.k_qry, tokenizer = tokenizer)
    
        db = create_batch_of_tasks(train, is_shuffle = True, batch_size = args.outer_batch_size)
    
        for step, task_batch in enumerate(db):
    
            acc,_ = learner(task_batch)
    
            print('Step:', step, '\ttraining Acc:', acc)
    
            if global_step % 20 == 0:
                random_seed(123)
                print("\n-----------------Testing Mode-----------------\n")
                db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
                acc_all_test = []
                loss_all_test = []
    
                for test_batch in db_test:
                    acc,loss = learner(test_batch, training = False)
                    acc_all_test.append(acc)
                    loss_all_test.append(loss)
                    
                print('Step:', step, 'Test F1:', np.mean(acc_all_test))
    
                random_seed(int(time.time() % 10))
                
                val_loss = np.mean(acc_all_test)
                if (val_loss < best_validation_loss):
                best_validation_loss = val_loss
                torch.save(model.state_dict(), model_path)
                
            global_step += 1

    test_dataloader = prepare_data(test_path, tsv = tsv)
    test_accuracy = test_evaluate(QModel_Classifier,model_path, test_dataloader,hidden=16,num_labels=args.num_labels)
if __name__ == "__main__":
    main()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


----Task 0 ----
Inner Loss:  2.445026397705078
Inner Loss:  2.431630849838257
Inner Loss:  2.4546172618865967
----Task 1 ----
Inner Loss:  2.297107219696045
Inner Loss:  2.3064944744110107
Inner Loss:  2.2736544609069824
Step: 0 	training Acc: 0.0

-----------------Testing Mode-----------------

----Task 0 ----
Inner Loss:  2.2386443614959717
Inner Loss:  2.227292537689209
Inner Loss:  2.184762954711914
Inner Loss:  2.333995819091797
Inner Loss:  2.3043158054351807
Inner Loss:  2.3041958808898926
Inner Loss:  2.2687318325042725
Inner Loss:  2.277085542678833
Inner Loss:  2.250636577606201
Inner Loss:  2.2154786586761475
----Task 0 ----
Inner Loss:  3.4664204120635986
Inner Loss:  3.428532123565674
Inner Loss:  3.4227397441864014
Inner Loss:  3.4421677589416504
Inner Loss:  3.46010160446167
Inner Loss:  3.4243366718292236
Inner Loss:  3.4100594520568848
Inner Loss:  3.4320316314697266
Inner Loss:  3.5647244453430176
Inner Loss:  3.4286227226257324
----Task 0 ----
Inner Loss:  3.42463302

In [None]:
model , outer_optimizer = initialize_model(Model_Classifier,1024,16,6)
for i, params in enumerate(model.parameters()):
    print(i)
    print(params.shape)

In [None]:
list(model.linear.parameters())

In [None]:
model = RobertaModel.from_pretrained('roberta-large')
for i, params in enumerate(model.parameters()):

    print(i)
    print(params.shape)