In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


import numpy as np
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging

from datasets import load_dataset

raw_datasets = load_dataset("glue", 'stsb')




In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


model_name = "FacebookAI/roberta-base"
config = AutoConfig.from_pretrained(model_name)

config.hidden_dropout_prob=0.1
config.attention_probs_dropout_prob=0.01
#config.num_labels=2
tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = DebertaV2ForQuestionAnswering.from_pretrained("/data2/nusrat/work/bert-finetuned-squad_2/checkpoint-47500")



In [3]:
from transformers import AutoTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# col_to_delete = ['idx']
col_to_delete = ['sentence1','sentence2']

def preprocessing_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True,max_length=512)

tokenized_dataset = raw_datasets.map(preprocessing_function, batched=True)

# tokenized_test_dataset = test_dataset.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# llama_tokenized_datasets = llama_tokenized_datasets.rename_column("target", "label")
# tokenized_train_dataset.set_format("torch")
tokenized_dataset.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [4]:
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers.activations import ACT2FN

# Define the custom linear layer
class PropulsionLinear(nn.Module):
    def __init__(self, input_features, output_features, bias=True, degree=15, **kwargs):
        super(PropulsionLinear, self).__init__()
        # Initialize the underlying nn.Linear with both the specified arguments and any additional kwargs
        self.linear = nn.Linear(input_features, output_features, bias=bias, **kwargs)
        self.propulsion = nn.Parameter(torch.ones(output_features))
        self.degree = degree
 
    def forward(self, x):
        push = torch.pow(self.propulsion, self.degree)
        return torch.mul(self.linear(x), push)
    
class PropulsionEmbedding(nn.Module):
    def __init__(self, degree=15, **kwargs):
        super(PropulsionEmbedding, self).__init__()
        # Initialize the embedding layer with kwargs passed to the constructor
        self.embeddings = nn.Embedding(**kwargs)
        # Assuming embedding_dim is one of the kwargs, use it to initialize propulsion
        self.propulsion = nn.Parameter(torch.ones(kwargs['embedding_dim']))
        self.degree = degree
        
    @property
    def weight(self):
        return self.embeddings.weight
    
    def forward(self, x):
        push = torch.pow(self.propulsion, self.degree)
        return torch.mul(self.embeddings(x), push)




class PropulsionLayerNorm(nn.Module):
    def __init__(self, normalized_shape, degree=1, **kwargs):
        super(PropulsionLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(normalized_shape, **kwargs)
        self.propulsion = nn.Parameter(torch.ones(normalized_shape))
        self.degree = degree


    def forward(self, x):
        self.push = torch.pow(self.propulsion, self.degree)
        return self.layer_norm(x)* self.push

def replace_layers_with_custom(model, linear_degree=55, embedding_degree=55):
    """
    Recursively replaces nn.Linear and nn.Embedding layers with CustomLinear
    and CustomEmbedding layers, copying the weights and setting the degrees.
    """
    for name, module in model.named_children():
        # Replace nn.Linear with CustomLinear
        if isinstance(module, nn.Linear) and (name == 'query' or name == 'value' or name =="key"):
            custom_linear = custom_linear = PropulsionLinear(module.in_features, module.out_features, module.bias is not None, degree=linear_degree)
            custom_linear.linear.weight = nn.Parameter(module.weight.data.clone())
            if module.bias is not None:
                custom_linear.linear.bias = nn.Parameter(module.bias.data.clone())
            setattr(model, name, custom_linear)
        # Replace nn.Embedding with CustomEmbedding
        elif isinstance(module, nn.Embedding) and (name == 'dquery' or name == 'dvalue'):
            custom_embedding = PropulsionEmbedding(num_embeddings=module.num_embeddings, embedding_dim=module.embedding_dim, padding_idx=module.padding_idx, degree=embedding_degree)
            custom_embedding.embeddings.weight = nn.Parameter(module.weight.data.clone())
            setattr(model, name, custom_embedding)

        else:
            # Recursively apply this function to children modules
            replace_layers_with_custom(module, linear_degree=linear_degree, embedding_degree=embedding_degree)


# Load a pretrained BERT model
#model = BertModel.from_pretrained('bert-base-uncased')


# Load the pre-trained model
model = RobertaForSequenceClassification.from_pretrained(model_name, config = config)
replace_layers_with_custom(model)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Freeze all layers by default
for name, param in model.named_parameters():
    param.requires_grad = False

# Unfreeze specific layers by name
for name, param in model.named_parameters():
    if 'intermediate.dense.propulsion' in name or 'classifier' in name:
        param.requires_grad = True



# Count of trainable parameters
total_trainable_params = 0
total =  0
# Print trainable parameters and count their total number
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Parameter name: {name}, Shape: {param.shape}")
        
        total_trainable_params += param.numel()
    total+=param.numel()

print(f"Total trainable parameters:{total_trainable_params}, percentage:  {total_trainable_params/total}")

#677378

Parameter name: classifier.dense.weight, Shape: torch.Size([768, 768])
Parameter name: classifier.dense.bias, Shape: torch.Size([768])
Parameter name: classifier.out_proj.weight, Shape: torch.Size([1, 768])
Parameter name: classifier.out_proj.bias, Shape: torch.Size([1])
Total trainable parameters:591361, percentage:  0.004743256553735573


In [6]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr
def compute_metrics(pred):
    preds = np.squeeze(pred.predictions)
    return {"MSE": ((preds - pred.label_ids) ** 2).mean().item(),
            "RMSE": (np.sqrt (( (preds - pred.label_ids) ** 2).mean())).item(),
            "MAE": (np.abs(preds - pred.label_ids)).mean().item(),
            "Pearson" : pearsonr(preds,pred.label_ids)[0],
            "Spearman's Rank":spearmanr(preds,pred.label_ids)[0]
           }

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='qnli_dir',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.0,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=100,
    logging_steps=100,
   
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],

    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-08-17 23:29:44,953] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/nprottas/miniconda3/envs/up/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Step,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
100,7.1721,3.954603,3.954603,1.988618,1.656765,-0.091175,-0.111
200,3.7287,2.443684,2.443684,1.563229,1.348247,-0.092732,-0.121381
300,2.8084,2.285645,2.285645,1.511835,1.303083,-0.093683,-0.13203
400,2.5707,2.332458,2.332458,1.527239,1.298349,-0.089221,-0.127007
500,2.5705,2.307018,2.307018,1.518887,1.295494,-0.083525,-0.123462


In [None]:

trainer.train()

Step,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
100,0.412,0.523347,0.523347,0.723427,0.544805,0.897757,0.897671
200,0.4053,0.51771,0.51771,0.719521,0.539356,0.893852,0.89636
300,0.4248,0.533262,0.533262,0.730248,0.550391,0.895557,0.897318
400,0.4182,0.480085,0.480085,0.692881,0.525105,0.897591,0.89712
500,0.4072,0.477549,0.477549,0.691049,0.515289,0.897742,0.896342
600,0.4078,0.557626,0.557626,0.746743,0.558162,0.897534,0.895501
700,0.4013,0.532,0.532,0.729383,0.55229,0.896476,0.893685
800,0.348,0.484052,0.484052,0.695739,0.519285,0.896699,0.896244
900,0.3707,0.451589,0.451589,0.672003,0.498328,0.89796,0.897129
1000,0.4144,0.471176,0.471176,0.686423,0.505446,0.89862,0.896015


TrainOutput(global_step=3600, training_loss=0.3436892943912082, metrics={'train_runtime': 305.3379, 'train_samples_per_second': 188.283, 'train_steps_per_second': 11.79, 'total_flos': 1876859850634716.0, 'train_loss': 0.3436892943912082, 'epoch': 10.0})