In [1]:
import os
# Restrict this process to GPU index 3. The assignment is placed before
# `import torch` below; keep that order so the restriction is already set
# when the deep-learning libraries are loaded.
# NOTE(review): the GPU index is machine-specific — adjust for your host.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


import numpy as np
import torch
# NOTE(review): `load_metric`, `AutoModelForQuestionAnswering`, `Dataset` and
# `logging` are not used by any cell visible in this notebook; confirm that
# the installed `datasets` version still exports `load_metric`.
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging

# Duplicate of the `load_dataset` import above.
from datasets import load_dataset

# Load the "stsb" configuration of the "glue" dataset; later cells index the
# result with the "train" and "validation" splits.
raw_datasets = load_dataset("glue", 'stsb')




In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


# Checkpoint identifier shared by the tokenizer, the configuration and the
# model loaded in a later cell.
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the checkpoint's configuration and set the two dropout rates on it.
# NOTE(review): in the cells visible here `config` is never handed to the
# model constructor, so these overrides may have no effect — confirm.
config = AutoConfig.from_pretrained(model_name)
for _option, _value in (
    ("hidden_dropout_prob", 0.1),
    ("attention_probs_dropout_prob", 0.01),
):
    setattr(config, _option, _value)



In [3]:
from transformers import AutoTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pad on the right. RoBERTa's tokenizer already defines its own padding token
# and already appends the end-of-sequence token, so neither is overridden here
# (the previous `pad_token = eos_token` / `add_eos_token` lines were leftovers
# from a tokenizer without a pad token and replaced RoBERTa's real pad token).
tokenizer.padding_side = 'right'

# Raw text columns that are no longer needed once the pairs are tokenized.
col_to_delete = ['sentence1', 'sentence2']


def preprocessing_function(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: A batch from the dataset providing the 'sentence1' and
            'sentence2' columns.

    Returns:
        The tokenizer output for the pairs, truncated to at most 512 tokens.
        No padding is applied here; batches are padded by the data collator.
    """
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        max_length=512,
    )


# Tokenize every split and drop the raw text columns in the same pass.
tokenized_dataset = raw_datasets.map(
    preprocessing_function,
    batched=True,
    remove_columns=col_to_delete,
)
tokenized_dataset.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [4]:



import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers.activations import ACT2FN

# Define the custom linear layer
class PropulsionLinear(nn.Module):
    """Linear layer whose output is rescaled by a learnable per-feature vector.

    The forward pass computes ``linear(x) * propulsion ** degree``.
    ``propulsion`` holds one value per output feature and is initialised to
    ones, so a freshly constructed layer returns exactly what the wrapped
    ``nn.Linear`` returns.

    Args:
        input_features: Size of each input sample (``nn.Linear`` in_features).
        output_features: Size of each output sample (``nn.Linear`` out_features).
        bias: Whether the wrapped ``nn.Linear`` has a bias term.
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Forwarded unchanged to ``nn.Linear``.

    Attributes:
        linear: The wrapped ``nn.Linear``.
        propulsion: Learnable scale vector of shape ``(output_features,)``.
        push: Detached copy of ``propulsion ** degree`` from the most recent
            forward pass (``None`` before the first call).
    """

    def __init__(self, input_features, output_features, bias=True, degree=15, **kwargs):
        super().__init__()
        # Initialize the underlying nn.Linear with both the specified arguments and any additional kwargs
        self.linear = nn.Linear(input_features, output_features, bias=bias, **kwargs)
        self.propulsion = nn.Parameter(torch.ones(output_features))
        self.degree = degree
        self.push = None

    def forward(self, x):
        """Apply the linear map, then scale each output feature."""
        push = torch.pow(self.propulsion, self.degree)
        # Keep only a detached copy on the module: storing the graph-attached
        # tensor would hold on to the autograd graph between steps and make
        # copy.deepcopy(module) fail (non-leaf tensors cannot be deep-copied).
        self.push = push.detach()
        return self.linear(x) * push
    
class PropulsionEmbedding(nn.Module):
    """Embedding layer whose output is rescaled by a learnable per-dimension vector.

    The forward pass computes ``embeddings(x) * propulsion ** degree``.
    ``propulsion`` has ``embedding_dim`` entries initialised to ones, so a
    freshly constructed layer returns exactly what the wrapped
    ``nn.Embedding`` returns.

    Args:
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Forwarded unchanged to ``nn.Embedding``; must include
            ``embedding_dim`` as a keyword.

    Attributes:
        embeddings: The wrapped ``nn.Embedding``.
        propulsion: Learnable scale vector of shape ``(embedding_dim,)``.
        push: Detached copy of ``propulsion ** degree`` from the most recent
            forward pass (``None`` before the first call).
    """

    def __init__(self, degree=15, **kwargs):
        super().__init__()
        # Initialize the embedding layer with kwargs passed to the constructor
        self.embeddings = nn.Embedding(**kwargs)
        # embedding_dim is read from kwargs to size the propulsion vector
        self.propulsion = nn.Parameter(torch.ones(kwargs['embedding_dim']))
        self.degree = degree
        self.push = None

    @property
    def weight(self):
        """The weight matrix of the wrapped ``nn.Embedding``."""
        return self.embeddings.weight

    def forward(self, x):
        """Look up the embeddings for ``x``, then scale each embedding dimension."""
        push = torch.pow(self.propulsion, self.degree)
        # Keep only a detached copy on the module: storing the graph-attached
        # tensor would hold on to the autograd graph between steps and make
        # copy.deepcopy(module) fail (non-leaf tensors cannot be deep-copied).
        self.push = push.detach()
        return self.embeddings(x) * push




class PropulsionLayerNorm(nn.Module):
    """Layer normalisation whose output is rescaled by a learnable vector.

    The forward pass computes ``layer_norm(x) * propulsion ** degree``.
    ``propulsion`` has shape ``normalized_shape`` and is initialised to ones,
    so a freshly constructed layer returns exactly what the wrapped
    ``nn.LayerNorm`` returns.

    Args:
        normalized_shape: Shape passed to ``nn.LayerNorm`` and used to size
            ``propulsion``.
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Forwarded unchanged to ``nn.LayerNorm``.

    Attributes:
        layer_norm: The wrapped ``nn.LayerNorm``.
        propulsion: Learnable scale tensor of shape ``normalized_shape``.
        push: Detached copy of ``propulsion ** degree`` from the most recent
            forward pass (``None`` before the first call).
    """

    def __init__(self, normalized_shape, degree=1, **kwargs):
        super().__init__()
        self.layer_norm = nn.LayerNorm(normalized_shape, **kwargs)
        self.propulsion = nn.Parameter(torch.ones(normalized_shape))
        self.degree = degree
        self.push = None

    def forward(self, x):
        """Normalise ``x``, then scale the normalised values."""
        push = torch.pow(self.propulsion, self.degree)
        # Keep only a detached copy on the module: storing the graph-attached
        # tensor would hold on to the autograd graph between steps and make
        # copy.deepcopy(module) fail (non-leaf tensors cannot be deep-copied).
        self.push = push.detach()
        return self.layer_norm(x) * push

def replace_layers_with_custom(model, linear_degree=55, embedding_degree=55):
    """Swap ``nn.Linear`` / ``nn.Embedding`` children for propulsion wrappers, in place.

    Walks ``model`` recursively. Every ``nn.Linear`` becomes a
    ``PropulsionLinear`` and every ``nn.Embedding`` becomes a
    ``PropulsionEmbedding``. Each wrapper receives a copy of the original
    weights (and bias), keeps their ``requires_grad`` flag, and is moved to
    the original layer's device and dtype so that the new ``propulsion``
    vector lives alongside the copied weights.

    Modules that are already propulsion wrappers are skipped, so calling the
    function again on the same model does not wrap the inner layers a second
    time.

    Args:
        model: Module whose children are replaced. It is modified in place.
        linear_degree: ``degree`` given to every new ``PropulsionLinear``.
        embedding_degree: ``degree`` given to every new ``PropulsionEmbedding``.

    Returns:
        None.
    """
    already_wrapped = (PropulsionLinear, PropulsionEmbedding, PropulsionLayerNorm)

    for name, module in model.named_children():
        if isinstance(module, already_wrapped):
            # Descending here would find the wrapper's inner nn.Linear /
            # nn.Embedding and wrap it again.
            continue

        if isinstance(module, nn.Linear):
            custom_linear = PropulsionLinear(
                module.in_features,
                module.out_features,
                module.bias is not None,
                degree=linear_degree,
            )
            custom_linear.linear.weight = nn.Parameter(
                module.weight.data.clone(),
                requires_grad=module.weight.requires_grad,
            )
            if module.bias is not None:
                custom_linear.linear.bias = nn.Parameter(
                    module.bias.data.clone(),
                    requires_grad=module.bias.requires_grad,
                )
            # The propulsion vector is created on the default device/dtype;
            # align it with the weights that were just copied.
            custom_linear.to(device=module.weight.device, dtype=module.weight.dtype)
            setattr(model, name, custom_linear)

        elif isinstance(module, nn.Embedding):
            custom_embedding = PropulsionEmbedding(
                num_embeddings=module.num_embeddings,
                embedding_dim=module.embedding_dim,
                padding_idx=module.padding_idx,
                max_norm=module.max_norm,
                norm_type=module.norm_type,
                scale_grad_by_freq=module.scale_grad_by_freq,
                sparse=module.sparse,
                degree=embedding_degree,
            )
            custom_embedding.embeddings.weight = nn.Parameter(
                module.weight.data.clone(),
                requires_grad=module.weight.requires_grad,
            )
            custom_embedding.to(device=module.weight.device, dtype=module.weight.dtype)
            setattr(model, name, custom_embedding)

        else:
            # Recursively apply this function to children modules
            replace_layers_with_custom(
                module,
                linear_degree=linear_degree,
                embedding_degree=embedding_degree,
            )


# Load the pretrained RoBERTa checkpoint with a single-output classification
# head (num_labels=1), then swap its Linear/Embedding layers for the
# propulsion wrappers in place, spelling out the degrees that are used.
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)
replace_layers_with_custom(model, linear_degree=55, embedding_degree=55)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Train only the propulsion vectors and the classification head: a parameter
# is trainable exactly when its name contains one of these substrings, and
# every other parameter is frozen.
trainable_keywords = ('propulsion', 'classifier')

# Count of trainable parameters
total_trainable_params = 0
total = 0
# Set the flag, print trainable parameters and count their total number
for name, param in model.named_parameters():
    param.requires_grad = any(keyword in name for keyword in trainable_keywords)
    if param.requires_grad:
        print(f"Parameter name: {name}, Shape: {param.shape}")
        total_trainable_params += param.numel()
    total += param.numel()

# Report the share as a real percentage (the raw fraction used to be printed
# under the "percentage" label); guard against a model without parameters.
trainable_share = total_trainable_params / total if total else 0.0
print(f"Total trainable parameters:{total_trainable_params}, percentage:  {trainable_share:.4%}")

# 677378 (count noted by the original author)

Parameter name: roberta.embeddings.word_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.position_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.token_type_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.key.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.value.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.intermediate.dense.propulsion, Shape: torch.Size([3072])
Parameter name: roberta.encoder.layer.0.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention

In [6]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr
def compute_metrics(pred):
    """Compute regression metrics for a batch of predictions.

    Predictions and labels are both flattened to one dimension before they
    are compared. This keeps an ``(N, 1)`` prediction array from being
    broadcast against ``(N, 1)`` labels into an ``(N, N)`` error matrix, and
    keeps a single-sample evaluation one-dimensional.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with the keys "MSE", "RMSE", "MAE", "Pearson" and
        "Spearman's Rank", each mapped to a Python float.
    """
    predictions = pred.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    preds = np.asarray(predictions).reshape(-1)
    labels = np.asarray(pred.label_ids).reshape(-1)

    errors = preds - labels
    mse = (errors ** 2).mean()

    return {"MSE": mse.item(),
            "RMSE": np.sqrt(mse).item(),
            "MAE": np.abs(errors).mean().item(),
            "Pearson": float(pearsonr(preds, labels)[0]),
            "Spearman's Rank": float(spearmanr(preds, labels)[0])
           }

In [7]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the run: 10 epochs, batches of 16 for both training
# and evaluation, logging and checkpointing every 100 steps, at most two
# checkpoints kept, and the best model loaded when training ends.
training_options = dict(
    output_dir='qnli_dir',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.0,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=100,
    logging_steps=100,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_options)

# Train on the "train" split, evaluate on "validation"; batches are padded by
# the data collator and scored with compute_metrics.
trainer = Trainer(
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-08-17 23:50:08,682] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/nprottas/miniconda3/envs/up/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Step,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
100,5.3347,2.478434,2.478434,1.574304,1.293312,0.200492,0.178777
200,2.0597,2.123234,2.123234,1.457132,1.186612,0.472026,0.472459
300,1.2926,0.814945,0.814945,0.902743,0.738605,0.81577,0.818754
400,0.8392,0.787699,0.787699,0.887524,0.715031,0.84434,0.84663
500,0.7346,0.655569,0.655569,0.809672,0.640056,0.85484,0.856543
600,0.6455,0.590536,0.590536,0.768463,0.598398,0.868941,0.872503
700,0.6268,0.600278,0.600278,0.774776,0.601503,0.867985,0.870145
800,0.5462,0.684404,0.684404,0.827287,0.645093,0.875111,0.878117
900,0.5377,0.551606,0.551606,0.742702,0.56401,0.876764,0.880155
1000,0.5541,0.576729,0.576729,0.759427,0.582932,0.88192,0.882295


TrainOutput(global_step=3600, training_loss=0.6667604849073622, metrics={'train_runtime': 321.2296, 'train_samples_per_second': 178.969, 'train_steps_per_second': 11.207, 'total_flos': 1876859850634716.0, 'train_loss': 0.6667604849073622, 'epoch': 10.0})

In [8]:

# Second call to train() on the same Trainer. Nothing between the two cells
# reloads or resets `model`, so this run continues from whatever weights the
# previous call left in it rather than from the pretrained checkpoint.
# NOTE(review): the metrics shown for this cell therefore reflect both runs
# combined — confirm this is intended before reporting them.
trainer.train()

Step,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
100,0.4053,0.487045,0.487045,0.697886,0.524215,0.89815,0.897228
200,0.3865,0.530583,0.530583,0.728411,0.558373,0.893471,0.896118
300,0.4115,0.512278,0.512278,0.715736,0.54467,0.897144,0.89881
400,0.4004,0.562799,0.562799,0.750199,0.58156,0.900187,0.900436
500,0.3742,0.482679,0.482679,0.694751,0.523241,0.899293,0.899643
600,0.3675,0.494487,0.494487,0.703197,0.536856,0.901255,0.900868
700,0.3642,0.519585,0.519585,0.720822,0.55199,0.900915,0.899463
800,0.3187,0.50071,0.50071,0.707608,0.544365,0.900197,0.901011
900,0.3588,0.441971,0.441971,0.664809,0.502411,0.901081,0.901539
1000,0.3803,0.460592,0.460592,0.678669,0.51377,0.899814,0.898845


TrainOutput(global_step=3600, training_loss=0.32084823767344156, metrics={'train_runtime': 310.9509, 'train_samples_per_second': 184.884, 'train_steps_per_second': 11.577, 'total_flos': 1876859850634716.0, 'train_loss': 0.32084823767344156, 'epoch': 10.0})