In [1]:
import os

# Environment variables are exported BEFORE torch / transformers are imported:
# CUDA_VISIBLE_DEVICES must be set before CUDA is initialised, and transformers
# resolves its cache location while it is being imported, so assigning
# TRANSFORMERS_CACHE after the import has no effect.
CACHE_DIR = "/data3/NJ/cach"  # single cache root shared by datasets and transformers
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR

import logging

import numpy as np
import torch
# NOTE(review): load_metric, AutoModelForQuestionAnswering, Dataset and logging
# are not used by the cells visible here; they are kept in case later cells need them.
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

# GLUE / QQP: pairs of questions labelled as duplicate or not.
raw_datasets = load_dataset("glue", 'qqp', cache_dir=CACHE_DIR)

In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig

model_name = "FacebookAI/roberta-base"

# The tokenizer and the model configuration are both taken from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

# Switch off both dropout probabilities in the configuration.
for dropout_field in ("hidden_dropout_prob", "attention_probs_dropout_prob"):
    setattr(config, dropout_field, 0.0)
#config.num_labels=2
#model = DebertaV2ForQuestionAnswering.from_pretrained("/data2/nusrat/work/bert-finetuned-squad_2/checkpoint-47500")



In [3]:
# Display the DatasetDict summary: each split with its columns and row count.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 363846
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 40430
    })
    test: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 390965
    })
})

In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# No pad-token override is applied: the RoBERTa tokenizer defines its own
# padding token, and padding with a different id would disagree with the
# pad_token_id the model configuration uses.
tokenizer.padding_side = 'right'

# Raw text columns that are redundant once tokenised. They are NOT removed here
# so the dataset schema stays as it was; pass this list as `remove_columns=`
# to `map` if the text is no longer needed.
col_to_delete = ['question1', 'question2']


def preprocessing_function(examples):
    """Tokenise a batch of question pairs.

    Args:
        examples: Batch mapping that provides the 'question1' and 'question2'
            columns (lists of strings when called with ``batched=True``).

    Returns:
        The tokenizer output for the (question1, question2) pairs. Pairs longer
        than the tokenizer's maximum length are truncated instead of producing
        inputs the model cannot embed.
    """
    return tokenizer(examples['question1'], examples['question2'], truncation=True)


tokenized_dataset = raw_datasets.map(preprocessing_function, batched=True)
tokenized_dataset.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:



import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers.activations import ACT2FN

# Define the custom linear layer
class PropulsionLinear(nn.Module):
    """Linear layer whose output is rescaled per output feature.

    The forward pass computes ``linear(x) * propulsion ** degree``, where
    ``propulsion`` is a learnable vector of length ``output_features``
    initialised to ones. A freshly constructed layer therefore returns exactly
    what the wrapped ``nn.Linear`` returns.

    Args:
        input_features: Size of each input sample.
        output_features: Size of each output sample.
        bias: Whether the wrapped ``nn.Linear`` has a bias term.
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Forwarded unchanged to ``nn.Linear`` (e.g. ``device``, ``dtype``).
    """

    def __init__(self, input_features, output_features, bias=True, degree=15, **kwargs):
        super().__init__()
        self.linear = nn.Linear(input_features, output_features, bias=bias, **kwargs)
        # Build the scale vector on the same device / dtype as the wrapped
        # weight so that `device=` / `dtype=` kwargs apply to the whole layer.
        weight = self.linear.weight
        self.propulsion = nn.Parameter(
            torch.ones(output_features, dtype=weight.dtype, device=weight.device)
        )
        self.degree = degree

    def forward(self, x):
        """Return ``linear(x)`` scaled element-wise by ``propulsion ** degree``."""
        push = torch.pow(self.propulsion, self.degree)
        return torch.mul(self.linear(x), push)
    
class PropulsionEmbedding(nn.Module):
    """Embedding layer whose output is rescaled per embedding dimension.

    The forward pass computes ``embeddings(x) * propulsion ** degree``, where
    ``propulsion`` is a learnable vector of length ``embedding_dim``
    initialised to ones.

    Args:
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Keyword arguments for ``nn.Embedding``; ``num_embeddings``
            and ``embedding_dim`` must be supplied here by keyword.
    """

    def __init__(self, degree=15, **kwargs):
        super().__init__()
        self.embeddings = nn.Embedding(**kwargs)
        # Size, dtype and device are read from the constructed table so the
        # scale vector always matches it (including `device=` / `dtype=` kwargs).
        table = self.embeddings.weight
        self.propulsion = nn.Parameter(
            torch.ones(self.embeddings.embedding_dim, dtype=table.dtype, device=table.device)
        )
        self.degree = degree

    @property
    def weight(self):
        """Expose the wrapped embedding table under the usual ``weight`` name."""
        return self.embeddings.weight

    def forward(self, x):
        """Return ``embeddings(x)`` scaled element-wise by ``propulsion ** degree``."""
        push = torch.pow(self.propulsion, self.degree)
        return torch.mul(self.embeddings(x), push)




class PropulsionLayerNorm(nn.Module):
    """LayerNorm whose output is rescaled element-wise.

    The forward pass computes ``layer_norm(x) * propulsion ** degree``, where
    ``propulsion`` is a learnable tensor of shape ``normalized_shape``
    initialised to ones.

    Args:
        normalized_shape: Shape handed to ``nn.LayerNorm``.
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Forwarded unchanged to ``nn.LayerNorm``.
    """

    def __init__(self, normalized_shape, degree=1, **kwargs):
        super().__init__()
        self.layer_norm = nn.LayerNorm(normalized_shape, **kwargs)
        # Honour `device=` / `dtype=` when they are given so the scale tensor
        # lives alongside the normalisation parameters.
        self.propulsion = nn.Parameter(
            torch.ones(
                self.layer_norm.normalized_shape,
                dtype=kwargs.get("dtype"),
                device=kwargs.get("device"),
            )
        )
        self.degree = degree

    def forward(self, x):
        """Return ``layer_norm(x)`` scaled element-wise by ``propulsion ** degree``."""
        # Kept local: storing this graph-attached tensor on the module would
        # retain the autograd graph between calls.
        push = torch.pow(self.propulsion, self.degree)
        return self.layer_norm(x) * push

def _match_device_and_dtype(wrapper, reference):
    """Move `wrapper` to the device (and floating dtype) of the tensor `reference`."""
    target = {"device": reference.device}
    if reference.is_floating_point():
        target["dtype"] = reference.dtype
    return wrapper.to(**target)


def replace_layers_with_custom(model, linear_degree=55, embedding_degree=55):
    """Swap nn.Linear / nn.Embedding sub-modules for their Propulsion wrappers, in place.

    Walks the module tree recursively. Each ``nn.Linear`` becomes a
    ``PropulsionLinear`` and each ``nn.Embedding`` becomes a
    ``PropulsionEmbedding``; the original weights (and bias, when present) are
    cloned into the wrapper. Sub-modules that are already Propulsion wrappers
    are skipped, so calling the function again on the same model does not wrap
    the wrappers' inner layers a second time.

    Args:
        model: Module whose children are replaced in place.
        linear_degree: ``degree`` given to every ``PropulsionLinear``.
        embedding_degree: ``degree`` given to every ``PropulsionEmbedding``.

    Returns:
        None. ``model`` is modified in place.
    """
    for name, module in model.named_children():
        if isinstance(module, (PropulsionLinear, PropulsionEmbedding)):
            # Already wrapped: recursing would find the inner nn.Linear /
            # nn.Embedding and nest another wrapper around it.
            continue

        if isinstance(module, nn.Linear):
            custom_linear = PropulsionLinear(
                module.in_features,
                module.out_features,
                module.bias is not None,
                degree=linear_degree,
            )
            custom_linear.linear.weight = nn.Parameter(module.weight.data.clone())
            if module.bias is not None:
                custom_linear.linear.bias = nn.Parameter(module.bias.data.clone())
            # Keep the new propulsion vector on the same device / dtype as the copied weights.
            _match_device_and_dtype(custom_linear, module.weight)
            setattr(model, name, custom_linear)

        elif isinstance(module, nn.Embedding):
            custom_embedding = PropulsionEmbedding(
                num_embeddings=module.num_embeddings,
                embedding_dim=module.embedding_dim,
                padding_idx=module.padding_idx,
                # Carry over the remaining lookup options so the wrapper
                # behaves like the layer it replaces.
                max_norm=module.max_norm,
                norm_type=module.norm_type,
                scale_grad_by_freq=module.scale_grad_by_freq,
                sparse=module.sparse,
                degree=embedding_degree,
            )
            custom_embedding.embeddings.weight = nn.Parameter(module.weight.data.clone())
            _match_device_and_dtype(custom_embedding, module.weight)
            setattr(model, name, custom_embedding)

        else:
            # Any other container / layer: descend into its children.
            replace_layers_with_custom(module, linear_degree=linear_degree, embedding_degree=embedding_degree)


# Build RoBERTa with a sequence-classification head from the `model_name`
# checkpoint, using the customised `config`.
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)

# Replace the nn.Linear / nn.Embedding sub-modules with their Propulsion
# wrappers (in place, default degrees).
replace_layers_with_custom(model)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Train only the propulsion vectors: a parameter is trainable exactly when its
# name contains 'propulsion'; every other parameter is frozen.
# NOTE(review): this also freezes the newly initialised classifier.* weights
# reported when the model was loaded — confirm that is intended.
total_trainable_params = 0
total = 0
for name, param in model.named_parameters():
    param.requires_grad = 'propulsion' in name
    if param.requires_grad:
        print(f"Parameter name: {name}, Shape: {param.shape}")
        total_trainable_params += param.numel()
    total += param.numel()

# Report the share as a real percentage (0-100) and avoid dividing by zero
# when the model exposes no parameters.
trainable_share = total_trainable_params / total if total else 0.0
print(f"Total trainable parameters:{total_trainable_params}, percentage:  {100 * trainable_share:.4f}%")



Parameter name: roberta.embeddings.word_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.position_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.token_type_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.key.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.value.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.intermediate.dense.propulsion, Shape: torch.Size([3072])
Parameter name: roberta.encoder.layer.0.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention

In [7]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            arg-max of ``logits`` over the last axis.

    Returns:
        dict with the keys 'precision', 'recall', 'f1-score' (all
        macro-averaged) and 'accuracy', computed with ``sklearn.metrics``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # The three macro-averaged scorers share one call signature.
    macro_scorers = {
        "precision": metrics.precision_score,
        "recall": metrics.recall_score,
        "f1-score": metrics.f1_score,
    }
    results = {
        key: scorer(references, predicted, average="macro")
        for key, scorer in macro_scorers.items()
    }
    results['accuracy'] = metrics.accuracy_score(references, predicted)
    return results

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='qqp_dir_degree_5',
    # --- optimisation ---
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-4,
    weight_decay=0.00,
    lr_scheduler_type="cosine",  # other choices include 'linear', 'cosine_with_restarts', 'polynomial'
    warmup_steps=500,
    # --- evaluation / checkpointing, all step-based ---
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Wire the model, the tokenised train / validation splits, the padding
# collator and the metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-08-17 15:10:28,773] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/nprottas/miniconda3/envs/up/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1000,0.4477,0.351154,0.827089,0.846198,0.832082,0.838363
2000,0.3497,0.377245,0.823039,0.847141,0.818797,0.821717
3000,0.3284,0.315336,0.845091,0.857041,0.849847,0.85768
4000,0.3258,0.35789,0.846534,0.801962,0.815841,0.838313
5000,0.3125,0.302404,0.855961,0.861647,0.858577,0.867326
6000,0.3078,0.304465,0.855667,0.866185,0.860064,0.867747
7000,0.3022,0.293811,0.858622,0.869117,0.863024,0.870566
8000,0.3015,0.294182,0.863398,0.862141,0.86276,0.87252
9000,0.2943,0.28731,0.863325,0.866792,0.86498,0.873757
10000,0.2874,0.289929,0.858727,0.872374,0.864003,0.870838


TrainOutput(global_step=22742, training_loss=0.28239974458952943, metrics={'train_runtime': 3531.2163, 'train_samples_per_second': 206.074, 'train_steps_per_second': 6.44, 'total_flos': 2.5514270509986144e+16, 'train_loss': 0.28239974458952943, 'epoch': 2.0})

In [9]:
# Second call to train() on the same Trainer object.
# NOTE(review): the logged training loss starts lower than in the first run,
# which suggests this continues from the already-trained in-memory weights
# rather than from scratch — confirm this is intended, because a fresh
# "Restart & Run All" would apply both runs back to back.
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1000,0.2581,0.29068,0.861848,0.872949,0.866455,0.873708
2000,0.2805,0.300886,0.852753,0.871794,0.858527,0.864259
3000,0.278,0.288636,0.86374,0.87507,0.868426,0.875538
4000,0.2832,0.311305,0.862241,0.836926,0.846618,0.86186
5000,0.2795,0.288246,0.86311,0.862082,0.86259,0.872323
6000,0.2788,0.308238,0.860882,0.870288,0.86493,0.872595
7000,0.2794,0.284637,0.863911,0.875947,0.868805,0.875761
8000,0.2773,0.278475,0.865931,0.873791,0.869428,0.877146
9000,0.2726,0.292498,0.867125,0.863503,0.865239,0.875216
10000,0.2711,0.283995,0.863131,0.881155,0.869122,0.874796


TrainOutput(global_step=22742, training_loss=0.2528213763989724, metrics={'train_runtime': 3525.628, 'train_samples_per_second': 206.401, 'train_steps_per_second': 6.45, 'total_flos': 2.5514270509986144e+16, 'train_loss': 0.2528213763989724, 'epoch': 2.0})