In [1]:
import os

# Environment variables are set before the heavy libraries are imported:
# the Transformers cache location is resolved when `transformers` is imported,
# so assigning TRANSFORMERS_CACHE afterwards has no effect on where models are cached.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TRANSFORMERS_CACHE'] = '/data3/NJ/cach'

import logging

import numpy as np
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset

# Download (or load from cache) the GLUE MNLI dataset.
# NOTE(review): this cache path is relative and starts with "P/", unlike the
# absolute '/data3/NJ/cach' used for TRANSFORMERS_CACHE — looks like a typo; confirm.
raw_datasets = load_dataset("glue", 'mnli', cache_dir="P/data3/NJ/cach")


In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


model_name = "FacebookAI/roberta-base"

# Overrides applied on top of the checkpoint's stock configuration:
# both dropout probabilities are zeroed and the head is sized for three labels.
config_overrides = {
    "hidden_dropout_prob": 0.0,
    "attention_probs_dropout_prob": 0.0,
    "num_labels": 3,
}

config = AutoConfig.from_pretrained(model_name)
for attribute, value in config_overrides.items():
    setattr(config, attribute, value)

tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = DebertaV2ForQuestionAnswering.from_pretrained("/data2/nusrat/work/bert-finetuned-squad_2/checkpoint-47500")



In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [4]:
from transformers import AutoTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pad on the right. The tokenizer's own pad token is kept as-is: the previous
# `pad_token = eos_token` / `add_eos_token` lines were leftovers from a
# decoder-only (LLaMA-style) setup and are not needed for RoBERTa, whose
# tokenizer already defines a dedicated padding token.
tokenizer.padding_side = 'right'


def preprocessing_function(examples):
    """Tokenize a batch of examples as (premise, hypothesis) sentence pairs.

    Args:
        examples: A batch from the dataset with 'premise' and 'hypothesis' columns.

    Returns:
        The tokenizer output for the batch. Pairs longer than the tokenizer's
        maximum length are truncated so they cannot overflow the model's
        position embeddings; padding is deferred to the data collator.
    """
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True)


tokenized_dataset = raw_datasets.map(preprocessing_function, batched=True)
tokenized_dataset.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:



import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers.activations import ACT2FN

# Define the custom linear layer
class PropulsionLinear(nn.Module):
    """``nn.Linear`` whose output is rescaled element-wise by ``propulsion ** degree``.

    ``propulsion`` is a learnable vector with one entry per output feature,
    initialised to ones, so a freshly built layer returns exactly what the
    wrapped ``nn.Linear`` returns. The scale is applied after the bias is
    added, so the bias contribution is scaled as well.

    Args:
        input_features: Number of input features of the wrapped linear layer.
        output_features: Number of output features of the wrapped linear layer.
        bias: Whether the wrapped linear layer has a bias term.
        degree: Exponent applied to ``propulsion`` before scaling the output.
        **kwargs: Forwarded to ``nn.Linear`` (e.g. ``device``, ``dtype``).
    """

    def __init__(self, input_features, output_features, bias=True, degree=15, **kwargs):
        super().__init__()
        # Initialize the underlying nn.Linear with both the specified arguments and any additional kwargs
        self.linear = nn.Linear(input_features, output_features, bias=bias, **kwargs)
        # Build the scaling vector on the same device / dtype that was requested
        # for the wrapped layer; otherwise the two would silently disagree.
        self.propulsion = nn.Parameter(
            torch.ones(output_features, device=kwargs.get("device"), dtype=kwargs.get("dtype"))
        )
        self.degree = degree

    def forward(self, x):
        """Apply the wrapped linear layer and scale its output by ``propulsion ** degree``."""
        # Stored on the module so the most recent scale remains inspectable.
        self.push = torch.pow(self.propulsion, self.degree)
        # Out-of-place multiply: avoids mutating a tensor that is part of the autograd graph.
        return self.linear(x) * self.push
    
class PropulsionEmbedding(nn.Module):
    """``nn.Embedding`` whose looked-up vectors are rescaled by ``propulsion ** degree``.

    ``propulsion`` is a learnable vector with one entry per embedding
    dimension, initialised to ones.

    Args:
        degree: Exponent applied to ``propulsion`` before scaling the embeddings.
        **kwargs: Forwarded to ``nn.Embedding``; must include ``num_embeddings``
            and ``embedding_dim`` and may include e.g. ``padding_idx``,
            ``device`` or ``dtype``.
    """

    def __init__(self, degree=15, **kwargs):
        super().__init__()
        # Initialize the embedding layer with kwargs passed to the constructor
        self.embeddings = nn.Embedding(**kwargs)
        # One scale per embedding dimension, created on the same device / dtype
        # that was requested for the embedding table.
        self.propulsion = nn.Parameter(
            torch.ones(
                self.embeddings.embedding_dim,
                device=kwargs.get("device"),
                dtype=kwargs.get("dtype"),
            )
        )
        self.degree = degree

    @property
    def weight(self):
        """The wrapped embedding table, exposed under the name ``nn.Embedding`` uses."""
        return self.embeddings.weight

    def forward(self, x):
        """Look up ``x`` in the embedding table and scale the result by ``propulsion ** degree``."""
        # Stored on the module so the most recent scale remains inspectable.
        self.push = torch.pow(self.propulsion, self.degree)
        return self.embeddings(x) * self.push




class PropulsionLayerNorm(nn.Module):
    """``nn.LayerNorm`` whose output is rescaled element-wise by ``propulsion ** degree``.

    ``propulsion`` is a learnable tensor of shape ``normalized_shape``,
    initialised to ones.

    Args:
        normalized_shape: Shape passed to ``nn.LayerNorm``; also the shape of ``propulsion``.
        degree: Exponent applied to ``propulsion`` before scaling the output.
        **kwargs: Forwarded to ``nn.LayerNorm`` (e.g. ``eps``, ``device``, ``dtype``).
    """

    def __init__(self, normalized_shape, degree=1, **kwargs):
        super().__init__()
        self.layer_norm = nn.LayerNorm(normalized_shape, **kwargs)
        # Match the device / dtype requested for the wrapped layer norm.
        self.propulsion = nn.Parameter(
            torch.ones(normalized_shape, device=kwargs.get("device"), dtype=kwargs.get("dtype"))
        )
        self.degree = degree

    def forward(self, x):
        """Normalize ``x`` and scale the result by ``propulsion ** degree``."""
        # Stored on the module so the most recent scale remains inspectable.
        self.push = torch.pow(self.propulsion, self.degree)
        return self.layer_norm(x) * self.push

def replace_layers_with_custom(model, linear_degree=35, embedding_degree=35):
    """
    Recursively replace nn.Linear and nn.Embedding layers inside ``model`` with
    PropulsionLinear and PropulsionEmbedding layers, copying the weights and
    setting the degrees. The model is modified in place; nothing is returned.

    Modules that are already propulsion wrappers are left untouched, so calling
    this function twice on the same model does not wrap the inner layers again.

    Args:
        model: The ``nn.Module`` whose children are rewritten.
        linear_degree: ``degree`` given to every PropulsionLinear created.
        embedding_degree: ``degree`` given to every PropulsionEmbedding created.
    """
    already_wrapped = (PropulsionLinear, PropulsionEmbedding, PropulsionLayerNorm)

    for name, module in model.named_children():
        if isinstance(module, already_wrapped):
            # Descending here would find the wrapper's inner nn.Linear /
            # nn.Embedding and wrap it a second time.
            continue

        # Replace nn.Linear with PropulsionLinear
        if isinstance(module, nn.Linear):
            custom_linear = PropulsionLinear(
                module.in_features,
                module.out_features,
                module.bias is not None,
                degree=linear_degree,
            )
            custom_linear.linear.weight = nn.Parameter(module.weight.data.clone())
            if module.bias is not None:
                custom_linear.linear.bias = nn.Parameter(module.bias.data.clone())
            # Keep the propulsion vector on the same device / dtype as the copied weights.
            custom_linear.to(device=module.weight.device, dtype=module.weight.dtype)
            setattr(model, name, custom_linear)

        # Replace nn.Embedding with PropulsionEmbedding
        elif isinstance(module, nn.Embedding):
            custom_embedding = PropulsionEmbedding(
                num_embeddings=module.num_embeddings,
                embedding_dim=module.embedding_dim,
                padding_idx=module.padding_idx,
                # Carry over the remaining lookup options so behaviour is unchanged.
                max_norm=module.max_norm,
                norm_type=module.norm_type,
                scale_grad_by_freq=module.scale_grad_by_freq,
                sparse=module.sparse,
                degree=embedding_degree,
            )
            custom_embedding.embeddings.weight = nn.Parameter(module.weight.data.clone())
            custom_embedding.to(device=module.weight.device, dtype=module.weight.dtype)
            setattr(model, name, custom_embedding)

        else:
            # Recursively apply this function to children modules
            replace_layers_with_custom(module, linear_degree=linear_degree, embedding_degree=embedding_degree)


# Load the pre-trained RoBERTa sequence-classification model with the
# configuration prepared earlier, then swap its linear / embedding layers for
# their propulsion counterparts (degrees spelled out; they equal the defaults).
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config)
replace_layers_with_custom(model, linear_degree=35, embedding_degree=35)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Only parameters whose name contains 'propulsion' are trained; every other
# parameter is frozen.
# NOTE(review): this also freezes the weights of the newly initialised
# classifier head — only its propulsion vectors train. Confirm this is intended.
for name, param in model.named_parameters():
    param.requires_grad = 'propulsion' in name

# List every trainable tensor, then report the trainable share of all parameters.
trainable = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
for name, param in trainable:
    print(f"Parameter name: {name}, Shape: {param.shape}")

total_trainable_params = sum(param.numel() for _, param in trainable)
total = sum(param.numel() for _, param in model.named_parameters())

print(f"Total trainable parameters:{total_trainable_params}, percentage:  {total_trainable_params/total}")



Parameter name: roberta.embeddings.word_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.position_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.token_type_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.key.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.value.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.intermediate.dense.propulsion, Shape: torch.Size([3072])
Parameter name: roberta.encoder.layer.0.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention

In [7]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):
    """Compute macro precision / recall / F1 and accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            example is the argmax of its logits over the last axis.

    Returns:
        A dict with the keys "precision", "recall", "f1-score" and "accuracy".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # Macro-averaged scorers, keyed by the name reported to the Trainer.
    macro_scorers = {
        "precision": metrics.precision_score,
        "recall": metrics.recall_score,
        "f1-score": metrics.f1_score,
    }
    results = {
        key: scorer(references, predicted, average="macro")
        for key, scorer in macro_scorers.items()
    }
    results['accuracy'] = metrics.accuracy_score(references, predicted)
    return results

In [8]:
from transformers import TrainingArguments, Trainer

# Cadence, in optimizer steps, shared by logging and checkpointing.
STEP_INTERVAL = 500

training_args = TrainingArguments(
    output_dir='mnli_dir',
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-4,
    weight_decay=0.0005,
    lr_scheduler_type="cosine",  # You can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=500,
    # --- evaluation, logging and checkpointing ---
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=STEP_INTERVAL,
    save_steps=STEP_INTERVAL,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Train on the MNLI training split and evaluate on the matched validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation_matched"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-08-16 18:44:08,350] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/nprottas/miniconda3/envs/up/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
500,0.9116,0.594583,0.767674,0.762923,0.763268,0.764238
1000,0.5738,0.517519,0.803135,0.79979,0.800026,0.799898
1500,0.5267,0.485095,0.808861,0.808453,0.808562,0.809475
2000,0.497,0.469258,0.816838,0.816979,0.816684,0.817422
2500,0.4845,0.461597,0.822826,0.820793,0.820136,0.819969
3000,0.4641,0.442656,0.829348,0.827682,0.828023,0.82863
3500,0.4707,0.434057,0.83007,0.827278,0.827433,0.829954
4000,0.4487,0.431835,0.831159,0.830456,0.83019,0.830464
4500,0.4551,0.523697,0.812977,0.79959,0.795964,0.795925
5000,0.454,0.421275,0.835903,0.834264,0.834365,0.836373


TrainOutput(global_step=36816, training_loss=0.3945152704635945, metrics={'train_runtime': 12112.5009, 'train_samples_per_second': 97.264, 'train_steps_per_second': 3.04, 'total_flos': 5.611387454878543e+16, 'train_loss': 0.3945152704635945, 'epoch': 3.0})