In [1]:
import os

# Restrict this process to GPU 3. Set before torch is imported so the setting is
# in place before any CUDA initialisation can happen.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# One cache location shared by `datasets` and `transformers`.
# NOTE(review): the original passed "P/data3/NJ/cach" to load_dataset and
# '/data3/NJ/cach' to TRANSFORMERS_CACHE; the leading "P" looks like a typo, so both
# now use the absolute path. Confirm this is the intended directory.
CACHE_DIR = "/data3/NJ/cach"

# transformers resolves its cache directory when the package is first imported,
# so the variable has to be exported before the import below to have any effect.
os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR

import logging

import numpy as np
import torch
from torch.utils.data import Dataset
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer


# GLUE CoLA (linguistic acceptability) dataset, all splits.
raw_datasets = load_dataset("glue", 'cola', cache_dir=CACHE_DIR)

In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


# Checkpoint name shared by the config and the tokenizer loaded below.
model_name = "FacebookAI/roberta-base"

# Load the checkpoint's configuration with the two dropout rates overridden at load
# time (keyword arguments matching config attributes replace the loaded values).
config = AutoConfig.from_pretrained(
    model_name,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.01,
)
#config.num_labels=2

tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = DebertaV2ForQuestionAnswering.from_pretrained("/data2/nusrat/work/bert-finetuned-squad_2/checkpoint-47500")



In [3]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pad on the right. The tokenizer's own pad token is kept: the previous
# `pad_token = eos_token` / `add_eos_token = True` overrides were carried over from
# a LLaMA-style setup and made the padding id disagree with the model's padding id.
tokenizer.padding_side = 'right'

# Raw CoLA columns that are not model inputs; dropped once the text is tokenized.
col_to_delete = ['sentence', 'idx']

def preprocessing_function(examples):
    """Tokenize a batch of CoLA sentences, truncating anything beyond 512 tokens."""
    return tokenizer(examples['sentence'], truncation=True, max_length=512)

# Tokenize every split in batches and keep only the label and the tokenizer outputs.
tokenized_dataset = raw_datasets.map(
    preprocessing_function,
    batched=True,
    remove_columns=col_to_delete,
)
tokenized_dataset.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [4]:



import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers.activations import ACT2FN

# Define the custom linear layer
class PropulsionLinear(nn.Module):
    """Linear layer whose output is rescaled per feature by ``propulsion ** degree``.

    Args:
        input_features: Input size, forwarded to ``nn.Linear``.
        output_features: Output size; also the length of the ``propulsion`` vector.
        bias: Whether the wrapped ``nn.Linear`` has a bias term.
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Extra keyword arguments forwarded to ``nn.Linear``. ``device`` and
            ``dtype``, when given, are also used for ``propulsion``.
    """

    def __init__(self, input_features, output_features, bias=True, degree=15, **kwargs):
        super().__init__()
        # Initialize the underlying nn.Linear with both the specified arguments and any additional kwargs
        self.linear = nn.Linear(input_features, output_features, bias=bias, **kwargs)
        # All ones, so the layer initially returns exactly what the wrapped nn.Linear returns.
        self.propulsion = nn.Parameter(
            torch.ones(output_features, device=kwargs.get("device"), dtype=kwargs.get("dtype"))
        )
        self.degree = degree

    @property
    def push(self):
        """Current per-feature scale, ``propulsion ** degree``."""
        return torch.pow(self.propulsion, self.degree)

    def forward(self, x):
        # The scale is recomputed on each call instead of being stored on the module:
        # a stored non-leaf tensor keeps the autograd graph alive between steps and
        # makes copy.deepcopy() of the module fail.
        return self.linear(x) * self.push
    
class PropulsionEmbedding(nn.Module):
    """Embedding layer whose output is rescaled per feature by ``propulsion ** degree``.

    Args:
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Keyword arguments forwarded to ``nn.Embedding``. ``embedding_dim``
            must be passed by keyword because it sizes the ``propulsion`` vector;
            ``device`` and ``dtype``, when given, are also used for ``propulsion``.

    Raises:
        KeyError: If ``embedding_dim`` is not among the keyword arguments.
    """

    def __init__(self, degree=15, **kwargs):
        super().__init__()
        # Initialize the embedding layer with kwargs passed to the constructor
        self.embeddings = nn.Embedding(**kwargs)
        # All ones, so the layer initially returns exactly what the wrapped nn.Embedding returns.
        self.propulsion = nn.Parameter(
            torch.ones(kwargs['embedding_dim'], device=kwargs.get("device"), dtype=kwargs.get("dtype"))
        )
        self.degree = degree

    @property
    def weight(self):
        """Weight matrix of the wrapped ``nn.Embedding``."""
        return self.embeddings.weight

    @property
    def push(self):
        """Current per-feature scale, ``propulsion ** degree``."""
        return torch.pow(self.propulsion, self.degree)

    def forward(self, x):
        # The scale is recomputed on each call instead of being stored on the module:
        # a stored non-leaf tensor keeps the autograd graph alive between steps and
        # makes copy.deepcopy() of the module fail.
        return self.embeddings(x) * self.push




class PropulsionLayerNorm(nn.Module):
    """LayerNorm whose output is rescaled element-wise by ``propulsion ** degree``.

    Args:
        normalized_shape: Shape forwarded to ``nn.LayerNorm``; ``propulsion`` is
            created with the same shape.
        degree: Exponent applied element-wise to ``propulsion``.
        **kwargs: Extra keyword arguments forwarded to ``nn.LayerNorm``. ``device``
            and ``dtype``, when given, are also used for ``propulsion``.
    """

    def __init__(self, normalized_shape, degree=1, **kwargs):
        super().__init__()
        self.layer_norm = nn.LayerNorm(normalized_shape, **kwargs)
        # All ones, so the layer initially returns exactly what the wrapped nn.LayerNorm returns.
        self.propulsion = nn.Parameter(
            torch.ones(normalized_shape, device=kwargs.get("device"), dtype=kwargs.get("dtype"))
        )
        self.degree = degree

    @property
    def push(self):
        """Current element-wise scale, ``propulsion ** degree``."""
        return torch.pow(self.propulsion, self.degree)

    def forward(self, x):
        # The scale is recomputed on each call instead of being stored on the module:
        # a stored non-leaf tensor keeps the autograd graph alive between steps and
        # makes copy.deepcopy() of the module fail.
        return self.layer_norm(x) * self.push

def replace_layers_with_custom(model, linear_degree=55, embedding_degree=55):
    """
    Recursively wrap every nn.Linear and nn.Embedding found under ``model`` in a
    PropulsionLinear / PropulsionEmbedding. The model is modified in place.

    The existing layer object is reused as the wrapper's inner layer, so its
    weights, ``requires_grad`` flags, any parameter sharing with other modules and
    all of its constructor options are preserved without copying tensors.

    Args:
        model: Module whose children are inspected and replaced.
        linear_degree: ``degree`` given to each PropulsionLinear.
        embedding_degree: ``degree`` given to each PropulsionEmbedding.

    Returns:
        None.
    """
    # Snapshot the children first: setattr() below edits the module registry.
    for name, module in list(model.named_children()):
        # Already wrapped (e.g. the function was called twice on the same model):
        # leave it alone instead of wrapping the inner layer a second time.
        if isinstance(module, (PropulsionLinear, PropulsionEmbedding, PropulsionLayerNorm)):
            continue

        if isinstance(module, nn.Linear):
            wrapper = PropulsionLinear(
                module.in_features,
                module.out_features,
                module.bias is not None,
                degree=linear_degree,
            )
            wrapper.linear = module
        elif isinstance(module, nn.Embedding):
            wrapper = PropulsionEmbedding(
                num_embeddings=module.num_embeddings,
                embedding_dim=module.embedding_dim,
                padding_idx=module.padding_idx,
                degree=embedding_degree,
            )
            wrapper.embeddings = module
        else:
            # Recursively apply this function to children modules
            replace_layers_with_custom(module, linear_degree=linear_degree, embedding_degree=embedding_degree)
            continue

        # Put the new propulsion vector on the same device (and floating dtype) as the
        # wrapped weights so the wrapper also works on a model already moved or cast.
        reference = module.weight
        if reference.is_floating_point():
            wrapper.to(device=reference.device, dtype=reference.dtype)
        else:
            wrapper.to(device=reference.device)

        setattr(model, name, wrapper)


# Load the pre-trained RoBERTa sequence-classification model named by `model_name`.
# NOTE(review): the `config` object edited in an earlier cell is not passed to
# from_pretrained() here, so its dropout overrides do not appear to reach this
# model — confirm whether `config=config` was intended.
model = RobertaForSequenceClassification.from_pretrained(model_name)
# Wrap the model's nn.Linear / nn.Embedding layers in place, using the function's
# default degrees (55 for both).
replace_layers_with_custom(model)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Train only the propulsion vectors: a parameter is trainable exactly when its name
# contains 'propulsion'; every other weight is frozen.
for name, param in model.named_parameters():
    param.requires_grad = 'propulsion' in name

# Count of trainable parameters
total_trainable_params = 0
total = 0
# Print trainable parameters and count their total number
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Parameter name: {name}, Shape: {param.shape}")
        total_trainable_params += param.numel()
    total += param.numel()

# Report the share as a real percentage (the value used to be printed as a raw ratio).
print(f"Total trainable parameters:{total_trainable_params}, percentage:  {100 * total_trainable_params / total:.4f}%")



Parameter name: roberta.embeddings.word_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.position_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.embeddings.token_type_embeddings.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.key.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.self.value.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.attention.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.0.intermediate.dense.propulsion, Shape: torch.Size([3072])
Parameter name: roberta.encoder.layer.0.output.dense.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention.self.query.propulsion, Shape: torch.Size([768])
Parameter name: roberta.encoder.layer.1.attention

In [6]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification metrics from a (logits, labels) pair.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax of
            the logits over the last axis.

    Returns:
        dict with macro-averaged "precision", "recall" and "f1-score", plus
        "accuracy" and "mcc" (Matthews correlation coefficient).
    """
    logits, labels = eval_pred  # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0: while the model still predicts a single class, the per-class
    # score of the never-predicted class is undefined. It is counted as 0 explicitly,
    # which is the value sklearn already falls back to, but without a warning per call.
    precision = metrics.precision_score(labels, predictions, average="macro", zero_division=0)
    recall = metrics.recall_score(labels, predictions, average="macro", zero_division=0)
    f1 = metrics.f1_score(labels, predictions, average="macro", zero_division=0)
    accuracy = metrics.accuracy_score(labels, predictions)
    mcc = metrics.matthews_corrcoef(labels, predictions)

    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy, 'mcc': mcc}


In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Checkpointing.
    # NOTE(review): the directory name says QNLI although the dataset loaded in this
    # notebook is GLUE CoLA — consider renaming.
    output_dir='qnli_dir',
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # NOTE(review): no metric_for_best_model is given, so the "best" checkpoint is
    # presumably chosen by evaluation loss rather than MCC — confirm this is intended.
    load_best_model_at_end=True,
    # Evaluation and logging cadence.
    evaluation_strategy="steps",
    logging_steps=100,
    # Batch sizes and run length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=7,
    # Optimisation.
    learning_rate=1e-4,
    weight_decay=0.0,
    lr_scheduler_type="cosine",  # You can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=500,
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-08-17 00:02:57,917] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/nprottas/miniconda3/envs/up/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy,Mcc
100,0.6808,0.648087,0.345638,0.5,0.40873,0.691275,0.0
200,0.6016,0.555303,0.345638,0.5,0.40873,0.691275,0.0
300,0.5243,0.52553,0.345638,0.5,0.40873,0.691275,0.0
400,0.4926,0.544382,0.787045,0.632952,0.641652,0.762224,0.390709
500,0.4685,0.47103,0.786868,0.734623,0.751391,0.805369,0.518867
600,0.4398,0.504738,0.795695,0.72517,0.744839,0.805369,0.516069
700,0.4146,0.57041,0.816085,0.710562,0.733367,0.805369,0.515967
800,0.3877,0.446696,0.812855,0.761216,0.778863,0.825503,0.571744
900,0.348,0.449785,0.801394,0.758774,0.774068,0.819751,0.558544
1000,0.3556,0.450718,0.806334,0.766538,0.781252,0.824545,0.571488


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1876, training_loss=0.3755145286446187, metrics={'train_runtime': 135.6977, 'train_samples_per_second': 441.105, 'train_steps_per_second': 13.825, 'total_flos': 702708970111056.0, 'train_loss': 0.3755145286446187, 'epoch': 7.0})

In [8]:
# Second training pass on the same `trainer`: no new model or Trainer is created, so
# this run starts from whatever weights the previous trainer.train() left in `model`.
# NOTE(review): presumably intentional extra epochs — confirm; the reported numbers
# depend on this cell being executed exactly once.
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy,Mcc
100,0.3469,0.458683,0.811497,0.771031,0.786034,0.82838,0.58112
200,0.322,0.493109,0.81305,0.763628,0.780809,0.826462,0.574557
300,0.3196,0.574588,0.807665,0.729331,0.750537,0.811122,0.531252
400,0.3321,0.471776,0.810383,0.761382,0.778387,0.824545,0.569662
500,0.3478,0.535143,0.810376,0.713999,0.736227,0.805369,0.515442
600,0.3023,0.541377,0.806284,0.752759,0.770544,0.819751,0.556475
700,0.3067,0.672131,0.826402,0.693511,0.715975,0.799616,0.502643
800,0.3016,0.483674,0.828431,0.747467,0.770389,0.825503,0.570179
900,0.2745,0.465159,0.831688,0.77694,0.795873,0.838926,0.606161
1000,0.2633,0.47636,0.817293,0.764156,0.782265,0.82838,0.579016


TrainOutput(global_step=1876, training_loss=0.26590651591449405, metrics={'train_runtime': 124.9172, 'train_samples_per_second': 479.173, 'train_steps_per_second': 15.018, 'total_flos': 702708970111056.0, 'train_loss': 0.26590651591449405, 'epoch': 7.0})

In [9]:
# Score the current model on the validation split (the same split given to the
# Trainer as eval_dataset); the compute_metrics values come back prefixed with "eval_".
trainer.evaluate(tokenized_dataset["validation"])

{'eval_loss': 0.45868271589279175,
 'eval_precision': 0.8114969158721201,
 'eval_recall': 0.7710305734788638,
 'eval_f1-score': 0.7860336277567093,
 'eval_accuracy': 0.8283796740172579,
 'eval_mcc': 0.5811202551820686,
 'eval_runtime': 0.6787,
 'eval_samples_per_second': 1536.868,
 'eval_steps_per_second': 48.626,
 'epoch': 7.0}