In [2]:
import torch 
from peft import LoraConfig, get_peft_model
import transformers 
import os 
os.environ["TRANSFORMERS_CACHE"] = "/scratch/jainit/yo"
from datasets import Dataset

import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging
import torch.nn as nn
import torch
from peft import LoraConfig, get_peft_model, set_peft_model_state_dict
import wandb
os.environ["WANDB_PROJECT"] = "LoRA" # name your W&B project 
os.environ["WANDB_LOG_MODEL"] = "checkpoint" # log all model checkpoints

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig, RobertaTokenizerFast
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed

In [18]:
lora_config = LoraConfig(
    r=6 , 
    lora_dropout=0.5, 
    init_lora_weights=True,
    lora_alpha=16 ,
    target_modules=[
 'longformer.encoder.layer.0.attention.self.query',
    'longformer.encoder.layer.0.attention.self.key',
    'longformer.encoder.layer.0.attention.self.value',
    'longformer.encoder.layer.0.attention.self.query_global',
    'longformer.encoder.layer.0.attention.self.key_global',
    'longformer.encoder.layer.0.attention.self.value_global',
    'longformer.encoder.layer.0.intermediate.dense',
    'longformer.encoder.layer.0.output.dense',
    'longformer.encoder.layer.1.attention.self.query',
    'longformer.encoder.layer.1.attention.self.key',
    'longformer.encoder.layer.1.attention.self.value',
    'longformer.encoder.layer.1.attention.self.query_global',
    'longformer.encoder.layer.1.attention.self.key_global',
    'longformer.encoder.layer.1.attention.self.value_global',
    'longformer.encoder.layer.1.intermediate.dense',
    'longformer.encoder.layer.1.output.dense',
    'longformer.encoder.layer.2.attention.self.query',
    'longformer.encoder.layer.2.attention.self.key',
    'longformer.encoder.layer.2.attention.self.value',
    'longformer.encoder.layer.2.attention.self.query_global',
    'longformer.encoder.layer.2.attention.self.key_global',
    'longformer.encoder.layer.2.attention.self.value_global',
    'longformer.encoder.layer.2.intermediate.dense',
    'longformer.encoder.layer.2.output.dense',
    'longformer.encoder.layer.3.attention.self.query',
    'longformer.encoder.layer.3.attention.self.key',
    'longformer.encoder.layer.3.attention.self.value',
    'longformer.encoder.layer.3.attention.self.query_global',
    'longformer.encoder.layer.3.attention.self.key_global',
    'longformer.encoder.layer.3.attention.self.value_global',
    'longformer.encoder.layer.3.intermediate.dense',
    'longformer.encoder.layer.3.output.dense',
    'longformer.encoder.layer.4.attention.self.query',
    'longformer.encoder.layer.4.attention.self.key',
    'longformer.encoder.layer.4.attention.self.value',
    'longformer.encoder.layer.4.attention.self.query_global',
    'longformer.encoder.layer.4.attention.self.key_global',
    'longformer.encoder.layer.4.attention.self.value_global',
    'longformer.encoder.layer.4.intermediate.dense',
    'longformer.encoder.layer.4.output.dense',
    'longformer.encoder.layer.5.attention.self.query',
    'longformer.encoder.layer.5.attention.self.key',
    'longformer.encoder.layer.5.attention.self.value',
    'longformer.encoder.layer.5.attention.self.query_global',
    'longformer.encoder.layer.5.attention.self.key_global',
    'longformer.encoder.layer.5.attention.self.value_global',
    'longformer.encoder.layer.5.intermediate.dense',
    'longformer.encoder.layer.5.output.dense',
]
    ) # default config

In [46]:

tokeniser = RobertaTokenizerFast.from_pretrained("roberta-base")

In [47]:
def compute_metrics(eval_pred):

    f1_metric = evaluate.load("f1")

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro"))

    return results



In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [11]:
# print_trainable_parameters(lora_model)

In [50]:
# loading the data 
import pandas as pd
import json 
train_file_path = "/scratch/jainit/SubtaskA/subtaskA_train_monolingual.jsonl"
dev_file_path = "/scratch/jainit/SubtaskA/subtaskA_dev_monolingual.jsonl"


In [51]:
train_df = pd.read_json(train_file_path , lines = True )
val_df = pd.read_json(dev_file_path , lines = True )

In [52]:
def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"], truncation=True , padding = "max_length" , max_length = 512)

In [53]:
train_dataset = Dataset.from_pandas(train_df[:1000])
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokeniser},num_proc=8)
tokenized_train_dataset= tokenized_train_dataset.rename_column("label", "labels")


Map (num_proc=8): 100%|██████████| 1000/1000 [00:01<00:00, 922.74 examples/s] 
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [54]:
val_dataset = Dataset.from_pandas(val_df[:500])
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokeniser}, )
tokenized_val_dataset = tokenized_val_dataset.rename_column("label", "labels")

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map: 100%|██████████| 500/500 [00:00<00:00, 1191.12 examples/s]


In [1]:
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1}

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(
       "kiddothe2b/longformer-mini-1024", num_labels=len(label2id), id2label=id2label, label2id=label2id    # put your model here
    )
print(model)
model.add_adapter(lora_config, adapter_name="adapter_1")
print_trainable_parameters(model)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(1026, 256, padding_idx=0)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-5): 6 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (query_global): Linear(in_features=256, out_features=256, bias=True)
              (key_global): Linear(in_features=256, out_features=256, bias=True)
            

In [57]:
data_collator = DataCollatorWithPadding(tokenizer=tokeniser)


# create Trainer 
training_args = TrainingArguments(
    output_dir="/scratch/jainit",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    run_name="Test_LoRA", 
    logging_steps=100,
    eval_steps=100,
    save_steps=10000,
    report_to="wandb",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokeniser,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.6117,0.614731,1.0


[34m[1mwandb[0m: Adding directory to artifact (/scratch/jainit/checkpoint-125)... Done. 0.1s
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


TrainOutput(global_step=125, training_loss=0.6063401870727539, metrics={'train_runtime': 37.7566, 'train_samples_per_second': 26.485, 'train_steps_per_second': 3.311, 'total_flos': 263790532608000.0, 'train_loss': 0.6063401870727539, 'epoch': 1.0})

In [58]:
# save best model
best_model_path = "./roberta"+'/bestest/'

if not os.path.exists(best_model_path):
    os.makedirs(best_model_path)

torch.save(model.state_dict(), best_model_path + 'model.pt')
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(
                in_features=768, out_features=768, bias=True
                (lora_dropout): ModuleDict(
                  (adapter_1): Dropout(p=0.5, inplace=False)
                )
                (lora_A): ModuleDict(
                  (adapter_1): Linear(in_features=768, out_features=6, bias=False)
                )
                (lora_B): ModuleDict(
                  (adapter_1): Li

In [59]:
new_model = AutoModelForSequenceClassification.from_pretrained(
         "roberta-base", num_labels=len(label2id), id2label=id2label, label2id=label2id    # put your model here
     )
new_model.add_adapter(lora_config, adapter_name="adapter_1")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
new_model.load_state_dict(torch.load(best_model_path + 'model.pt'))

<All keys matched successfully>

In [63]:

model.config.save_pretrained(best_model_path)

KeyError: 'default'