In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
import torch


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import pandas as pd
train_data = pd.read_csv('output.csv')
train_ds = Dataset.from_pandas(train_data)

In [5]:
train_data.head()

Unnamed: 0,query,context,label
0,A construction company entered into a contract...,Allahabad High Court\nM/S Zapdor-Ubc-Abnjv Del...,1
1,A construction company entered into a contract...,Supreme Court of India\nBgs Sgs Soma Jv vs Nhp...,1
2,A construction company entered into a contract...,Delhi High Court\nChacha Nehru Bal Chikitsalay...,1
3,A construction company entered into a contract...,Karnataka High Court\nMatra Mobili Private Lim...,0
4,A construction company entered into a contract...,Telangana High Court\nMr. K.N. Mahesh Prasad v...,0


In [None]:
model_name = 'nlpaueb/legal-bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)
lora_config = LoraConfig(
    r=8,
    target_modules="all-linear",
    bias="none",
    task_type="CAUSAL_LM",
)
model.resize_token_embeddings(len(tokenizer))

In [None]:
def preprocess_data(examples):
    # Concatenate query and context element-wise
    inputs = [q + " " + c for q, c in zip(examples['query'], examples['context'])]
#     inputs = examples['Full Text']
    
    # Tokenize the inputs and truncate if necessary
    model_inputs = tokenizer(inputs, max_length=2048, truncation=True, padding='max_length')
    
    # Tokenize the Answer (target)
    label = tokenizer(examples['label'], max_length=1024, truncation=True, padding='max_length')
#     label = tokenizer(examples['Summary'], max_length=2048, truncation=True, padding='max_length')

    
    model_inputs["label"] = label["input_ids"]
    return model_inputs

# Apply the preprocessing function to the dataset
train_ds = train_ds.map(preprocess_data, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [6]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    logging_steps=10,
    gradient_accumulation_steps=4,  
    save_strategy="epoch",
    fp16=True,
    ddp_find_unused_parameters=False,
    report_to="none",  
)

In [7]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=2048,
    args=training_args,
    peft_config=lora_config,
    train_dataset=train_ds,
    dataset_text_field="text",
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [8]:
trainer.train()

Step,Training Loss
10,1.3753
20,1.4024
30,1.3281
40,1.2485
50,1.2344
60,1.1602
70,1.1509
80,1.1454
90,1.1891
100,1.1092




TrainOutput(global_step=300, training_loss=1.1484027417500813, metrics={'train_runtime': 3537.0761, 'train_samples_per_second': 0.339, 'train_steps_per_second': 0.085, 'total_flos': 4.1743382740992e+16, 'train_loss': 1.1484027417500813, 'epoch': 3.0})

In [9]:
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/tokenizer.json')