In [2]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[0m

In [3]:
!nvidia-smi -L

GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-1b05da60-e4d7-2900-e930-248e1f2c401a)


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6b",
    load_in_8bit=True,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")
tokenizer.pad_token = tokenizer.eos_token

In [2]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [4]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 7340032 || all params: 6058222816 || trainable%: 0.12115817167725645


In [5]:
from transformers import TextDataset, DataCollatorForLanguageModeling

# Load the datasets
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train_without_reasoning.csv",
    block_size=256,
)

val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="val_without_reasoning.csv",
    block_size=256,
)

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)



In [6]:
from transformers import Trainer, TrainingArguments

# Define the parameters for fine-tuning
lr = 1e-5
end_lr = 2e-6
num_train_epochs = 1
warmup_steps = 100

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir='gptj-without-reasoning-results',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,
        warmup_steps=warmup_steps,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        weight_decay=0.1,
        fp16=True,
        logging_steps=10,
        logging_dir='./logs',
        evaluation_strategy="steps",
        eval_steps=50,
        
    ),
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
model.config.use_cache = False

In [7]:
trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mharsha-surampudi1997[0m ([33mharshasurampudi[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
50,2.2569,2.246367
100,2.2081,2.158312
150,1.9833,2.018227
200,1.9459,1.9439
250,1.8991,1.89666
300,1.8292,1.868124
350,1.8576,1.850322
400,1.825,1.838317
450,1.8339,1.829886
500,1.8023,1.824124


wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.


In [9]:
model.push_to_hub("harsha28/gptj-lfqa-without-reasoning",
                  use_auth_token=True,
                  commit_message="lr 1e-5, 1 epoch",
                  private=True)

adapter_model.bin:   0%|          | 0.00/29.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/harsha28/gptj-lfqa-without-reasoning/commit/c28f83383947410c1f81275f8c3d8502af82dfd4', commit_message='lr 1e-5, 1 epoch', commit_description='', oid='c28f83383947410c1f81275f8c3d8502af82dfd4', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "harsha28/gptj-lfqa-without-reasoning"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")


# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)



In [32]:
model = model.merge_and_unload()

ValueError: Cannot merge LORA layers when the model is loaded in 8-bit mode

In [41]:
batch = tokenizer("context: Stat. § 14-32.1(a), does not make the definition an essential element of the crime pursuant to N.C. Gen. Stat. § 14-32.1(e). Therefore, we reject Defendant’s argument that it is not sufficient for the indictment to “merely state that the victim was ‘handicapped.’ ” Furthermore, the indictment provided Defendant with enough information to prepare a defense for the offense of felony assault on a handicapped person. See Leonard, _ N.C. App. at _, 711 S.E.2d at 873 (rejecting the defendant’s argument that the indictment was not sufficient because the indictment tracked the relevant language of the statute, listed “the essential elements of the offense[,]” and provided the defendant “with enough information to prepare a defense”); State v. Crisp, 126 N.C. App. 30, 36, 483 S.E.2d 462, 466 (<HOLDING>), appeal dismissed and disc. review denied, 346 question: Is it necessary for the definition of the crime to be stated in the indictment according to N.C. Gen. Stat. § 14-32.1(a)? answer: ", return_tensors='pt')

# Move batch to CUDA
batch = {k: v.to('cuda') for k, v in batch.items()}

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=150)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.






In [43]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Load CSV
df = pd.read_csv("/data/lfqa/test_no_reasoning.csv")


# Create or clear the output CSV, saving only the headers
df[['input', 'expected_output']].to_csv("test_no_reasoning_output_gptj.csv", index=False)

# Loop through each row in the dataframe with tqdm for progress bar
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    batch = tokenizer(
        row['input'], 
        return_tensors='pt'
    )

    # Move batch to CUDA
    batch = {k: v.to('cuda') for k, v in batch.items()}

    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch, max_new_tokens=150)
    
    # Decode and extract only the newly generated part (exclude the input text)
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    generated_text = generated_text.replace(row['input'], '').strip()  # remove input from generated output

    # Save this row immediately to the CSV
    output_df = pd.DataFrame({
        'input': [row['input']],
        'expected_output': [row['expected_output']],
        'generated_output': [generated_text]
    })
    output_df.to_csv("test_no_reasoning_output_gptj.csv", mode='a', header=False, index=False)


  0%|          | 0/1500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/1500 [00:24<10:20:01, 24.82s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/1500 [00:50<10:27:15, 25.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/1500 [01:15<10:26:11, 25.10s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/1500 [01:40<10:24:41, 25.05s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 5/1500 [02:04<10:16:56, 24.76s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/1500 [02:29<10:22:09, 24.99s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 7/1500 [02:54<10:22:16, 25.01s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 8/1500 [03

KeyboardInterrupt: 