#### Installing Libraries

In [2]:
pip install transformers datasets peft accelerate bitsandbytes



#### Importing Libraries

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset

#### 1. Load model and tokenizer

In [4]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT2 does not have pad_token — assign eos_token as pad_token
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


#### 2. Load and tokenize dataset

In [5]:
!pip install -U fsspec==2023.6.0



In [20]:
dataset = load_dataset("imdb", split="train[:1%]") # Use small subset

def tokenize(example):
    # Tokenize the 'text' column for the imdb dataset
    # Explicitly pad to max_length
    tokenized_example = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    return tokenized_example

tokenized_ds = dataset.map(tokenize, batched=True)

# Remove original columns that are not needed for training
tokenized_ds = tokenized_ds.remove_columns(["text", "label"])

# Add a 'labels' column by copying 'input_ids'
tokenized_ds = tokenized_ds.add_column("labels", tokenized_ds["input_ids"])

print("Columns after processing:", tokenized_ds.column_names)
print("Example tokenized_ds features:", tokenized_ds[0].keys()) # Print features of one example
print("Example input_ids length:", len(tokenized_ds[0]['input_ids'])) # Print length of input_ids for one example

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Columns after processing: ['input_ids', 'attention_mask', 'labels']
Example tokenized_ds features: dict_keys(['input_ids', 'attention_mask', 'labels'])
Example input_ids length: 128


  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


#### 3. Add labels

In [8]:
def add_labels(example):
    example["labels"] = example["input_ids"].copy()
    return example

tokenized_ds = tokenized_ds.map(add_labels)

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

#### 4. Apply LoRA PEFT configuration

In [9]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)



#### 5. Training Arguments

In [10]:
training_args = TrainingArguments(
    output_dir="./gpt2-lora",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=50,
    fp16=False,  # Set to False if not on GPU with mixed-precision
    report_to="none",
    save_total_limit=2,
    remove_unused_columns=False  # important for avoiding label mismatch error
)

# Explicitly define and use a data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# from datasets import disable_caching
# disable_caching()

#### 6. Trainer

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# start training
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,3.8844
20,3.9178
30,4.0244
40,3.8193
50,3.7207
60,3.7301


TrainOutput(global_step=63, training_loss=3.8476415058923146, metrics={'train_runtime': 8.8111, 'train_samples_per_second': 28.373, 'train_steps_per_second': 7.15, 'total_flos': 16486465536000.0, 'train_loss': 3.8476415058923146, 'epoch': 1.0})

#### Save Models


In [22]:
model.save_pretrained("./gpt2-lora-finetuned")
tokenizer.save_pretrained("./gpt2-lora-finetuned")

('./gpt2-lora-finetuned/tokenizer_config.json',
 './gpt2-lora-finetuned/special_tokens_map.json',
 './gpt2-lora-finetuned/vocab.json',
 './gpt2-lora-finetuned/merges.txt',
 './gpt2-lora-finetuned/added_tokens.json',
 './gpt2-lora-finetuned/tokenizer.json')

#### Checking Inference

In [23]:
from transformers import pipeline

# Load trained model
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("./gpt2-lora-finetuned")
tokenizer = AutoTokenizer.from_pretrained("./gpt2-lora-finetuned")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "The movie was absolutely"
output = pipe(prompt, max_new_tokens=50, do_sample=True, top_k=50, temperature=0.8)
print(output[0]["generated_text"])


Device set to use cuda:0


The movie was absolutely brilliant and had a lot of fun.

I don't even know if he's a good actor. I think he looks like he's been playing a bad guy who gets caught up in something.

I didn't get into the movie


In [24]:
prompt = "May be movie"
output = pipe(prompt, max_new_tokens=50, do_sample=True, top_k=50, temperature=0.8)
print(output[0]["generated_text"])

May be movie director, though I'll admit that I'm not sure if the actors in this film were there when I watched the movie.

A few other things that happened after this story arc of the movie. The first is that in the first half,


In [25]:
prompt = "what was the movie name, and"
output = pipe(prompt, max_new_tokens=50, do_sample=True, top_k=50, temperature=0.8)
print(output[0]["generated_text"])

what was the movie name, and what was the theme song, but that was all my choice of what to say."

"Wasn't that fun, really. I had to be very careful."

"I did some improv work too and never really got much done
