# Importing Libraries

In [2]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch

# Load Dataset and Tokenizer

In [3]:
dataset = load_dataset("Abirate/english_quotes")
dataset_split = dataset['train'].train_test_split(test_size=0.1, seed=35)

train_data = dataset_split['train']
val_dataset = dataset_split['test']

model_name = 'gpt2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

# Tokenize the Data

In [4]:
def tokenized(batch):
  tokenized = tokenizer(batch['quote'], padding='max_length', max_length=64, truncation=True)

  tokenized['labels'] = tokenized['input_ids'].copy()
  return tokenized

train_data = train_data.map(tokenized, batched=True)
val_dataset = val_dataset.map(tokenized, batched=True)

Map:   0%|          | 0/2257 [00:00<?, ? examples/s]

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

# Load Model

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype = torch.float16,
    device_map = "auto")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Lora config

In [6]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['c_attn'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM'
)

model = get_peft_model(model, lora_config)






# Training Arguments

In [7]:
training_args = TrainingArguments(
    output_dir="./lora-lm",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    eval_steps=10,
    logging_steps=10,
    save_steps=10,
    learning_rate=2e-4,
    num_train_epochs=5,
    fp16=True,
    report_to="none"
)


# Trainer + Training the model

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()


  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
10,6.2518,6.500947
20,5.4298,5.178981
30,4.2832,3.589407
40,3.0104,2.162248
50,2.1631,2.004877
60,2.0948,1.93242
70,2.1488,1.860501
80,1.9689,1.805996
90,1.9763,1.755809
100,1.8181,1.713689


TrainOutput(global_step=1415, training_loss=1.7411934195474685, metrics={'train_runtime': 502.666, 'train_samples_per_second': 22.45, 'train_steps_per_second': 2.815, 'total_flos': 369863056097280.0, 'train_loss': 1.7411934195474685, 'epoch': 5.0})

# Saving the model and tokenizer

In [12]:
model.save_pretrained("FineTuned-lora-GPT2")
tokenizer.save_pretrained("FineTuned-lora-GPT2")

('FineTuned-lora-GPT2/tokenizer_config.json',
 'FineTuned-lora-GPT2/special_tokens_map.json',
 'FineTuned-lora-GPT2/vocab.json',
 'FineTuned-lora-GPT2/merges.txt',
 'FineTuned-lora-GPT2/added_tokens.json',
 'FineTuned-lora-GPT2/tokenizer.json')

# Inference From Saved Model

In [13]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

#  Load tokenizer
base_model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained("FineTuned-lora-GPT2")
tokenizer.pad_token = tokenizer.eos_token

#  Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "FineTuned-lora-GPT2")

#  Build pipeline
text_gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)


Device set to use cuda:0


In [15]:
prompt = "The secret to happiness is"
outputs = text_gen(prompt, max_new_tokens=70, num_return_sequences=1, do_sample=True, temperature=0.7)

print(outputs[0]["generated_text"])

The secret to happiness is to live life in harmony. There is no way to live without war. The only way to live is to live together. Only peace is possible when you live together.

I wrote to him at one point about my love for his wife. In my mind's eye we read of two things that I love: love and war. I love


In [16]:
prompt = "once upon a time"
outputs = text_gen(prompt, max_new_tokens=70, num_return_sequences=1, do_sample=True, temperature=0.7)

print(outputs[0]["generated_text"])

once upon a time when you were just as good a person as you were a day later. Now, you're as bad as you were when you were a day earlier. And you're as bad as you were when you were a day earlier. And now, you're as bad as you were when you were a day earlier. And now, you're as bad
