# Fine tune with Trainer

In [1]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments


In [2]:
# Step 1: Load a small pre-trained model and tokenizer
# Using distilgpt2, which is a small, lightweight GPT-2 model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

In [3]:
# Step 2: Define a simple instruction dataset
# For demonstration purposes, we create a basic dataset with prompt-response pairs.
# Replace this with a real instruction-tuning dataset for actual use.
instruction_data = {
    "train": [
        {"prompt": "Translate to French: Hello, how are you?", "response": "Bonjour, comment ça va?"},
        {"prompt": "Explain what a black hole is.", "response": "A black hole is a region of space where gravity is so strong that nothing, not even light, can escape."},
        {"prompt": "What is 2 + 2?", "response": "2 + 2 equals 4."},
        {"prompt": "Translate to Spanish: I love programming.", "response": "Me encanta programar."},
        {"prompt": "What is the boiling point of water in Celsius?", "response": "The boiling point of water is 100 degrees Celsius."},
        {"prompt": "Define photosynthesis.", "response": "Photosynthesis is the process by which green plants use sunlight to synthesize food from carbon dioxide and water."},
        {"prompt": "What is the square root of 16?", "response": "The square root of 16 is 4."},
        {"prompt": "Translate to German: Good morning.", "response": "Guten Morgen."},
        {"prompt": "Who wrote 'Pride and Prejudice'?", "response": "Jane Austen wrote 'Pride and Prejudice'."},
        {"prompt": "Explain gravity.", "response": "Gravity is the force that attracts objects toward each other, especially the force that makes things fall to the ground."},
        {"prompt": "What is the capital of Germany?", "response": "The capital of Germany is Berlin."},
        {"prompt": "Translate to Italian: See you tomorrow.", "response": "Ci vediamo domani."},
        {"prompt": "Who painted the Mona Lisa?", "response": "The Mona Lisa was painted by Leonardo da Vinci."},
        {"prompt": "Explain what DNA is.", "response": "DNA is the molecule that carries the genetic instructions for life."},
        {"prompt": "What is the largest planet in our solar system?", "response": "The largest planet in our solar system is Jupiter."},
        {"prompt": "Translate to Japanese: Thank you very much.", "response": "どうもありがとうございます (Dōmo arigatō gozaimasu)."},
        {"prompt": "What is the speed of light?", "response": "The speed of light is approximately 299,792 kilometers per second."},
        {"prompt": "Define a computer.", "response": "A computer is an electronic device that processes and stores data, capable of performing calculations and tasks."},
        {"prompt": "Who discovered penicillin?", "response": "Alexander Fleming discovered penicillin."},
        {"prompt": "Translate to French: Where is the bathroom?", "response": "Où sont les toilettes?"},
        {"prompt": "What is 5 factorial (5!)?", "response": "5 factorial, or 5!, is 120."},
        {"prompt": "Translate to Spanish: Have a nice day.", "response": "Que tengas un buen día."},
        {"prompt": "Explain the water cycle.", "response": "The water cycle is the continuous movement of water on Earth through evaporation, condensation, and precipitation."},
        {"prompt": "What is the capital of Japan?", "response": "The capital of Japan is Tokyo."},
        {"prompt": "Define renewable energy.", "response": "Renewable energy comes from natural sources that can be replenished, like solar and wind power."},
        {"prompt": "Translate to German: Happy birthday.", "response": "Alles Gute zum Geburtstag."},
        {"prompt": "What is the main ingredient in guacamole?", "response": "The main ingredient in guacamole is avocado."},
        {"prompt": "Explain what an atom is.", "response": "An atom is the smallest unit of matter that retains the properties of an element."},
        {"prompt": "What is the freezing point of water in Fahrenheit?", "response": "The freezing point of water is 32 degrees Fahrenheit."},
        {"prompt": "Translate to Italian: How much does this cost?", "response": "Quanto costa questo?"},
        {"prompt": "Who developed the theory of relativity?", "response": "Albert Einstein developed the theory of relativity."},
        {"prompt": "Explain photosynthesis in simple terms.", "response": "Photosynthesis is how plants make their food using sunlight, water, and air."},
        {"prompt": "What is the capital of Australia?", "response": "The capital of Australia is Canberra."},
        {"prompt": "Translate to Japanese: Good evening.", "response": "こんばんは (Konbanwa)."},
        {"prompt": "Who is the author of '1984'?", "response": "The author of '1984' is George Orwell."},
        {"prompt": "Define gravity.", "response": "Gravity is the force that pulls objects toward each other."},
        {"prompt": "Translate to French: I am happy.", "response": "Je suis heureux/heureuse."},
        {"prompt": "What is the currency of the United Kingdom?", "response": "The currency of the United Kingdom is the British Pound."},
        {"prompt": "Explain what a neuron is.", "response": "A neuron is a nerve cell that transmits signals in the body."},
        {"prompt": "What is the smallest country in the world?", "response": "The smallest country in the world is Vatican City."},
        {"prompt": "Translate to Spanish: What time is it?", "response": "¿Qué hora es?"},
        {"prompt": "Who painted the Sistine Chapel ceiling?", "response": "Michelangelo painted the Sistine Chapel ceiling."},
        {"prompt": "Define an ecosystem.", "response": "An ecosystem is a community of living organisms interacting with their environment."},
        {"prompt": "What is the chemical formula of water?", "response": "The chemical formula of water is H₂O."},
        {"prompt": "Translate to German: Where are you from?", "response": "Woher kommst du?"},
        {"prompt": "Who discovered gravity?", "response": "Sir Isaac Newton is credited with discovering gravity."},
        {"prompt": "Explain the theory of evolution.", "response": "The theory of evolution states that species change over time through natural selection."},
        {"prompt": "What is the capital of Canada?", "response": "The capital of Canada is Ottawa."},
        {"prompt": "Translate to Italian: Good luck.", "response": "Buona fortuna."},
        {"prompt": "What is the value of pi?", "response": "The value of pi is approximately 3.14159."}
    ],
    "test": [
        {"prompt": "What is the capital of France?", "response": "The capital of France is Paris."},
        {"prompt": "Translate to Spanish: Good morning.", "response": "Buenos días."},
        {"prompt": "What is 10 divided by 2?", "response": "10 divided by 2 equals 5."},
        {"prompt": "Translate to French: Thank you.", "response": "Merci."},
        {"prompt": "What is the largest ocean on Earth?", "response": "The largest ocean on Earth is the Pacific Ocean."},
        {"prompt": "Who wrote 'Hamlet'?", "response": "William Shakespeare wrote 'Hamlet'."},
        {"prompt": "Translate to Japanese: Goodbye.", "response": "さようなら (Sayōnara)."},
        {"prompt": "What is the capital of Italy?", "response": "The capital of Italy is Rome."},
        {"prompt": "Define velocity.", "response": "Velocity is the speed of an object in a specific direction."},
        {"prompt": "Translate to German: How are you?", "response": "Wie geht's?"}
    ]
}


# Step 3: Prepare the dataset for training
# Tokenize the dataset to convert text into input IDs and labels
def tokenize_function(example):
    input_text = f"Instruction: {example['prompt']}\nResponse: {example['response']}"
    tokenized = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"],  # Add labels for causal LM
    }


# Convert data into a Dataset object and tokenize
from datasets import DatasetDict, Dataset
raw_datasets = DatasetDict({
    "train": Dataset.from_list(instruction_data["train"]),
    "test": Dataset.from_list(instruction_data["test"]),
})

tokenized_datasets = raw_datasets.map(tokenize_function, batched=False)
tokenized_datasets = tokenized_datasets.remove_columns(["prompt", "response"])  # Clean extra columns



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [5]:
# Step 4: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=1,
    save_total_limit=2,
    fp16=True,  # Use mixed precision if supported
    report_to='none',

)
# Step 5: Create a Trainer instance
trainer = Trainer(
    model=model,                         # The model to train
    args=training_args,                  # Training arguments
    train_dataset=tokenized_datasets["train"],  # Training dataset
    eval_dataset=tokenized_datasets["test"],    # Evaluation dataset
    tokenizer=tokenizer,                 # Tokenizer for processing inputs
)



  trainer = Trainer(


In [6]:
# Step 6: Train the model
trainer.train()

# Step 7: Save the fine-tuned model
trainer.save_model("./fine_tuned_model")



Epoch,Training Loss,Validation Loss
1,0.2014,0.110495
2,0.1666,0.080162
3,0.1127,0.073244


In [13]:
# Step 8: Test the fine-tuned model
# Generate a response for a given instruction
test_prompt = "What is the value of pi?"
input_text = f"Instruction: {test_prompt}\nResponse:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate response using the fine-tuned model
output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated Response:")
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Response:
Instruction: What is the value of pi?
Response: The value of pi is approximately 1.


# GEMINI

In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# Load your dataset
dataset = load_dataset("imdb") 
# dataset = load_dataset("glue", "mrpc") 

# Define the model and tokenizer
model_name = "bert-base-uncased" 
# model_name = "distilgpt2"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) 
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [15]:
# Tokenize the dataset
# def tokenize_function(examples):
#     print(type(examples))
#     tokenized = tokenizer.tokenize(examples["text"], examples["label"], truncation=True,
#         padding="max_length",
#         max_length=512,)
#     # return tokenized
#     return {
#         "input_ids": tokenized["input_ids"],
#         "attention_mask": tokenized["attention_mask"],
#         "labels": tokenized["input_ids"],  # Add labels for causal LM
#     }

def tokenize_function(example):
    input_text = f"Instruction: {example['text']}\nResponse: {example['label']}"
    tokenized = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"],  # Add labels for causal LM
    }

tokenized_datasets = dataset.map(tokenize_function, batched=False)





Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none',
)

In [None]:
# Create and train the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()