# --> 2.1) Fine Tuning 1: Normal

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset




In [2]:
# Hub identifier of the base checkpoint that will be fine-tuned.
model_name = 'Qwen/Qwen2-1.5B-Instruct'

# Tokenizer and model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights as bfloat16 and let `device_map='auto'` choose placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map='auto',
)

# Dataset providing the "instruction" / "output" columns used below.
dataset = load_dataset("lucadillenburg/startup-chatbot")

train.csv:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Generating train split:   0%|          | 0/1226 [00:00<?, ? examples/s]

In [3]:
def format_as_chat(example):
    """Render one instruction/output record as a single chat-formatted string.

    Args:
        example: Mapping with "instruction" and "output" entries; either may
            be ``None``, which is treated as an empty string.

    Returns:
        ``{"text": None}`` when either field is empty or whitespace-only (so
        the row can be dropped by a later filter step), otherwise
        ``{"text": <conversation rendered with the tokenizer's chat template>}``.
    """
    user_turn = example["instruction"] or ""
    assistant_turn = example["output"] or ""

    # Both sides of the exchange must carry real content.
    if not (user_turn.strip() and assistant_turn.strip()):
        return {"text": None}

    # NOTE(review): the system prompt mentions aviation, while the dataset
    # loaded in this notebook is "startup-chatbot" -- confirm this is intended.
    conversation = [
        {"role": "system", "content": "You are a helpful assistant specialized in aviation."},
        {"role": "user", "content": user_turn},
        {"role": "assistant", "content": assistant_turn},
    ]

    # Training text is the complete conversation, so no generation prompt is appended.
    return {
        "text": tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
    }

# Add the chat-formatted "text" field to every record.
formatted_dataset = dataset.map(function=format_as_chat)

Map:   0%|          | 0/1226 [00:00<?, ? examples/s]

In [4]:
# Keep only rows whose "text" field holds a usable, non-blank string.
def is_valid_text(example):
    """Return True when ``example["text"]`` is a string with non-whitespace content."""
    text = example["text"]
    # None and any other non-string value are rejected up front.
    if not isinstance(text, str):
        return False
    return text.strip() != ""

# Drop the rows rejected by is_valid_text.
formatted_dataset = formatted_dataset.filter(function=is_valid_text)

Filter:   0%|          | 0/1226 [00:00<?, ? examples/s]

In [5]:
def _mask_padding(token_ids, attention_mask, ignore_index=-100):
    """Copy ``token_ids``, replacing positions whose mask is 0 with ``ignore_index``."""
    return [
        token if keep else ignore_index
        for token, keep in zip(token_ids, attention_mask)
    ]


def tokenize_function(example, max_length=512):
    """Tokenize ``example["text"]`` and attach causal-LM training labels.

    Every sequence is truncated/padded to ``max_length``. Labels mirror
    ``input_ids`` except at padded positions, which are set to -100 so the
    loss skips them; copying ``input_ids`` verbatim would also train the model
    to predict padding tokens.

    Args:
        example: Mapping with a "text" entry holding either one string or a
            list of strings (the batched form).
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry shaped like
        "input_ids".
    """
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    input_ids = tokens["input_ids"]

    if "attention_mask" not in tokens:
        # No mask available to locate the padding: fall back to a plain copy.
        tokens["labels"] = input_ids.copy()
        return tokens

    attention_mask = tokens["attention_mask"]
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched input: one id list and one mask list per example.
        tokens["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: flat lists of ids and mask values.
        tokens["labels"] = _mask_padding(input_ids, attention_mask)
    return tokens

# Run tokenize_function over the formatted dataset in batched mode.
tokenized_dataset = formatted_dataset.map(
    function=tokenize_function,
    batched=True,
)

Map:   0%|          | 0/1226 [00:00<?, ? examples/s]

In [6]:
# Training configuration.
# - output_dir uses the same directory name the final model is saved under
#   later in this notebook (the previous value misspelled it as "avaition").
# - No evaluation-strategy argument is passed: "no" is the default, and the
#   argument name differs between transformers releases
#   (`evaluation_strategy` vs. `eval_strategy`), so omitting it keeps the cell
#   working on either.
training_args = TrainingArguments(
    output_dir="./qwen-aviation-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 sample x 4 accumulation steps per optimizer update
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    push_to_hub=False,
)

# Only a training split is supplied; no evaluation dataset is configured.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

trainer.train()



Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Directory that holds the fine-tuned weights and the tokenizer files.
finetuned_dir = "./qwen-aviation-finetuned"

# Persist the model and the tokenizer side by side so they can be reloaded together.
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

# Reload from disk with the same dtype and device placement that were used for
# the base model; without these arguments the reload falls back to the
# library defaults instead of the settings the model was trained with.
finetuned_model = AutoModelForCausalLM.from_pretrained(
    finetuned_dir,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)
finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)

In [None]:
# Create inference pipeline
from transformers import pipeline

chatbot = pipeline("text-generation", model=finetuned_model, tokenizer=finetuned_tokenizer)

# The training examples were rendered with the tokenizer's chat template and a
# system message, so the test prompt is built the same way. A raw string would
# not match the format the model was fine-tuned on.
messages = [
    {"role": "system", "content": "You are a helpful assistant specialized in aviation."},
    {"role": "user", "content": "What is a startup?"},
]
# add_generation_prompt=True ends the prompt where the assistant's reply begins.
prompt = finetuned_tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Test a sample question; return_full_text=False keeps only the newly
# generated answer instead of echoing the templated prompt.
response = chatbot(prompt, max_new_tokens=100, return_full_text=False)
print(response[0]["generated_text"])