In [1]:
!pip install -q -U torch transformers peft datasets bitsandbytes trl accelerate
!pip install -q -U scipy pandas scikit-learn

import torch
print(f"Torch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.9.0+cu126 requires torch==2.9.0, but you have torch 2.9.1 which is incompatible.
torchvision 0.24.0+cu126 requires torch==2.9.0, but you have torch 2.9.1 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mTorch Version: 2.9.1+cu128
CUDA Available: True


In [2]:
import json
from datasets import Dataset
from transformers import AutoTokenizer

# --- Configuration ---
MODEL_ID = "Qwen/Qwen3-0.6B-Base" # Qwen3 0.6B base checkpoint (the ID is Qwen3, not Qwen 2.5)
MAX_SEQ_LENGTH = 1024 # Intended cap on training sequence length (presumably in tokens)

# Load Tokenizer
# trust_remote_code=True allows custom code shipped in the model repo to run while loading.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Important for training

# 1. Load Raw JSON
def load_json(path):
    """Read a JSON file and return the decoded Python object.

    Args:
        path: Location of the JSON file on disk.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding: without it `open` uses the locale default, so non-ASCII
    # headline text (curly quotes, accents) could decode differently per machine.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Directory holding the prepared fine-tuning splits (presumably a mounted Google Drive).
# Kept in one constant so the notebook can be pointed elsewhere by editing a single line.
DATA_DIR = "/content/drive/MyDrive/LOLA/Code files/kaggle output"

train_data = load_json(f"{DATA_DIR}/finetune_train.json")
val_data = load_json(f"{DATA_DIR}/finetune_calibrate.json")  # the "calibrate" split serves as the validation set
test_data = load_json(f"{DATA_DIR}/finetune_test.json")

# 2. Formatting Function (ChatML style)
def apply_chat_template(example):
    """Render one example's chat turns as a single ChatML-style string.

    Every entry of ``example["messages"]`` is wrapped as an ``<|im_start|>`` header
    carrying the role, a newline, the content, and a closing ``<|im_end|>`` plus
    newline. Turns are concatenated in their original order.

    No trailing generation prompt is appended: whatever assistant turn is present
    in the messages is rendered like any other turn, so it ends up inside the text.

    Args:
        example: Mapping with a ``"messages"`` list of ``{"role", "content"}`` dicts.

    Returns:
        dict with a single key ``"text"`` holding the rendered conversation.
    """
    rendered_turns = [
        f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
        for turn in example["messages"]
    ]
    return {"text": "".join(rendered_turns)}

# 3. Create Hugging Face Datasets
# The raw `messages` column is dropped once `text` has been rendered, so `text` is the
# only field a trainer can pick up.
# NOTE(review): recent TRL releases appear to treat a dataset that still has a `messages`
# column as conversational and re-render it with the tokenizer's own chat template. That
# would bypass the hand-built `text` and diverge from the prompt format built by hand at
# inference time. Confirm against the installed TRL version.
train_dataset = Dataset.from_list(train_data).map(apply_chat_template, remove_columns=["messages"])
val_dataset = Dataset.from_list(val_data).map(apply_chat_template, remove_columns=["messages"])
# Note: Test dataset will be processed differently during inference

print(f"Train Samples: {len(train_dataset)}")
print(f"Sample Input:\n{train_dataset[0]['text'][:500]}...")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/12376 [00:00<?, ? examples/s]

Map:   0%|          | 0/1654 [00:00<?, ? examples/s]

Train Samples: 12376
Sample Input:
<|im_start|>system
You are an editor tasked with choosing the catchier one from several drafted headlines for the same article. Catchier means the one that is likely to generate more clicks.<|im_end|>
<|im_start|>user
You are presented with several headlines. Which one is catchier? **Return only the number before the headline. **No explanation is needed. No need to return the headline, only the number.****
1. New York's Last Chance To Preserve Its Water Supply
2. How YOU Can Help New York Stay U...


In [3]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# 1. LoRA Configuration
# Defined first because it does not depend on the loaded model.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                    # Adapter rank: higher means more trainable parameters
    lora_alpha=32,           # Scaling factor
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # feed-forward projections
        "gate_proj", "up_proj", "down_proj",
    ],
)

# 2. Quantization Config: 4-bit NF4 weights, double quantization, float16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# 3. Load the quantized base model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Prepare for k-bit training (freezes weights, casts layer norm to float32)
model = prepare_model_for_kbit_training(model)

# 4. Wrap with the LoRA adapter and report how many parameters will actually train
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

trainable params: 10,092,544 || all params: 606,142,464 || trainable%: 1.6650


In [9]:
from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer

OUTPUT_DIR = "qwen_finetuned_results"

# SFTConfig is TRL's TrainingArguments subclass; it adds the SFT-specific options
# (text field, sequence-length cap) that plain TrainingArguments cannot express.
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step (per device)
    warmup_steps=10,
    max_steps=100, # Short run for demonstration. For real results, try 300-500 or 1 epoch.
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    report_to="none",
    dataset_text_field="text",      # column produced by apply_chat_template
    # Wire in the configured cap instead of relying on the library default.
    # NOTE(review): this option is called `max_seq_length` in older TRL releases.
    max_length=MAX_SEQ_LENGTH,
)

# `model` is already LoRA-wrapped (get_peft_model in the model-setup cell), so no
# `peft_config` is passed here: TRL expects either a PeftModel or a base model plus
# a peft_config, not both.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,  # use the tokenizer configured above (pad token, padding side)
)

print("Starting Training...")
trainer.train()

# Save the final adapter (weights after the last step; load_best_model_at_end is not
# enabled, so this is not necessarily the best checkpoint by validation loss).
ADAPTER_PATH = "final_adapter_checkpoint"
trainer.model.save_pretrained(ADAPTER_PATH)
tokenizer.save_pretrained(ADAPTER_PATH)
print(f"Model saved to {ADAPTER_PATH}")



Tokenizing train dataset:   0%|          | 0/12376 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/12376 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1654 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1654 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Starting Training...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
50,1.5989,1.56585,1.605086,150746.0,0.720856
100,1.5557,1.548326,1.57026,300178.0,0.722957


  return fn(*args, **kwargs)


Model saved to final_adapter_checkpoint


In [10]:
import pandas as pd
import re
from tqdm import tqdm
import torch
from peft import PeftModel

# 1. Reload Base Model & Adapter for Inference
# A fresh quantized base is loaded so inference starts from a clean state rather than
# from the in-memory training model.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)
# Attach the saved LoRA adapter and switch to evaluation mode
# (`eval()` returns the module itself, so the result can be bound directly).
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).eval()

# 2. Load Test CSV for IDs
df_test_csv = pd.read_csv("/content/drive/MyDrive/LOLA/Code files/kaggle output/final_test.csv")
# The JSON items carry no ID of their own, so item i is paired with the i-th distinct
# `test_id` (in order of first appearance) from the CSV.
# Assuming the 'finetune_test.json' was generated in the same order as unique 'test_id' groups in CSV
unique_test_ids = df_test_csv['test_id'].unique()
if len(unique_test_ids) != len(test_data):
    # Positional pairing is only trustworthy when the counts agree; warn up front
    # instead of silently labelling the overflow "Unknown" after a long run.
    print(
        f"WARNING: {len(test_data)} test items but {len(unique_test_ids)} unique test_ids; "
        "positional ID mapping may be misaligned."
    )

DEFAULT_OPTION = 1  # fallback when no option number can be parsed from the model output
results = []
parse_failures = 0  # how often the fallback was used; reported after the loop

print("Generating predictions on Test Set...")

for i, item in enumerate(tqdm(test_data)):
    # Prepare Prompt: drop any assistant turn so the reference answer never leaks into the prompt
    messages = [m for m in item["messages"] if m["role"] != "assistant"]

    # Same ChatML layout as the training text, ending with an open assistant turn
    input_text = "".join(
        f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
        for message in messages
    )
    input_text += "<|im_start|>assistant\n" # Prompt for the answer

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False # Greedy decoding for deterministic best option
        )

    # Decode only the newly generated tokens. `generate` returns prompt + continuation,
    # and slicing by token count is exact, unlike splitting the decoded string on the
    # word "assistant" or slicing by len(input_text) after special tokens are stripped.
    response_only = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # First standalone digit 1-9 in the response is taken as the chosen option
    match = re.search(r'\b([1-9])\b', response_only)
    if match:
        predicted_option = int(match.group(1))
    else:
        parse_failures += 1
        predicted_option = DEFAULT_OPTION

    # Get Test ID
    current_test_id = unique_test_ids[i] if i < len(unique_test_ids) else "Unknown"

    results.append({
        "test_id": current_test_id,
        "predicted_option": predicted_option
    })

# A high count here means the predictions are mostly the fallback, not model output.
print(f"Unparseable responses (defaulted to option {DEFAULT_OPTION}): {parse_failures} / {len(results)}")

# 3. Create Final CSV
results_df = pd.DataFrame(results)

# Build a (test_id, option_number) -> headline lookup in one pass over the CSV.
# `setdefault` keeps the first row for a repeated key, matching a "first match" lookup.
# This replaces a full-DataFrame boolean filter per prediction (rows x predictions work).
headline_lookup = {}
for csv_test_id, csv_option, csv_headline in zip(
    df_test_csv['test_id'], df_test_csv['option_number'], df_test_csv['headline']
):
    headline_lookup.setdefault((csv_test_id, csv_option), csv_headline)

# Attach the headline text of the predicted option; predictions with no matching
# (test_id, option) row keep a visible placeholder rather than being dropped.
merged_data = [
    {
        "test_id": row["test_id"],
        "predicted_option": row["predicted_option"],
        "best_headline": headline_lookup.get(
            (row["test_id"], row["predicted_option"]), "Headline not found"
        ),
    }
    for row in results
]

final_df = pd.DataFrame(merged_data)
final_df.to_csv("predicted_best_headlines.csv", index=False)
print("Saved predictions to 'predicted_best_headlines.csv'")
print(final_df.head())

Generating predictions on Test Set...


100%|██████████| 3263/3263 [39:10<00:00,  1.39it/s]


Saved predictions to 'predicted_best_headlines.csv'
   test_id  predicted_option  \
0    14444                 1   
1    14514                 1   
2    14691                 1   
3    15003                 1   
4    15019                 1   

                                       best_headline  
0  A Reading Of 'Dinner With Monoliths' By Joseph...  
1  A Music Video With All Of My Favorite People, ...  
2  A Father Wrote His Kid 14 Things To Always Rem...  
3  There’s Nothing Funny About Ferguson. But This...  
4  This Is Not Your Typical 'Inspiration Porn' St...  
