# **Cell 1: Install Dependencies**

In [1]:
!pip install -q torch transformers peft trl bitsandbytes accelerate datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# **Cell 2: The Training Script**

In [3]:
import os
import json
import torch
import pandas as pd
import logging
from datasets import load_dataset

from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Set environment variable to potentially help with memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# --- Constants ---
BASE_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
NEW_ADAPTER_NAME = "aida-phi3-mini-sre-adapter-v1"
DATASET_FILE = "aida_training_dataset.jsonl"

# --- Load Dataset ---
print(f"Loading dataset from {DATASET_FILE}")
dataset = load_dataset("json", data_files=DATASET_FILE, split="train")
print(f"Successfully loaded {len(dataset)} training samples.")

# Assuming the text data is in a column named 'text', if not, rename it
# You might need to inspect your dataset file to confirm the actual column name
# If your text column has a different name, replace 'text' below with the correct name
if "text" not in dataset.column_names:
    # This is a placeholder, you need to replace 'your_text_column_name'
    # with the actual name of the column containing the text in your JSONL file
    your_text_column_name = list(dataset.column_names)[0] # Assuming the first column is text if 'text' is not found
    print(f"Renaming column '{your_text_column_name}' to 'text'")
    dataset = dataset.rename_column(your_text_column_name, "text")


# --- Configure Training ---
print("--- Initializing Fine-Tuning Pipeline ---")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # Changed from torch.bfloat16
    bnb_4bit_use_double_quant=False,
)

print(f"Loading base model: {BASE_MODEL_NAME}")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    quantization_config=quant_config,
    trust_remote_code=True,
    device_map={"": 0},
    config={"pretraining_tp": 1} # Set pretraining_tp to 1
)

# Add this line to set _from_remote_code to True
model.config._from_remote_code = True

model.config.use_cache = False
# Removed the line model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"], # Added target_modules
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1, # Already set to 1, consider other options if OOM persists
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    logging_steps=1, # Log every step since our dataset is small
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    # Removed dataset_text_field argument
    # Removed max_seq_length argument
    # Removed tokenizer argument
    args=training_args,
    # Removed packing=False,
)

# --- Start Training ---
print("--- Starting model training ---")
trainer.train()
print("--- Model training complete ---")

# --- Save Model ---
print(f"Saving trained adapter to ./{NEW_ADAPTER_NAME}")
trainer.model.save_pretrained(NEW_ADAPTER_NAME)
print("--- Pipeline Finished Successfully ---")

Loading dataset from aida_training_dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Successfully loaded 1 training samples.
--- Initializing Fine-Tuning Pipeline ---
Loading base model: microsoft/Phi-3-mini-4k-instruct


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Adding EOS to train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


--- Starting model training ---


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myemregultepe[0m ([33myunusemregultepe[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
1,1.4778




--- Model training complete ---
Saving trained adapter to ./aida-phi3-mini-sre-adapter-v1
--- Pipeline Finished Successfully ---
