In [1]:
!pip install -q transformers accelerate bitsandbytes datasets peft
!pip install "fsspec==2024.12.0" "gcsfs==2024.12.0"
!pip install -U bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install -U transformers




In [3]:
# Step 2: mount Google Drive so the dataset and the output folder under
# /content/drive are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

ValueError: mount failed

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Hub checkpoint shared by the tokenizer and the model.
model_name = "microsoft/phi-2"

# Tokenizer: the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# bitsandbytes settings for 8-bit loading.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
)

# Load the base model with the quantization config; device placement is
# delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
# Keep the model config's padding id in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA adapter settings; the adapters target the q_proj / v_proj modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # Adjust if needed
)

# Prepare the quantized model for k-bit training, then wrap it with LoRA.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)


In [None]:
import shutil

# Source: the dataset as stored on the mounted Google Drive.
drive_path = "/content/drive/MyDrive/financial AI advisor/financial_finetune.json"

# Destination: the Colab runtime's local filesystem.
local_path = "/content/financial_finetune.json"

# Copy Drive -> local; the call is the cell's last expression, so its return
# value is displayed as the cell output.
shutil.copy(src=drive_path, dst=local_path)


In [None]:
import json

# Load the raw JSON file copied to the Colab filesystem.
# The encoding is given explicitly so the read does not depend on the
# runtime's locale default.
with open("/content/financial_finetune.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Print the first three entries to verify the structure before cleaning.
for i, item in enumerate(raw_data[:3]):
    print(f"Item {i}: Type = {type(item)}, Content = {item}")


In [None]:
import json
from datasets import Dataset

# Keep only dict records that carry both 'instruction' and 'output'.
# Any dict-valued field is serialized to a JSON string; all other values are
# passed through unchanged.
cleaned_data = [
    {
        key: json.dumps(value) if isinstance(value, dict) else value
        for key, value in record.items()
    }
    for record in raw_data
    if isinstance(record, dict) and 'instruction' in record and 'output' in record
]

# Build the Hugging Face dataset from the cleaned records.
dataset = Dataset.from_list(cleaned_data)


In [None]:
from transformers import AutoTokenizer

# Tokenizer used by format_prompt; the EOS token doubles as the pad token.
PROMPT_TOKENIZER_CHECKPOINT = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(PROMPT_TOKENIZER_CHECKPOINT)
tokenizer.pad_token = tokenizer.eos_token

def format_prompt(example):
    """Render one record as an instruction prompt and tokenize it.

    Args:
        example: Mapping with 'instruction' and 'output' entries and an
            optional 'input' entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the prompt, called
        with max-length padding and truncation at 512 tokens.
    """
    instruction = example['instruction']
    response = example['output']
    extra_input = example.get('input')

    # Check the value, not just the key: a row can carry an 'input' key whose
    # value is None or blank (presumably the case for records that lacked the
    # field once they share a column with records that had it). Such rows must
    # not get a literal "### Input:\nNone" section.
    if extra_input is not None and str(extra_input).strip():
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{extra_input}\n\n### Response:\n{response}"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return tokenizer(prompt, padding="max_length", truncation=True, max_length=512)

# Apply format_prompt to every row of the dataset (one example per call).
tokenized_dataset = dataset.map(function=format_prompt)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
import torch
import os

# --- Run configuration ---
model_name = "microsoft/phi-2"
output_dir = "/content/drive/MyDrive/financial AI advisor/phi2-finetuned"
# Not referenced again in this cell; kept so the name stays defined.
checkpoint_dir = os.path.join(output_dir, "checkpoint-0")  # or just output_dir if you save in-place
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Tokenizer (EOS token reused for padding) ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# --- bitsandbytes 8-bit quantization settings ---
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
)

# --- Base model, loaded with the quantization config ---
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# --- LoRA adapters ---
# No target_modules are given here, so the choice of adapted layers is left to
# PEFT.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# --- Gradient checkpointing on; the generation cache is switched off ---
model.gradient_checkpointing_enable()
model.config.use_cache = False

# --- Collator: causal-LM batches (masked-LM objective disabled) ---
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# --- Training hyper-parameters ---
training_args = TrainingArguments(
    # Where checkpoints and the final model go.
    output_dir=output_dir,
    overwrite_output_dir=True,
    # Batch size 1 with 8 accumulation steps per optimizer update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    # Checkpoint every 400 steps, keeping at most two on disk.
    save_strategy="steps",
    save_steps=400,
    save_total_limit=2,
    # Logging stays local; no external reporting integrations.
    logging_dir="./logs",
    report_to="none",
)

# --- Trainer wiring ---
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# === Train with Checkpoint Fallback ===
# Let transformers pick the newest valid checkpoint directory, if any.
# A plain startswith("checkpoint-") test also matches stray files or
# non-numeric names, after which resume_from_checkpoint=True fails because no
# valid checkpoint can be found. The isdir guard avoids listing a directory
# that does not exist.
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None

if last_checkpoint is not None:
    print("✅ Found checkpoint — resuming training...")
    # Resume from the explicit path that was just validated.
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("🚀 No checkpoint found — starting from scratch...")
    trainer.train()

# === Save final model and tokenizer ===
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this runtime.
cuda_available = torch.cuda.is_available()
print(cuda_available)
