# Basic Imports

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
import json
import subprocess
import sys
from pathlib import Path

# Installing Dependencies

In [5]:
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "trl", "transformers", "peft", "datasets", "bitsandbytes"])


0

# Imports Libraries

In [None]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
import numpy as np

# CONFIGURATION

In [None]:
MAX_SEQ_LENGTH=2048
DTYPE=torch.float16
LOAD_IN_4BIT=True
OUTPUT_DIR="/content/drive/MyDrive/final_model"
CHECKPOINT_DIR = "/content/drive/MyDrive/checkpoints"
EPOCHS=3
BATCH_SIZE=1
GRADIENT_ACCUMULATION_STEPS=16
LEARNING_RATE=2e-4
SAVE_STEPS=1000
DATASET_SIZE=8000
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

os.makedirs(OUTPUT_DIR,exist_ok=True)
os.makedirs(CHECKPOINT_DIR,exist_ok=True)

# DATASET LOADING

In [None]:
def load_and_preprocess(json_path,num_samples=8000):
  data=[]

  if json_path.endswith("jsonl"):
    with open(json_path,"r") as f:
      for line in f:
        data.append(json.loads(line))
  else:
    with open(json_path,"r") as f:
      content=f.read()
      data =json.loads(content)

  data=data[:num_samples]
  np.random.shuffle(data)

  return data

def format_prompt(example):
  prompt=example['prompt']
  completion=example['Completion']
  formatted=f"<s>[INST]{prompt}[/INST]{completion}</s>"
  return {"text":formatted}


dataset_paths=[
    "/content/drive/MyDrive/Data/Custome_twitter.json"
]

dataset_path=None
for path in dataset_paths:
  if os.path.exists(path):
    dataset_path=path
    break



if dataset_path is None or not os.path.exists(dataset_path):
    print("No dataset found at the specified path.")
else:
    raw_data = load_and_preprocess(dataset_path, DATASET_SIZE)
    formatted_data = [format_prompt(item) for item in raw_data]
    dataset = Dataset.from_dict({"text": [item["text"] for item in formatted_data]})
    print(f"Loaded Dataset - {len(dataset)} samples")


 # MODEL LOADING

In [None]:
model,tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=LOAD_IN_4BIT,
)

model=FastLanguageModel.get_peft_model(
    model=model,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42
)

print("Mode is loaded for Fine Tunning")

# CHECKPOINT DETECTION & RESUME LOGIC

In [None]:
def find_latest_checkpoint(checkpoints_dir):
  if os.path.exists(checkpoints_dir):
    return None

  checkpoints=[d for d in os.listdir(checkpoints_dir) if d.startswith("checkpoint-")]
  if not checkpoints:
    return None


  checkpoint_numbers=[int(d.split("-")[1]) for d in checkpoints]
  latest_num=max(checkpoint_numbers)
  latest_checkpoint=os.path.join(checkpoints_dir,f"checkpoint-{latest_num}")
  return latest_checkpoint


resume_from_checkpoint=find_latest_checkpoint(CHECKPOINT_DIR)

if resume_from_checkpoint:
  print(f"Found checkpoint: {resume_from_checkpoint}")
else:
  print("No checkpoint found. Starting fresh training...")
  resume_from_checkpoint = None

# TRAINING ARGUMENTS

In [None]:
trainning_args=TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    learning_rate=LEARNING_RATE,
    fp16=True,
    bf16=False,
    logging_steps=10,
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    optim="adamw_8bit",
    weight_decay=0.01,
    max_grad_norm=1.0,
    seed=42,
    report_to=["tensorboard"]
    )

# SFT TRAINER SETUP

In [None]:
trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=trainning_args,
    packing=False,
)

#  TRAINING

In [None]:
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

# SAVE FINAL MODEL

In [None]:
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = "/content/drive/MyDrive/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

text = "Is it new Iphone worthy to buy"
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Is it new Iphone worthy to buy?
The newest Iphone is out. Is it worth to buy?
Yes, it is. If you can afford it.
Yeah I have the 6s and it’s good. I’m thinking of getting the 7 plus but I don’t know if I want to spend that much on a phone.
@123311 it’s definitely worth it. I’ve had my
