In [None]:
# Install required packages (datasets, bitsandbytes, optuna)
!pip install datasets
!pip install bitsandbytes
!pip install optuna

# Import required libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import TrainerCallback
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import Dataset, DatasetDict
import torch
import os
import subprocess
import threading
import time
from google.colab import userdata
import gc
import optuna

from huggingface_hub import login
# from kaggle_secrets import UserSecretsClient

import torch
# Report whether PyTorch can see a CUDA device
print(torch.cuda.is_available())

# Get the Hugging Face token from the notebook's secret store.
# Kaggle variant (disabled):
# user_secrets = UserSecretsClient()
# hf_token = user_secrets.get_secret("HF_TOKEN") # kaggle

hf_token = userdata.get("HF_TOKEN") # google colab

# Log in to Hugging Face with that token
login(token=hf_token)

# Callback to track epoch duration
class EpochTimeTracker(TrainerCallback):
    """Trainer callback that prints each epoch's duration and the latest logged losses."""

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Record the wall-clock time at which the epoch starts."""
        self.epoch_start_time = time.time()

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print how long the epoch took, then the most recent loss values.

        The losses are the last 'loss' and 'eval_loss' entries found in
        ``state.log_history``; an empty string is printed for either one
        that has not been logged yet. The second line is labelled with
        ``state.epoch - 1``.
        """
        elapsed = time.time() - self.epoch_start_time
        print(f"Epoch {state.epoch} took {elapsed:.2f} seconds.")

        # Walk the history backwards so the first hit is the newest entry.
        history = state.log_history
        latest_loss = next(
            (entry['loss'] for entry in reversed(history) if 'loss' in entry), ""
        )
        latest_eval_loss = next(
            (entry['eval_loss'] for entry in reversed(history) if 'eval_loss' in entry), ""
        )

        print(f"Epoch {state.epoch - 1}, loss: {latest_loss}, eval_loss: {latest_eval_loss}")

def load_dataset(chunk_size, data_dir="/content/transcripts", output_file="/content/processed_text.txt"):
    """Build a train/test ``DatasetDict`` of fixed-size text chunks.

    Every regular file in ``data_dir`` is read as UTF-8 and concatenated,
    only the first half of the combined text is kept, that text is written
    to ``output_file``, and it is then cut into pieces of ``chunk_size``
    characters. The first 80% of the chunks become the "train" split and
    the rest the "test" split (no shuffling).

    Args:
        chunk_size: Number of characters per chunk; must be positive.
        data_dir: Directory holding the transcript files. The default is
            the Google Colab location (on Kaggle this was
            "/kaggle/input/hunting-beast-youtube-transcripts/transcripts/concise").
        output_file: Where the combined text is written. The default is
            the Google Colab location (on Kaggle this was
            "/kaggle/working/processed_text.txt").

    Returns:
        A ``DatasetDict`` with "train" and "test" splits, each holding a
        single "text" column.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # concatenated text -- and therefore the train/test split -- identical
    # from one run (and one Optuna trial) to the next.
    parts = []
    for file_name in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, file_name)
        if not os.path.isfile(file_path):
            continue  # skip sub-directories such as .ipynb_checkpoints
        with open(file_path, "r", encoding="utf-8") as f:
            parts.append(f.read())
    all_text = "".join(parts)

    # Only the first half of the corpus is used.
    all_text = all_text[:(len(all_text) // 2)]

    word_count = len(all_text.split())  # Counts words
    print("Word count: ", word_count)

    # Save the `all_text` to the file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(all_text)

    text_chunks = [{"text": all_text[i:i + chunk_size]} for i in range(0, len(all_text), chunk_size)]
    full_dataset = Dataset.from_list(text_chunks)

    # Sequential 80/20 split: leading chunks train, trailing chunks test.
    split_ratio = 0.8
    train_size = int(split_ratio * len(full_dataset))
    train_dataset = full_dataset.select(range(train_size))
    test_dataset = full_dataset.select(range(train_size, len(full_dataset)))

    return DatasetDict({"train": train_dataset, "test": test_dataset})


def load_model():
    """Load the base model and tokenizer, downloading them on first use.

    A local copy under ``model_path`` is tried first. If it cannot be
    loaded, the model and tokenizer are fetched by ``model_name`` and
    saved to ``model_path`` for later calls. The tokenizer's pad token is
    set to its EOS token before returning.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    # model_path = "/kaggle/working/base_model"  # kaggle
    model_path = "/content/base_model"  # google colab

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)
    except (OSError, ValueError) as err:
        # Only treat "local copy missing or unreadable" as a cache miss.
        # A bare except would also swallow KeyboardInterrupt/SystemExit
        # and hide the reason for the fallback.
        print(f"Local model not usable ({type(err).__name__}); downloading...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print("Downloaded and saved model...")
    else:
        print("Loaded model from local directory...")

    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Fine-tuning function
def fine_tune_LLM(optuna_params, best_eval_loss):
    """Fine-tune the base model with LoRA for one hyperparameter set.

    Loads the dataset and base model, wraps the model with a LoRA
    configuration, trains it with the Hugging Face ``Trainer`` and
    evaluates it on the test split. If the resulting eval loss beats
    ``best_eval_loss``, the model, tokenizer and trainer state are written
    to "/content/best_model".

    Args:
        optuna_params: Dict with the keys "lr", "batch_size", "num_epochs",
            "lora_r", "lora_alpha", "lora_dropout", "weight_decay" and
            "gradient_accumulation_steps". An optional "warmup_ratio"
            key overrides the default of 0.1.
        best_eval_loss: Lowest eval loss seen so far; used only to decide
            whether this run's model should be saved.

    Returns:
        The eval loss of the trained model on the test split.
    """
    dataset = load_dataset(chunk_size=1024)

    model, tokenizer = load_model()  # load model and tokenizer
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model.to(device)

    # Print word count for train and test datasets
    train_words = sum(len(example['text'].split()) for example in dataset['train'])
    test_words = sum(len(example['text'].split()) for example in dataset['test'])
    print(f"Train dataset word count: {train_words}, Test dataset word count: {test_words}")

    def tokenize_function(examples):
        # No padding here: the data collator pads each batch to its own
        # longest sequence. Padding every example to 2048 tokens made each
        # step process mostly pad tokens, which are masked out of the loss
        # anyway.
        return tokenizer(examples["text"], truncation=True, max_length=2048)

    tokenized_data = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    model.train()
    model = prepare_model_for_kbit_training(model)

    peft_config = LoraConfig(
        r=optuna_params["lora_r"],
        lora_alpha=optuna_params["lora_alpha"],
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
        lora_dropout=optuna_params["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    tokenizer.pad_token = tokenizer.eos_token
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    training_args = TrainingArguments(
        # output_dir="/kaggle/working/fine_tuned_model",
        output_dir="/content/fine_tuned_model",
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant': False},
        overwrite_output_dir=True,
        learning_rate=optuna_params["lr"],
        per_device_train_batch_size=optuna_params["batch_size"],
        per_device_eval_batch_size=optuna_params["batch_size"],
        num_train_epochs=optuna_params["num_epochs"],
        weight_decay=optuna_params["weight_decay"],
        # Absolute path next to the other Colab outputs (was a relative
        # Kaggle-style path).
        logging_dir="/content/logs",  # TensorBoard logs directory
        # Report to TensorBoard only (needs the tensorboard package).
        # Otherwise other installed integrations such as W&B attach
        # themselves and can block the run on an interactive login prompt.
        report_to="tensorboard",
        logging_strategy="epoch",  # Log at the end of each epoch
        eval_strategy="epoch",
        save_strategy="epoch",
        gradient_accumulation_steps=optuna_params["gradient_accumulation_steps"],
        # Warm up for a fraction of the run. A fixed 500 warmup steps is
        # longer than the whole run for this dataset size, so the sampled
        # learning rate was never reached.
        warmup_ratio=optuna_params.get("warmup_ratio", 0.1),
        fp16=use_cuda,  # fp16 mixed precision needs a GPU
        optim="paged_adamw_8bit",
    )

    # Create the Trainer with the per-epoch timing callback
    trainer = Trainer(
        model=model,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["test"],
        args=training_args,
        data_collator=data_collator,
        callbacks=[EpochTimeTracker()]
    )

    print("\nBeginning to train model...")
    trainer.train()

    eval_results = trainer.evaluate()
    eval_loss = eval_results["eval_loss"]

    # Save best model
    print("best eval loss", best_eval_loss)
    if eval_loss < best_eval_loss:
        # model_path = "/kaggle/working/best_model"
        model_path = "/content/best_model"
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print(f"New best model saved with eval_loss: {eval_loss}")

        # Save trainer state to a text file
        trainer_state_file = os.path.join(model_path, "trainer_state.txt")
        with open(trainer_state_file, "w", encoding="utf-8") as f:
            f.write(str(trainer.state))

    # Clear GPU: drop the references first, collect garbage, and only then
    # ask the CUDA allocator to release its cached blocks.
    del model
    del tokenizer
    del trainer
    del data_collator
    del dataset
    del tokenized_data
    gc.collect()
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    return eval_loss

def objective(trial):
    global best_eval_loss

    optuna_params = {
    "lr": trial.suggest_float("lr", 5e-5, 1e-1, log=True),
    "batch_size": 2,
    "num_epochs": 10,
    "lora_r": trial.suggest_int("lora_r", 2, 64),
    "lora_alpha": trial.suggest_int("lora_alpha", 8, 128),
    "lora_dropout": trial.suggest_float("lora_dropout", 0.0, 0.5),
    "weight_decay": trial.suggest_float("weight_decay", 1e-6, 0.1, log=True),
    "gradient_accumulation_steps": trial.suggest_int("gradient_accumulation_steps", 8, 32)
    }

    # Print hyperparameters for this trial
    print("\n\n\n")
    print("Starting trial with hyperparameters:")
    print(f"Learning Rate: {optuna_params['lr']}")
    print(f"Batch Size: {optuna_params['batch_size']}")
    print(f"Number of Epochs: {optuna_params['num_epochs']}")
    print(f"Lora Rank (r): {optuna_params['lora_r']}")
    print(f"Lora Alpha: {optuna_params['lora_alpha']}")
    print(f"Lora Dropout: {optuna_params['lora_dropout']}")
    print(f"Weight Decay: {optuna_params['weight_decay']}")
    print(f"Gradient Accumulation Steps: {optuna_params['gradient_accumulation_steps']}")

    eval_loss = fine_tune_LLM(optuna_params, best_eval_loss)

    # update best eval_loss
    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss

    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()
    gc.collect()
    time.sleep(10)
    !nvidia-smi
    time.sleep(5)

    return eval_loss


#_______________ ### MAIN ### _______________
# Emit Optuna's INFO-level log messages.
optuna.logging.set_verbosity(optuna.logging.INFO)

# Lowest eval loss seen so far; `objective` reads and updates this global.
best_eval_loss = float("inf")

# Run a two-trial search that minimizes the value returned by `objective`.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=2)

# Report the winning configuration.
print(f"Best hyperparameters: {study.best_params}")
print(f"Best trial: {study.best_trial}, Best Value: {study.best_value}")

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

[I 2025-03-07 16:00:10,723] A new study created in memory with name: no-name-76144847-b4c8-4bd7-bced-dd22d37b8cf9






Starting trial with hyperparameters:
Learning Rate: 7.25411784475576e-05
Batch Size: 2
Number of Epochs: 10
Lora Rank (r): 45
Lora Alpha: 45
Lora Dropout: 0.4769870635970819
Weight Decay: 0.0020294047650912354
Gradient Accumulation Steps: 13
Word count:  61839


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Downloaded and saved model...
Train dataset word count: 49692, Test dataset word count: 12337


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

ERROR:bitsandbytes.cextension:Could not load bitsandbytes native library: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /usr/local/lib/python3.11/dist-packages/bitsandbytes/libbitsandbytes_cpu.so)
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/bitsandbytes/cextension.py", line 85, in <module>
    lib = get_native_library()
          ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/bitsandbytes/cextension.py", line 72, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 454, in LoadLibrary
    return self._dlltype(name)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4

trainable params: 31,703,040 || all params: 1,267,517,440 || trainable%: 2.5012





Beginning to train model...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Directory the training cell writes its best model and tokenizer to.
# model_path = "/kaggle/working/best_model"  # kaggle
model_path = "/content/best_model"  # google colab

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model from disk, then move the model to the device.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)

def generate_response(prompt, max_length=1024, temperature=0.7, top_p=0.9):
    """Sample a text completion for ``prompt`` from the module-level model.

    Args:
        prompt: Text to feed to the model.
        max_length: Passed to ``model.generate`` as ``max_length``.
        temperature: Passed to ``model.generate`` as ``temperature``.
        top_p: Passed to ``model.generate`` as ``top_p``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    sampling_options = {
        "max_length": max_length,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
    }

    # Generation only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_options)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example query: generate a response for a sample prompt and print it
prompt = "Where do bucks like to bed?"
response = generate_response(prompt)
print(response)