In [1]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install wandb

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.7.6-py3-none-any.whl.metadata (8.1 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.19.1-py3-none-any.whl (376 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from unsloth import FastLanguageModel
from peft import PeftModel
import wandb
import os
wandb.login()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33misraahamieh[0m ([33misraahamieh-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
WANDB_ENTITY = "israahamieh-"
WANDB_PROJECT = "domain-name-finetuning"
ARTIFACT_NAME = "domain-model-full_dataset_higher_capacity_lora:latest"

# Directory to download the W&B artifact to temporarily
DOWNLOAD_PATH = "./downloaded_lora_artifact"
# Directory where the MERGED model will be saved
MERGED_MODEL_SAVE_PATH = "./merged_finetuned_model"
BASE_MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

print("Initializing W&B and downloading LoRA artifact...")
wandb.init(project=WANDB_PROJECT)
artifact = wandb.use_artifact(ARTIFACT_NAME)
download_dir = artifact.download(root=DOWNLOAD_PATH)
print(f"✅ Model downloaded to: {download_dir}")
wandb.finish()

Initializing W&B and downloading LoRA artifact...


[34m[1mwandb[0m: Downloading large artifact domain-model-full_dataset_higher_capacity_lora:latest, 336.53MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:2.3 (148.7MB/s)


✅ Model downloaded to: ./downloaded_lora_artifact


In [4]:
# --- 2. Load the Base Model and LoRA Adapters ---
print(f"Loading base model '{BASE_MODEL_NAME}' and LoRA adapters from '{DOWNLOAD_PATH}'...")
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = DOWNLOAD_PATH, # Load the original base model
        max_seq_length = None, # No specific max_seq_length needed for loading
        dtype = None, # Use bfloat16 if possible
        load_in_4bit = True
    )

    print("Base model and LoRA adapters loaded.")
except Exception as e:
    print(f"Error loading model or adapters: {e}")
    print("Ensure CUDA is available if using 4-bit loading, and artifact structure is correct.")
    exit(1)


Loading base model 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit' and LoRA adapters from './downloaded_lora_artifact'...
==((====))==  Unsloth 2025.7.4: Fast Llama patching. Transformers: 4.53.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Unsloth 2025.7.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Base model and LoRA adapters loaded.


In [5]:
# --- 3. Merge LoRA Adapters into the Base Model ---
print(f"Merging LoRA adapters into the base model and saving to '{MERGED_MODEL_SAVE_PATH}'...")
try:
    model.save_pretrained_merged(
        MERGED_MODEL_SAVE_PATH,
        tokenizer,
        save_method = "merged_16bit"
    )
    print("Model merged and saved successfully!")

    # Optional: Verify the saved directory contents
    print("\nContents of merged model directory:")
    os.system(f"ls -lh {MERGED_MODEL_SAVE_PATH}")

except Exception as e:
    print(f"Error merging or saving model: {e}")
    exit(1)

print(f"Your merged model is ready at: {MERGED_MODEL_SAVE_PATH}")

Merging LoRA adapters into the base model and saving to './merged_finetuned_model'...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/Meta-Llama-3.1-8B-Instruct...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [00:35<01:46, 35.60s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [01:22<01:24, 42.46s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [02:11<00:45, 45.18s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [02:20<00:00, 35.17s/it]


Model merged and saved successfully!

Contents of merged model directory:
Your merged model is ready at: ./merged_finetuned_model


In [6]:
%pip install huggingface_hub



In [7]:
from huggingface_hub import HfApi, notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
api = HfApi()
HF_USER="IsraaH"
HF_MODEL_NAME="domain-name-finetuned-model"
repo_id = f"{HF_USER}/{HF_MODEL_NAME}"
local_dir = "./merged_finetuned_model"

api.create_repo(
        repo_id=repo_id,
        repo_type="model",
        private=False,
        exist_ok=True
    )

print(f"Uploading model to Hugging Face Hub: {repo_id} from {local_dir}")
api.upload_folder(
    folder_path=local_dir,
    repo_id=repo_id,
    repo_type="model",
)
print("Model uploaded successfully!")

Uploading model to Hugging Face Hub: IsraaH/domain-name-finetuned-model from ./merged_finetuned_model


  0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Model uploaded successfully!
