In [10]:
from safetensors.torch import load_file
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from sagemaker.pytorch import PyTorch

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
import unsloth

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Loading the Finetuned Model
SAVED_MODEL is the lora huggingface repo of the model that has been fine-tuned. INFER_MODEL is the huggingface repo of the model that has the lora weights merged with the base weights for inferring.

In [5]:
SAVED_MODEL = "Alexis-Az/Story-Generation-LlaMA-3.1-8B-10k"
INFER_MODEL= "Alexis-Az/Story-Generation-Model"
max_seq_length = 1024

In [None]:
adapter_model, tokenizer = unsloth.FastLanguageModel.from_pretrained(SAVED_MODEL, load_in_4bit=True)

In [5]:
adapter_model.save_pretrained_merged("Story-Generation-LlaMA-3.1-8B-10k", tokenizer)

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 7.74 out of 15.43 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 78%|███████▊  | 25/32 [00:00<00:00, 34.04it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:04<00:00,  7.35it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving Story-Generation-LlaMA-3.1-8B-10k/pytorch_model-00001-of-00004.bin...
Unsloth: Saving Story-Generation-LlaMA-3.1-8B-10k/pytorch_model-00002-of-00004.bin...
Unsloth: Saving Story-Generation-LlaMA-3.1-8B-10k/pytorch_model-00003-of-00004.bin...
Unsloth: Saving Story-Generation-LlaMA-3.1-8B-10k/pytorch_model-00004-of-00004.bin...
Done.


In [1]:
!ls

'Finetuning Story Generation Model.ipynb'   hf_auth.ipynb
 README.md				    inference_container
 Story-Generation-LlaMA-3.1-8B-10k	    load_data.ipynb
 feature_engineering.ipynb		    unsloth_compiled_cache
 format_pth_model.ipynb


## Saving the Merged Model 
The model will be pushed to the INFER_MODEL huggingface repo, and will also be saved as a model .pth file for use in the container for inferrence.

In [14]:
pth_model = AutoModelForCausalLM.from_pretrained("./Story-Generation-LlaMA-3.1-8B-10k")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("./Story-Generation-LlaMA-3.1-8B-10k")

In [8]:
pth_model.push_to_hub(
    INFER_MODEL,
    tokenizer=tokenizer,
    safe_serialization=True,
    create_pr=True,
    max_shard_size="3GB",
)

Upload 12 LFS files:   0%|          | 0/12 [00:00<?, ?it/s]

model-00002-of-00012.safetensors:   0%|          | 0.00/2.79G [00:00<?, ?B/s]

model-00004-of-00012.safetensors:   0%|          | 0.00/2.85G [00:00<?, ?B/s]

model-00001-of-00012.safetensors:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

model-00003-of-00012.safetensors:   0%|          | 0.00/2.85G [00:00<?, ?B/s]

model-00005-of-00012.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

model-00006-of-00012.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00007-of-00012.safetensors:   0%|          | 0.00/2.85G [00:00<?, ?B/s]

model-00008-of-00012.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

model-00009-of-00012.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

model-00010-of-00012.safetensors:   0%|          | 0.00/2.85G [00:00<?, ?B/s]

model-00011-of-00012.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model-00012-of-00012.safetensors:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Alexis-Az/Story-Generation-Model/commit/90ade79e95db7467f65c34e0cdab9863e95d7415', commit_message='Upload LlamaForCausalLM', commit_description='', oid='90ade79e95db7467f65c34e0cdab9863e95d7415', pr_url='https://huggingface.co/Alexis-Az/Story-Generation-Model/discussions/1', repo_url=RepoUrl('https://huggingface.co/Alexis-Az/Story-Generation-Model', endpoint='https://huggingface.co', repo_type='model', repo_id='Alexis-Az/Story-Generation-Model'), pr_revision='refs/pr/1', pr_num=1)

## Saving Model Artifacts Locally

In [23]:
pt_path = './Story-Generation-LlaMA.pt'

In [26]:
pth_model.save_pretrained(pt_path)