In [5]:
from project import create_aligned_corpus, align, map_tokens, smooth_mapping, remap_model
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

source_model = "meta-llama/Meta-Llama-3-8B"

target_tokenizer = "meta-llama/Meta-Llama-3-8B"
export_dir = "en-hi-llama3-8b"

corpus = create_aligned_corpus(
    source_language="en",
    target_language="hi",
    source_tokenizer=source_model,
    target_tokenizer=target_tokenizer,
)

mapped_tokens_file = align(corpus, fast_align_path="fast_align/build/fast_align")

tokenized_possible_translations, untokenized_possible_translations = map_tokens(mapped_tokens_file, source_model, target_tokenizer)

smoothed_mapping = smooth_mapping(target_tokenizer, tokenized_possible_translations)

model = remap_model(source_model, target_tokenizer, smoothed_mapping, source_model)
os.makedirs(export_dir, exist_ok=False)
new_tokenizer = AutoTokenizer.from_pretrained(target_tokenizer)
model.save_pretrained(export_dir)
new_tokenizer.save_pretrained(export_dir)

data already aligned
corpus already aligned
Mapping tokens


100%|██████████| 4532830/4532830 [00:05<00:00, 783953.01it/s]


Number of tokens with a translation: 128256
Number of new tokens: 128256
Percentage of tokens with a translation: 100.0%


100%|██████████| 128256/128256 [00:00<00:00, 1622876.63it/s]


Loading the source model...


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Remapping the model...


100%|██████████| 128256/128256 [00:10<00:00, 12167.02it/s]


[2024-08-19 21:29:11,021] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


('en-hi-llama3-8b/tokenizer_config.json',
 'en-hi-llama3-8b/special_tokens_map.json',
 'en-hi-llama3-8b/tokenizer.json')

In [6]:
import torch
torch.save(model.state_dict(), os.path.join(export_dir, "pytorch_model.bin"))
model.config.save_pretrained(export_dir)
new_tokenizer.save_pretrained(export_dir)

('en-hi-llama3-8b/tokenizer_config.json',
 'en-hi-llama3-8b/special_tokens_map.json',
 'en-hi-llama3-8b/tokenizer.json')

In [8]:
# Push to Hugging Face Hub
import huggingface_hub as hf_hub
# basemodel_path = "en-hi-llama3-8b"
# loramodel_path = "danwils/mala-alpaca"
output_model_name = "en-hi-llama3-8b"
# hf_token = "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"

repo = hf_hub.create_repo(output_model_name, private=False)  # Set private=False if you want it to be public
hf_hub.upload_folder(
    folder_path=output_model_name,
    path_in_repo='subhrokomol/Meta-Llama-3-8B-Hindi',  # Root of the repo
    repo_id=f"{hf_hub.whoami()['name']}/{output_model_name}"
)

  0%|          | 0/8 [00:00<?, ?it/s]

(…)B-Hindi/model-00001-of-00007.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

(…)B-Hindi/model-00002-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

(…)B-Hindi/model-00003-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

(…)B-Hindi/model-00004-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

(…)B-Hindi/model-00005-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

(…)B-Hindi/model-00006-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

(…)B-Hindi/model-00007-of-00007.safetensors:   0%|          | 0.00/2.57G [00:00<?, ?B/s]

(…)/Meta-Llama-3-8B-Hindi/pytorch_model.bin:   0%|          | 0.00/32.1G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/subhrokomol/en-hi-llama3-8b/commit/0ce6b5b6763db44c4308ffb94b5c826455aeef3e', commit_message='Upload folder using huggingface_hub', commit_description='', oid='0ce6b5b6763db44c4308ffb94b5c826455aeef3e', pr_url=None, pr_revision=None, pr_num=None)