In [1]:
!pip install -q --upgrade huggingface_hub transformers
!pip install -q accelerate peft trl bitsandbytes datasets wandb

[0m

In [2]:
SEED = 2024
BLOCK_SIZE = 256

MODEL_NAME = "google/gemma-2-2b"
NEW_MODEL_NAME = "vi-gemma-2-2b"
DATASET_NAME = "vietgpt/wikipedia_vi"
HF_TOKEN_READ = "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"
HF_TOKEN_WRITE = "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"
WANDB_TOKEN = "02b82f496321becca227a522b17fe7b965d7e20b"
OUTPUT_DIR = "./outputs"

In [3]:
from huggingface_hub import login
import wandb
login(HF_TOKEN_READ)
wandb.login(key = WANDB_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Step 1: Load and preprocess dataset

## 1.1: Load dataset

In [4]:
from datasets import load_dataset
dataset = load_dataset(DATASET_NAME, split = "all").shuffle(seed = SEED).select(range(200_000))
print(dataset)

README.md:   0%|          | 0.00/632 [00:00<?, ?B/s]

(…)-00000-of-00003-6218d2963e302058.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

(…)-00001-of-00003-12e6c4fadbec91d4.parquet:   0%|          | 0.00/55.2M [00:00<?, ?B/s]

(…)-00002-of-00003-175fcfe1c45b0b85.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1284930 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'revid', 'url', 'title', 'text'],
    num_rows: 200000
})


## 1.2: Preprocess dataset

In [5]:
from random import sample
raw_text = dataset["text"]
sample(raw_text, 10)

['Diplazium polyrhizon là một loài dương xỉ trong họ Athyriaceae. Loài này được Sledge mô tả khoa học đầu tiên năm 1962.\nDanh pháp khoa học của loài này chưa được làm sáng tỏ. ',
 'Mai Chửng (1940 - 2001), tên thật Nguyễn Mai Chửng, là một điêu khắc gia có nhiều tác phẩm theo trường phái hiện đại được chọn bày nơi công cộng. Tuy nhiên, nhiều tác phẩm của ông đã bị phá hủy.\nÔng sinh tại Bình Định và theo học Cao đẳng Mỹ thuật Huế với điêu khắc gia Lê Ngọc Huệ mới từ Paris về. Tốt nghiệp Cao đẳng Mỹ thuật Huế năm 1961, ông tiếp tục học tại Quốc gia Cao đẳng Mỹ thuật ở Sài Gòn, tốt nghiệp năm 1963.\nNăm 1968, ông bắt đầu dạy ở trường Quốc gia Cao đẳng Mỹ thuật và năm 1974 tại Đại học Kiến trúc Sài Gòn. Năm 1975 ông bị bắt học tập cải tạo, được thả cuối năm 1978.\nÔng vượt biên và tới Mỹ năm 1981, sống ở Hawaii và sau đó Texas. Tại Texas, ông thực hiện một loạt tác phẩm điêu khắc bằng kim loại và bằng đất nung. Tháng 7 năm 2001, ông tham gia một cuộc triển lãm lớn của các họa sĩ, điêu kh

In [6]:
import re
def remove_html(text):
    return re.sub(r'<[^>]*>', '', text)

In [7]:
# Vietnamese Unicode Normalize
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"

def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split('|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split('|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic

dicchar = loaddicchar()

In [8]:
def convert_unicode(text):
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar[x.group()], text)

In [9]:
# Get Vietnamese stopwords
def get_stopwords_list(stop_file_path):
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return list(frozenset(stop_set))

stopwords_path = "./vietnamese-stopwords.txt"
stopwords = get_stopwords_list(stopwords_path)
print(f"Total number of stopwords: {len(stopwords)}")

Total number of stopwords: 1942


In [10]:
def remove_stop_words(sentence_list):
    for i in range(len(sentence_list)):
        word_tokens = sentence_list[i].split(" ")
        sentence_list[i] = " ".join([word for word in word_tokens if word not in stopwords])
    return sentence_list

In [11]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [12]:
import string
def remove_punctuation(text):
    PUNCT_TO_REMOVE = string.punctuation
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [13]:
raw_text = [remove_urls(text) for text in raw_text]
raw_text = [remove_html(text) for text in raw_text]
raw_text = [text.replace("\n", " ") for text in raw_text]
raw_text = [text.replace("_", " ").strip() for text in raw_text]
raw_text = [convert_unicode(text) for text in raw_text]
raw_text = [remove_punctuation(text) for text in raw_text]
raw_text = remove_stop_words(raw_text)
raw_text = [text.lower() for text in raw_text]

In [14]:
sample(raw_text, 10)

['điện mặt trời bmt máy điện mặt trời xây dựng đất xã ea phê krông búk huyện krông pắc tỉnh đắk lắk việt nam điện mặt trời bmt công suất lắp máy 30 mwp khởi công 8 2018 khánh thành 4 2019 diện tích 345 ha chân đập krông búk hạ điện bình quân hàng 44 mwhnăm',
 'plaveč làng huyện znojmo jihomoravský cộng hòa séc',
 'lee joongi hangul이준기 sinh 17 4 1982 nam diễn viên mẫu ca sĩ hàn quốc lee joongi bắt đầu nổi tiếng vai diễn gonggil nhà vua chàng hề kể tham gia phim truyền hình my girl 2005 time between dog and wolf 2007 huyền thoại iljimae 2008 arang sử đạo truyện 2012 two weeks 2013 gunman in joseon 2014 thư sinh bóng đêm 2015  2016 hành vi phạm tội 2017 luật sư vô pháp 2018 hoa quỷ 2020 again my life 2022 năm 2008 lee joongi tổ chức du lịch hàn quốc đại sứ du lịch quảng bá làn sóng hallyu vươn toàn châu á tiểu sử lee joongi sinh 17 4 1982 busan hàn quốc sau changwon sinh sống học tập khi lee joongi ước mơ trở thành lập trình viên máy lần đầu tiên cảm hứng thú nghệ thuật kịch ‘hamlet’ thời

In [15]:
from datasets import Dataset
dataset = Dataset.from_dict({"text": raw_text})
dataset = dataset.flatten()
dataset = dataset.train_test_split(test_size = 0.2, seed = SEED)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 160000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 40000
    })
})


In [16]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.add_eos_token = True

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [17]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["text"]])

tokenized_dataset = dataset.map(
    preprocess_function,
    batched = True,
    num_proc = 4,
    remove_columns = dataset["train"].column_names,
)

print(tokenized_dataset)

Map (num_proc=4):   0%|          | 0/160000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/40000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 160000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 40000
    })
})


In [18]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k],[]) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    """
    Ensures that the final dataset consists of chunks of exactly BLOCK_SIZE tokens,
    by trimming off any remainder that doesn't fit into a full block.
    """
    if total_length >= BLOCK_SIZE:
        total_length = (total_length // BLOCK_SIZE) * BLOCK_SIZE

    # Split by chunks of BLOCK_SIZE.
    result = {
        k: [t[i : i + BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched = True, num_proc = 4)
print(lm_dataset)

Map (num_proc=4):   0%|          | 0/160000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/40000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 312288
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 75673
    })
})


In [19]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False, pad_to_multiple_of = 8)

# Step 2: Preparing and Training model

## 2.1 Load model and create PEFT Model

In [20]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype = torch.bfloat16,
    attn_implementation = 'eager',
    device_map = device
)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [21]:
def find_all_linear_names(model):
    cls = torch.nn.Linear
    # cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:
        lora_module_names.remove("lm_head")
    return list(lora_module_names)

In [22]:
from peft import LoraConfig, get_peft_model
peft_config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    lora_dropout = 0.1,
    bias = "none",
    task_type = "CAUSAL_LM",
    target_modules = find_all_linear_names(model),
    use_rslora = True,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 41,533,440 || all params: 2,655,875,328 || trainable%: 1.5638


## 2.2 Training model

In [23]:
training_arguments = TrainingArguments(
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 2,
    num_train_epochs = 1,
    warmup_steps = 10,
    max_steps = -1,
    logging_steps = 1,
    save_steps = 0.1,
    eval_steps = 0.1,

    learning_rate = 2e-5,
    weight_decay = 0.1,

    fp16 = False,
    bf16 = False,
    load_best_model_at_end = True,
    remove_unused_columns = False,

    eval_strategy = "steps",
    save_strategy = "steps",
    optim = "paged_adamw_32bit",
    lr_scheduler_type = "cosine",
    report_to = "wandb",
    run_name = "Fine-tune-Gemma2-2B",
    output_dir = OUTPUT_DIR,
)

trainer = Trainer(
    model = model,
    args = training_arguments,
    train_dataset = lm_dataset["train"],
    eval_dataset = lm_dataset["test"],
    data_collator = data_collator,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
import math
#evaluate the baseline model
initial_results = trainer.evaluate()
print(initial_results)
print(f"Baseline {MODEL_NAME} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")

{'eval_loss': 4.147630214691162, 'eval_model_preparation_time': 0.0167, 'eval_runtime': 1578.2402, 'eval_samples_per_second': 47.948, 'eval_steps_per_second': 2.997}
Baseline google/gemma-2-2b Results: Perplexity: 63.28


In [25]:
import os

# fine-tune the pre-trained model with Vietnamese dataset
for _ in range(1_000_000): torch.cuda.empty_cache()
trainer.train()

#evaluate the fine-tuned model
eval_results = trainer.evaluate()
print(f"Fine-tuned {NEW_MODEL_NAME} Results:{eval_results}\n")

perplexity = math.exp(eval_results['eval_loss'])
eval_results['perplexity'] = perplexity
print(f"Fine-tuned {NEW_MODEL_NAME} Results: Perplexity: {perplexity:.2f}")

os.makedirs(NEW_MODEL_NAME, exist_ok=True)
trainer.save_model(NEW_MODEL_NAME)

wandb.finish()
model.config.use_cache = True

Step,Training Loss,Validation Loss,Model Preparation Time
976,1.1085,1.135606,0.0167
1952,1.0959,1.066904,0.0167
2928,0.9739,1.028366,0.0167
3904,1.0178,1.004256,0.0167
4880,0.9982,0.986322,0.0167
5856,0.9846,0.973348,0.0167
6832,0.873,0.96363,0.0167
7808,1.0137,0.957626,0.0167
8784,0.9191,0.954655,0.0167


Fine-tuned vi-gemma-2-2b Results:{'eval_loss': 0.9546546339988708, 'eval_model_preparation_time': 0.0167, 'eval_runtime': 1580.1615, 'eval_samples_per_second': 47.889, 'eval_steps_per_second': 2.993, 'epoch': 1.0}

Fine-tuned vi-gemma-2-2b Results: Perplexity: 2.60


VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▁▁▁▁▁▁▁▁▁▁
eval/model_preparation_time,▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▄█▆▅▅▆▅▃██
eval/samples_per_second,█▅▁▃▄▄▃▄▆▁▁
eval/steps_per_second,█▆▁▃▄▄▃▄▆▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇█████
train/grad_norm,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,█████████▇▇▇▇▇▇▇▆▆▆▅▅▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
train/loss,██▇▆▄▄▅▅▄▃▃▃▄▃▄▅▂▂▃▂▃▂▁▃▂▃▂▂▂▂▂▂▃▁▂▂▃▃▁▃

0,1
eval/loss,0.95465
eval/model_preparation_time,0.0167
eval/runtime,1580.1615
eval/samples_per_second,47.889
eval/steps_per_second,2.993
total_flos,9.910318650139608e+17
train/epoch,1.0
train/global_step,9759.0
train/grad_norm,2.25184
train/learning_rate,0.0


# Step 3: Merge LoRA and push to HuggingFace

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import login

MODEL_NAME = "google/gemma-2-2b"
NEW_MODEL_NAME = "vi-gemma-2-2b"
HF_TOKEN_WRITE = "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"
login(HF_TOKEN_WRITE)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    low_cpu_mem_usage = True,
    return_dict = True,
    torch_dtype = torch.float16,
    device_map = "auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
model = PeftModel.from_pretrained(base_model, NEW_MODEL_NAME)
model = model.merge_and_unload()

In [4]:
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps

In [5]:
model.push_to_hub(NEW_MODEL_NAME, use_temp_dir = False)
tokenizer.push_to_hub(NEW_MODEL_NAME, use_temp_dir = False)

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: cfd572fa-9bc3-4c19-b217-8122fb95e5a8)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/ca/3c/ca3cd35954eba5330279581bde6a959aa77bca8f1edc71fb296584fc27da3372/4e6bc91105f9aa76daf908b5d3a36cbef71f816bfdea09b6e60cc0b6ecbbb15e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20241013%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20241013T024807Z&X-Amz-Expires=86400&X-Amz-Signature=61e17f2122fd17176a1abc1d22b848d85d02587908ff9ff88fb78f5331957e5d&X-Amz-SignedHeaders=host&partNumber=3&uploadId=8_48krHqE26TznG9v5uOFUbyMPHg2D4p7jSOWxIkb.Q70F2bY_0oTCLHzGMAgkY9Eh2po10zIxDucLazAz03uA_BdZ38HkKpEOJUk9HcOWh1wj82M3fMQ2poHmTMh2lw&x-id=UploadPart
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Hamana0509/vi-gemma-2-2b/commit/e440b386f921c95145624dfa507033f462f2e559', commit_message='Upload tokenizer', commit_description='', oid='e440b386f921c95145624dfa507033f462f2e559', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Hamana0509/vi-gemma-2-2b', endpoint='https://huggingface.co', repo_type='model', repo_id='Hamana0509/vi-gemma-2-2b'), pr_revision=None, pr_num=None)