# Dependencies

- %pip install auto-gptq  # (Optional) Compile: https://huggingface.co/TheBloke/falcon-40b-instruct-GPTQ/discussions/5

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling, Trainer
from peft import prepare_model_for_kbit_training, LoraConfig
from datasets import load_dataset

# Data

In [3]:
model_path = "D:/GitHub/101MachineLearning/014_hugging_faces/fine-tuning/shawgpt-ft/model"

In [4]:
dataset = load_dataset("shawhin/shawgpt-youtube-comments")
print(dataset)
print("Columnas del dataset:", dataset["train"].column_names)
print("Ejemplo de una entrada del dataset:", dataset["train"][0])

README.md:   0%|          | 0.00/531 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['example'],
        num_rows: 50
    })
    test: Dataset({
        features: ['example'],
        num_rows: 9
    })
})
Columnas del dataset: ['example']
Ejemplo de una entrada del dataset: {'example': "<s>[INST] ShawGPT, functioning as a virtual data science consultant on YouTube, communicates in clear, accessible language, escalating to technical depth upon request. It reacts to feedback aptly and ends responses with its signature '–ShawGPT'. ShawGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, thus keeping the interaction natural and engaging.\n\nPlease respond to the following comment.\n \nThis is a about as perfect a coverage of this topic as I could imagine. I'm a researcher with a PhD in NLP who trains LLMs from scratch for a living and often find myself in need of communicating the process in a way that's digestible to

# Utils

In [5]:
def eval_comment(model, tokenizer, comment, intstructions_string=None):
    model.eval() # model in evaluation mode (dropout modules are deactivated)

    if intstructions_string is None:
        intstructions_string = f"""ShawGPT, functioning as a virtual data science consultant on YouTube, communicates in clear, accessible language, escalating to technical depth upon request. \
        It reacts to feedback aptly and ends responses with its signature '–ShawGPT'. \
        ShawGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, \
        thus keeping the interaction natural and engaging.

        Please respond to the following comment.
        """

    prompt_template = lambda comment: f'''[INST] {intstructions_string} \n{comment} \n[/INST]'''

    prompt = prompt_template(comment)

    # tokenize input
    inputs = tokenizer(prompt, return_tensors="pt")

    # generate output
    outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=140)

    return tokenizer.batch_decode(outputs)[0]

In [6]:
def tokenize_function(examples, tokenizer):
    # extract text
    text = examples["example"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

# Model

In [7]:
# Configuración del modelo
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Asignación automática de dispositivos CPU/GPU
    # load_in_8bit=True,   # Cargar el modelo en 8 bits para ahorrar memoria, aqui no se usa porque ya está cuantizado
    trust_remote_code=False, # prevents running custom model files on your machine
    revision="main"
)

# Preparar el modelo para el entrenamiento con k-bit
model = prepare_model_for_kbit_training(model)

# Cargar el tokenizador
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)

# setting pad token
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

CUDA extension not installed.
CUDA extension not installed.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Some weights of the model checkpoint at TheBloke/Mistral-7B-Instruct-v0.2-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [8]:
# Data tokenization
tokenized_data = dataset.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)

# For dynamic padding
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [9]:
# Configurar LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model.add_adapter(lora_config)
model.enable_adapters()

In [10]:
trainable_params = 0
total_params = 0
for name, param in model.named_parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
        # print(f"Trainable parameter: {name}, shape: {param.shape}")

print(f"Total trainable parameters: {trainable_params}")
print(f"Total parameters: {total_params}")
print(f"Percentage of trainable parameters: {100 * trainable_params / total_params:.2f}%")

Total trainable parameters: 3407872
Total parameters: 265818112
Percentage of trainable parameters: 1.28%


In [11]:
print("ANSWER: ", eval_comment(model, tokenizer, "Great content, thank you!"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


ANSWER:  <s> [INST] ShawGPT, functioning as a virtual data science consultant on YouTube, communicates in clear, accessible language, escalating to technical depth upon request.         It reacts to feedback aptly and ends responses with its signature '–ShawGPT'.         ShawGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback,         thus keeping the interaction natural and engaging.

        Please respond to the following comment.
         
Great content, thank you! 
[/INST] Thank you for your kind words! I'm glad you found the content helpful. –ShawGPT</s>


Número total de batches: 12 (50 ejemplos en el dataset dividido por el tamaño del batch, que es 4, da 12.5, que se redondea hacia abajo a 12)
Pasos de acumulación de gradiente (gradient_accumulation_steps): 4
Entonces: 12/4 = 3
Esto explica por qué tienes 3 pasos por cada época y un total de 30 pasos.

In [12]:
model.train() # model in training mode (dropout modules are activated)

# enable gradient check pointing
model.gradient_checkpointing_enable()

# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 30

# define training arguments
training_args = TrainingArguments(
    output_dir= "D:/GitHub/101MachineLearning/014_hugging_faces/fine-tuning/shawgpt-ft",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",  # "adamw_torch" adamw_hf paged_adamw_8bit
    # logging_steps=1,
)

from transformers import EarlyStoppingCallback
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [13]:
# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
# trainer.train(resume_from_checkpoint="D:/GitHub/101MachineLearning/014_hugging_faces/fine-tuning/shawgpt-ft/checkpoint-3")

# renable warnings
model.config.use_cache = True

  0%|          | 0/90 [00:00<?, ?it/s]

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 4.4843, 'grad_norm': 7.178397178649902, 'learning_rate': 0.00019772727272727273, 'epoch': 0.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.5266830921173096, 'eval_runtime': 4.0385, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.743, 'epoch': 0.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 3.4204, 'grad_norm': 4.121624946594238, 'learning_rate': 0.00019090909090909092, 'epoch': 1.85}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.6893131732940674, 'eval_runtime': 4.0346, 'eval_samples_per_second': 2.231, 'eval_steps_per_second': 0.744, 'epoch': 1.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.6097, 'grad_norm': 4.521005153656006, 'learning_rate': 0.00018409090909090909, 'epoch': 2.77}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.132157802581787, 'eval_runtime': 4.0228, 'eval_samples_per_second': 2.237, 'eval_steps_per_second': 0.746, 'epoch': 2.77}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.3882, 'grad_norm': 4.3669562339782715, 'learning_rate': 0.000175, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.495288610458374, 'eval_runtime': 4.026, 'eval_samples_per_second': 2.235, 'eval_steps_per_second': 0.745, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.4076, 'grad_norm': 2.2626357078552246, 'learning_rate': 0.0001681818181818182, 'epoch': 4.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.3312331438064575, 'eval_runtime': 4.0404, 'eval_samples_per_second': 2.227, 'eval_steps_per_second': 0.742, 'epoch': 4.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.2179, 'grad_norm': 1.2897199392318726, 'learning_rate': 0.00016136363636363635, 'epoch': 5.85}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.2755106687545776, 'eval_runtime': 4.0382, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.743, 'epoch': 5.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.1317, 'grad_norm': 1.1074258089065552, 'learning_rate': 0.00015454545454545454, 'epoch': 6.77}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.2611925601959229, 'eval_runtime': 4.0285, 'eval_samples_per_second': 2.234, 'eval_steps_per_second': 0.745, 'epoch': 6.77}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.8263, 'grad_norm': 1.3075010776519775, 'learning_rate': 0.00014545454545454546, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.2535288333892822, 'eval_runtime': 4.0374, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.743, 'epoch': 8.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.0267, 'grad_norm': 1.5679476261138916, 'learning_rate': 0.00013863636363636365, 'epoch': 8.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.261828899383545, 'eval_runtime': 4.0264, 'eval_samples_per_second': 2.235, 'eval_steps_per_second': 0.745, 'epoch': 8.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.9388, 'grad_norm': 1.693930745124817, 'learning_rate': 0.0001318181818181818, 'epoch': 9.85}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.2749619483947754, 'eval_runtime': 4.0408, 'eval_samples_per_second': 2.227, 'eval_steps_per_second': 0.742, 'epoch': 9.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.8846, 'grad_norm': 1.696215033531189, 'learning_rate': 0.000125, 'epoch': 10.77}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.292236089706421, 'eval_runtime': 4.0403, 'eval_samples_per_second': 2.228, 'eval_steps_per_second': 0.743, 'epoch': 10.77}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.5816, 'grad_norm': 2.324378490447998, 'learning_rate': 0.00011590909090909093, 'epoch': 12.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.346173644065857, 'eval_runtime': 4.0382, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.743, 'epoch': 12.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.7221, 'grad_norm': 2.3012454509735107, 'learning_rate': 0.00010909090909090909, 'epoch': 12.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.3548121452331543, 'eval_runtime': 4.0368, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.743, 'epoch': 12.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.6459, 'grad_norm': 2.8864376544952393, 'learning_rate': 0.00010227272727272727, 'epoch': 13.85}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4178069829940796, 'eval_runtime': 4.0363, 'eval_samples_per_second': 2.23, 'eval_steps_per_second': 0.743, 'epoch': 13.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.5789, 'grad_norm': 2.7554028034210205, 'learning_rate': 9.545454545454546e-05, 'epoch': 14.77}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4504849910736084, 'eval_runtime': 4.0401, 'eval_samples_per_second': 2.228, 'eval_steps_per_second': 0.743, 'epoch': 14.77}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.3945, 'grad_norm': 3.7143537998199463, 'learning_rate': 8.636363636363637e-05, 'epoch': 16.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.516108512878418, 'eval_runtime': 4.0349, 'eval_samples_per_second': 2.231, 'eval_steps_per_second': 0.744, 'epoch': 16.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.4508, 'grad_norm': 3.7803335189819336, 'learning_rate': 7.954545454545455e-05, 'epoch': 16.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.5768736600875854, 'eval_runtime': 4.0354, 'eval_samples_per_second': 2.23, 'eval_steps_per_second': 0.743, 'epoch': 16.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.3949, 'grad_norm': 2.864140510559082, 'learning_rate': 7.272727272727273e-05, 'epoch': 17.85}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.6176468133926392, 'eval_runtime': 4.0414, 'eval_samples_per_second': 2.227, 'eval_steps_per_second': 0.742, 'epoch': 17.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.3636, 'grad_norm': 4.145633220672607, 'learning_rate': 6.59090909090909e-05, 'epoch': 18.77}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.686294436454773, 'eval_runtime': 4.0365, 'eval_samples_per_second': 2.23, 'eval_steps_per_second': 0.743, 'epoch': 18.77}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.2408, 'grad_norm': 4.205289363861084, 'learning_rate': 5.6818181818181825e-05, 'epoch': 20.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.8119196891784668, 'eval_runtime': 4.0355, 'eval_samples_per_second': 2.23, 'eval_steps_per_second': 0.743, 'epoch': 20.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.291, 'grad_norm': 2.63724422454834, 'learning_rate': 5e-05, 'epoch': 20.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.7666441202163696, 'eval_runtime': 4.0342, 'eval_samples_per_second': 2.231, 'eval_steps_per_second': 0.744, 'epoch': 20.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.2619, 'grad_norm': 3.698967933654785, 'learning_rate': 4.318181818181819e-05, 'epoch': 21.85}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.9069480895996094, 'eval_runtime': 4.0316, 'eval_samples_per_second': 2.232, 'eval_steps_per_second': 0.744, 'epoch': 21.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.2361, 'grad_norm': 2.5610480308532715, 'learning_rate': 3.6363636363636364e-05, 'epoch': 22.77}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.8238434791564941, 'eval_runtime': 4.0259, 'eval_samples_per_second': 2.236, 'eval_steps_per_second': 0.745, 'epoch': 22.77}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.1667, 'grad_norm': 2.373654842376709, 'learning_rate': 2.7272727272727273e-05, 'epoch': 24.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.981619119644165, 'eval_runtime': 4.0395, 'eval_samples_per_second': 2.228, 'eval_steps_per_second': 0.743, 'epoch': 24.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.2124, 'grad_norm': 1.9513698816299438, 'learning_rate': 2.0454545454545457e-05, 'epoch': 24.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.9627143144607544, 'eval_runtime': 4.0277, 'eval_samples_per_second': 2.235, 'eval_steps_per_second': 0.745, 'epoch': 24.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.2005, 'grad_norm': 2.1751515865325928, 'learning_rate': 1.3636363636363637e-05, 'epoch': 25.85}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.9839138984680176, 'eval_runtime': 4.0393, 'eval_samples_per_second': 2.228, 'eval_steps_per_second': 0.743, 'epoch': 25.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.1945, 'grad_norm': 2.1307849884033203, 'learning_rate': 6.818181818181818e-06, 'epoch': 26.77}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.0114078521728516, 'eval_runtime': 4.0376, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.743, 'epoch': 26.77}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.1221, 'grad_norm': 1.7914328575134277, 'learning_rate': 0.0, 'epoch': 27.69}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.0135269165039062, 'eval_runtime': 4.0406, 'eval_samples_per_second': 2.227, 'eval_steps_per_second': 0.742, 'epoch': 27.69}




{'train_runtime': 1415.2271, 'train_samples_per_second': 1.06, 'train_steps_per_second': 0.064, 'train_loss': 0.8864626463916566, 'epoch': 27.69}


In [14]:
model.save_pretrained(model_path)



In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

# hf_name = 'shawhin' # your hf username or org name
# model_id = hf_name + "/" + "shawgpt-ft"

# model.push_to_hub(model_id)
# trainer.push_to_hub(model_id)

# # load model from hub
# from peft import PeftModel, PeftConfig
# from transformers import AutoModelForCausalLM

# model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#                                              device_map="auto",
#                                              trust_remote_code=False,
#                                              revision="main")

# config = PeftConfig.from_pretrained("shawhin/shawgpt-ft")
# model = PeftModel.from_pretrained(model, "shawhin/shawgpt-ft")

# # load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer

def load_trained_model(model_base_name, peft_model_path):

    tokenizer = AutoTokenizer.from_pretrained(model_base_name, use_fast=True)

    config = AutoConfig.from_pretrained(peft_model_path)
    # TODO: Create exllama backend
    config.quantization_config["use_exllama"] = False
    config.quantization_config["exllama_config"] = {"version":2}

    model_loaded = AutoModelForCausalLM.from_pretrained(peft_model_path, device_map="cuda:0", config=config)
    
    return model_loaded, tokenizer

model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
peft_model_path = "D:/GitHub/101MachineLearning/014_hugging_faces/fine-tuning/shawgpt-ft/model"
model_loaded, tokenizer_loaded = load_trained_model(model_name, peft_model_path)

In [None]:
eval_comment(model, tokenizer, "Great content, thank you!")

In [15]:
eval_comment(model, tokenizer, "What is fat-tailedness?", intstructions_string="")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> [INST]  \nWhat is fat-tailedness? \n[/INST] Fat-tailedness is a property of a probability distribution where the tails of the distribution extend further than what would be expected from a normal (Gaussian) distribution with the same mean and standard deviation. In other words, it is a measure of how much the shape of a distribution differs from that of a normal distribution, with "more fat" tails indicating greater outliers or extreme values. Fat-tailedness is often associated with long-range dependence in time series, implying that the probability of large shocks or events decreases very slowly with their size. This can lead to "rare but extreme" events that are significantly larger than what would be expected'

In [19]:
eval_comment(model, tokenizer, "great as usual! fat tail analysis sounds very much like analysis of scale-free networks!", intstructions_string="")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> [INST]  \ngreat as usual! fat tail analysis sounds very much like analysis of scale-free networks! \n[/INST]\nThe similarity is not surprising, as both fat-tailed distributions and scale-free networks are common outcomes of power-law processes. However, while fat-tailed distributions describe the shape of random variables, scale-free networks are a type of complex networks, where the degree distribution follows a power-law. Although related, these two concepts are not the same, and each has its own unique properties and applications. For instance, fat-tailed distributions are often observed in financial data, where they can help explain the occurrence of extreme events, such as Black Monday in 1987 or the flash crash in 2010. On the other hand, scale'

In [None]:
eval_comment(model_loaded, tokenizer_loaded, "great as usual! fat tail analysis sounds very much like analysis of scale-free networks!", intstructions_string="")

NameError: name 'model_loaded' is not defined

In [17]:
# Función para buscar la palabra en cada entrada de un dataset
def search_in_dataset(dataset, word):
    results = {
        "train": [],
        "test": []
    }
    # Buscar en el conjunto de entrenamiento
    for i, entry in enumerate(dataset["train"]):
        if word.lower() in entry["example"].lower():
            results["train"].append((i, entry))

    # Buscar en el conjunto de prueba
    for i, entry in enumerate(dataset["test"]):
        if word.lower() in entry["example"].lower():
            results["test"].append((i, entry))
    
    return results

search_word = "fat"
# Busca la palabra en el dataset
results = search_in_dataset(dataset, search_word)

# Muestra los resultados
print(f"Resultados en 'train': {len(results['train'])} coincidencias")
for idx, entry in results["train"]:
    print(f"Índice {idx}: {entry}")

print(f"\nResultados en 'test': {len(results['test'])} coincidencias")
for idx, entry in results["test"]:
    print(f"Índice {idx}: {entry}")

Resultados en 'train': 1 coincidencias
Índice 35: {'example': "<s>[INST] ShawGPT, functioning as a virtual data science consultant on YouTube, communicates in clear, accessible language, escalating to technical depth upon request. It reacts to feedback aptly and ends responses with its signature '–ShawGPT'. ShawGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, thus keeping the interaction natural and engaging.\n\nPlease respond to the following comment.\n \n21:00 Give me that meme 😂 \n[/INST]\n😂😂😂 I shared it here: https://www.linkedin.com/posts/shawhintalebi_statistics-8020rule-fattails-activity-7132748486512447488-waTm?utm_source=share&utm_medium=member_desktop -ShawGPT</s>"}

Resultados en 'test': 1 coincidencias
Índice 4: {'example': "<s>[INST] ShawGPT, functioning as a virtual data science consultant on YouTube, communicates in clear, accessible language, escalating to technic