In [1]:
import pandas as pd

In [38]:
df = pd.read_csv('/kaggle/input/mamabot-file/mamabot_data.csv')

df.dropna(inplace = True)

In [39]:
df.sample(10)

Unnamed: 0,Question,Answer
478,What are some signs that mothers should contac...,Mothers should contact their healthcare provid...
81,What physical changes might mothers notice in ...,Mothers might start to look pregnant as their ...
77,What physical symptoms might mothers experienc...,Mothers might feel more energetic and may noti...
208,How can mothers ensure they are getting enough...,Mothers can ensure adequate protein intake by ...
319,Why is it important for mothers to eat iron-ri...,"Iron-rich foods help generate blood supply, wh..."
326,How can mothers manage occasional headaches du...,Mothers can manage headaches by resting in a d...
248,How can mothers manage morning sickness during...,Mothers can manage morning sickness by eating ...
324,How can mothers manage increased appetite duri...,Mothers can manage increased appetite by eatin...
620,How can mothers manage skin rashes during preg...,Mothers can manage skin rashes by keeping the ...
120,How can mothers ensure they are eating well du...,Mothers can ensure they are eating well by inc...


In [40]:
df.shape

(705, 2)

In [None]:
# Install necessary libraries
!pip install -q transformers datasets huggingface_hub
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q git+https://github.com/huggingface/accelerate.git
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q wandb

In [43]:
# Import libraries
from datasets import Dataset, load_dataset
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,Trainer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,AutoConfig,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os
import time
import torch
from datasets import Dataset
from huggingface_hub import notebook_login, HfFolder
from trl import SFTTrainer,setup_chat_format, SFTConfig
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training

In [44]:
from huggingface_hub import HfApi
from kaggle_secrets import UserSecretsClient


user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_token_3")  


# Set the Hugging Face API token as an environment variable
os.environ["HF_HOME"] = "/root/.cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/root/.cache/huggingface/transformers"
os.environ["HF_DATASETS_CACHE"] = "/root/.cache/huggingface/datasets"
os.environ["HF_METRICS_CACHE"] = "/root/.cache/huggingface/metrics"
os.environ["HF_HUB_TOKEN"] = hf_token

os.environ["HF_HUB_TOKEN"] = hf_token

In [45]:
wandb_key = user_secrets.get_secret("wandbi_api_2")
import wandb
! wandb login $wandb_key

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [46]:
# Define the location and configuration for the model
model_id = "/kaggle/input/llama-3.1/transformers/8b-instruct/1"

new_model = 'mamabot-llama-1'

In [47]:
compute_dtype = torch.bfloat16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True
)

# Measure time for loading model and tokenizer
time_start = time.time()

# Load the model and tokenizer configurations
model_config = AutoConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
    max_new_tokens=1024
)

# Load the model with quantization settings
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    attn_implementation='eager'
)


# Measure and display time taken to load
time_end = time.time()
print(f"Prepare model, tokenizer: {round(time_end - time_start, 3)} sec.")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Prepare model, tokenizer: 15.77 sec.


In [48]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set up the chat format
model, tokenizer = setup_chat_format(model, tokenizer)
#model = prepare_model_for_kbit_training(model)

In [49]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [50]:
# Load the dataset
dataset = Dataset.from_pandas(df, split="all")

def format_chat_template(row):
    row_json = [{"role": "user", "content": row["Question"]},
               {"role": "assistant", "content": row["Answer"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

dataset['text'][3]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/705 [00:00<?, ? examples/s]

  self.pid = os.fork()


"<|im_start|>user\nWhat is HelpMum Africa's plan for mothers during their pregnancy journey?<|im_end|>\n<|im_start|>assistant\nHelpMum Africa plans to support mothers throughout their pregnancy journey, from the beginning until the baby is born.  <|im_end|>\n"

In [51]:
dataset['text'][5]

'<|im_start|>user\nWhen is it possible to confirm pregnancy with a test?<|im_end|>\n<|im_start|>assistant\nIt will take a few more weeks before pregnancy can be confirmed with a pregnancy test. <|im_end|>\n'

In [52]:
dataset = dataset.train_test_split(test_size=0.1)

In [53]:
os.environ["WANDB_DISABLED"] = "false"

In [55]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

# Create the Trainer without deprecated arguments
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

In [56]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
191,0.8341,0.725282
382,0.3187,0.509897
573,0.218,0.488615
764,0.3641,0.508838




TrainOutput(global_step=951, training_loss=0.4654362341144233, metrics={'train_runtime': 2635.7596, 'train_samples_per_second': 0.722, 'train_steps_per_second': 0.361, 'total_flos': 4446709845147648.0, 'train_loss': 0.4654362341144233, 'epoch': 3.0})

In [57]:
# Evaluate the model
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.5168211460113525, 'eval_runtime': 43.4787, 'eval_samples_per_second': 1.633, 'eval_steps_per_second': 1.633, 'epoch': 3.0}


In [58]:
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▂▁▂▂
eval/runtime,▆▅▅▁█
eval/samples_per_second,▁▁▁█▁
eval/steps_per_second,▁▁▁█▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▃█▃▂▂▃▃▂▃▅▃▃▃▁▁▁▁▂▁▂▁▃▁▃▁▁▃▁▂▁▂▃▃▁▂▁▂▂▁▁
train/learning_rate,▇███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▄▃▃▂▃▇▂▄▄▇▅▅▁▁▁▁▄▁▃▁▅▁▅▁▂▄▂▂▁▂▂▃▁▂▁▂▁▁▁

0,1
eval/loss,0.51682
eval/runtime,43.4787
eval/samples_per_second,1.633
eval/steps_per_second,1.633
total_flos,4446709845147648.0
train/epoch,3.0
train/global_step,951.0
train/grad_norm,4.32356
train/learning_rate,0.0
train/loss,0.5285


In [59]:
messages = [
    {
        "role": "user",
        "content": "Why might mothers not realize they are already pregnant in the first two weeks?"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, 
                   truncation=True).to("cuda")

outputs = model.generate(**inputs, max_length=100, 
                         num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


Mothers might not realize they are pregnant because the pregnancy calendar starts counting from the LMP, not from the actual conception date.  |



In [61]:
from huggingface_hub import HfApi, HfFolder

# Set the token
HfFolder.save_token(hf_token)  

api = HfApi()
whoami = api.whoami(token=hf_token)
print(f"Logged in as: {whoami['name']}")

Logged in as: HelpMum-Personal


In [62]:
trainer.model.push_to_hub(new_model, use_temp_dir=True, token=hf_token)
trainer.tokenizer.push_to_hub(new_model, use_temp_dir=True, token=hf_token)



adapter_model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HelpMum-Personal/mamabot-llama-1/commit/eb24542b5f31cb5ef92804420bb2484502e8c841', commit_message='Upload tokenizer', commit_description='', oid='eb24542b5f31cb5ef92804420bb2484502e8c841', pr_url=None, pr_revision=None, pr_num=None)

In [63]:
# Save the model and tokenizer
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('mamabot-llama-1/tokenizer_config.json',
 'mamabot-llama-1/special_tokens_map.json',
 'mamabot-llama-1/tokenizer.json')