In [None]:
from time import time
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from tqdm import tqdm
import json

# ================== Model Configuration ==================
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the tokenizer and the model weights from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# `trust_remote_code` is intentionally NOT enabled: the Llama architecture is
# implemented inside transformers itself, so the flag would add nothing except
# permission for the Hub repository to run arbitrary Python on this machine.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto",  # automatic placement across the available devices
)

# `model` is passed as an already-instantiated object that was loaded in
# float16 with device_map="auto". The pipeline's own `torch_dtype` /
# `device_map` arguments are shortcuts for loading a model from an id, so
# repeating them here would have no effect and is omitted.
llama31_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# ==========================================================

# ================== Helper Functions ==================

def query_model(system_message, user_message, temperature=0.7, max_length=1024):
    """Ask the chat model a single question and return only its reply text.

    Args:
        system_message: Content of the system turn.
        user_message: The question. It is sent as the user turn prefixed with
            "Answer this question: ".
        temperature: Sampling temperature used together with nucleus sampling
            (top_p=0.9). A numeric value <= 0 selects greedy decoding instead,
            because sampling requires a strictly positive temperature.
        max_length: Upper bound on the number of *newly generated* tokens.
            Despite its name it is forwarded as ``max_new_tokens``, so the
            prompt length does not count against it.

    Returns:
        The ``generated_text`` field of the first returned sequence. The prompt
        is not included because ``return_full_text=False`` is requested.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "Answer this question: " + user_message}
    ]
    # Render the conversation to a plain string that ends with the header of
    # the assistant turn, so the model continues with its answer.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Stop on either the tokenizer's EOS token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    generation_kwargs = {
        "num_return_sequences": 1,
        "eos_token_id": terminators,
        "max_new_tokens": max_length,
        "return_full_text": False,
        "pad_token_id": terminators[0],
        # The rendered template already carries the special tokens; letting
        # the pipeline's tokenizer add them again would duplicate the
        # beginning-of-text token.
        "add_special_tokens": False,
    }
    if temperature is not None and temperature <= 0:
        # Greedy decoding. The sampling knobs are explicitly unset so they do
        # not conflict with do_sample=False.
        generation_kwargs.update(do_sample=False, temperature=None, top_p=None)
    else:
        generation_kwargs.update(do_sample=True, top_p=0.9, temperature=temperature)

    sequences = llama31_pipeline(prompt_text, **generation_kwargs)
    return sequences[0]['generated_text']

# System prompt for the model; note that it starts and ends with a newline.
system_message = "\nYou are an AI trained to answer questions related to oral health.\n"

# ==========================================================

# ================== Data Generation ==================
# Smoke test: ask one short question, with the reply capped at 256 new tokens.
answer = query_model(
    system_message=system_message,
    user_message="What is caries in teeth?",
    temperature=0.7,
    max_length=256,
)

print(f"Answer: {answer}")

# ==========================================================

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os
import torch

# Directory that holds the on-disk copy of the model and tokenizer
local_model_dir = "./saved_model"

# A directory that merely exists may be empty or left over from an interrupted
# save, and loading from it would fail. Require the files that the two
# save_pretrained() calls below are expected to write. The tokenizer is saved
# last, so finding its config implies the model save had already finished.
saved_copy_is_complete = all(
    os.path.isfile(os.path.join(local_model_dir, marker))
    for marker in ("config.json", "tokenizer_config.json")
)

if saved_copy_is_complete:
    print("Loading model and tokenizer from local directory...")
    # Load the model and tokenizer from the local directory
    tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
    model = AutoModelForCausalLM.from_pretrained(
        local_model_dir,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map="auto"
    )
else:
    print("Downloading model and tokenizer...")
    # Download the model and tokenizer and save them locally
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # `trust_remote_code` is not enabled: Llama is implemented inside
    # transformers, so the flag would only allow Hub-side code to execute.
    # `resume_download` is not passed either: it is deprecated, and
    # interrupted downloads are resumed automatically.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # Save the model first and the tokenizer last (see the completeness check)
    model.save_pretrained(local_model_dir)
    tokenizer.save_pretrained(local_model_dir)
    print(f"Model and tokenizer have been downloaded and saved to {local_model_dir}")

# Initialize the pipeline with the loaded or downloaded model and tokenizer.
# `model` is an already-instantiated object that was loaded in float16 with
# device_map="auto". The pipeline's own `torch_dtype` / `device_map` arguments
# are shortcuts for loading a model from an id, so repeating them here would
# have no effect and is omitted.
llama31_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# System prompt for the model; note that it starts and ends with a newline.
system_message = "\nYou are an AI trained to answer questions related to oral health.\n"

def query_model(system_message, user_message, temperature=0.7, max_length=1024):
    """Ask the chat model a single question and return only its reply text.

    Args:
        system_message: Content of the system turn.
        user_message: The question. It is sent as the user turn prefixed with
            "Answer this question: ".
        temperature: Sampling temperature used together with nucleus sampling
            (top_p=0.9). A numeric value <= 0 selects greedy decoding instead,
            because sampling requires a strictly positive temperature.
        max_length: Upper bound on the number of *newly generated* tokens.
            Despite its name it is forwarded as ``max_new_tokens``, so the
            prompt length does not count against it.

    Returns:
        The ``generated_text`` field of the first returned sequence. The prompt
        is not included because ``return_full_text=False`` is requested.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "Answer this question: " + user_message}
    ]
    # Render the conversation to a plain string that ends with the header of
    # the assistant turn, so the model continues with its answer.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Stop on either the tokenizer's EOS token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    generation_kwargs = {
        "num_return_sequences": 1,
        "eos_token_id": terminators,
        "max_new_tokens": max_length,
        "return_full_text": False,
        "pad_token_id": terminators[0],
        # The rendered template already carries the special tokens; letting
        # the pipeline's tokenizer add them again would duplicate the
        # beginning-of-text token.
        "add_special_tokens": False,
    }
    if temperature is not None and temperature <= 0:
        # Greedy decoding. The sampling knobs are explicitly unset so they do
        # not conflict with do_sample=False.
        generation_kwargs.update(do_sample=False, temperature=None, top_p=None)
    else:
        generation_kwargs.update(do_sample=True, top_p=0.9, temperature=temperature)

    sequences = llama31_pipeline(prompt_text, **generation_kwargs)
    return sequences[0]['generated_text']

# Smoke test: ask one short question, with the reply capped at 256 new tokens.
answer = query_model(
    system_message=system_message,
    user_message="What is caries in teeth?",
    temperature=0.7,
    max_length=256,
)
print(f"Answer: {answer}")


Downloading model and tokenizer...


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading (…)of-00004.safetensors:  15%|#4        | 724M/4.92G [00:00<?, ?B/s]