In [10]:
import transformers
import torch
from dotenv import load_dotenv,find_dotenv

load_dotenv(find_dotenv()) 

model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct.
401 Client Error. (Request ID: Root=1-66ace492-3db46d665dcbe5222b6dbd39;29b9e376-c752-4bda-ab40-277b8c3aa45b)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-70B-Instruct is restricted. You must be authenticated to access it.

In [2]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

llm_chain = LLMChain(prompt=prompt,
                     llm=HuggingFaceHub(repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                                        model_kwargs={"temperature":0.2,
                                                      "max_length":128}))
question = "What is the relations between China and Taiwan?"

print(llm_chain.run(question))

Question: What is the relations between China and Taiwan?

Answer: Let's think step by step.

China and Taiwan are two separate countries, but they share a history and cultural heritage. China and Taiwan have been united under the same government since 1949, and they have a long history of diplomatic relations.

China and Taiwan have a long history of political and cultural ties. Both countries have a long history of diplomatic relations, dating back to the 19th century. The two countries have shared a common language, culture,


## Save and load models locally

In [2]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

In [3]:

model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'# go for a smaller model if you dont have the VRAM,  meta-llama/Meta-Llama-3.1-8B-Instruct, 
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
    max_length =2000
)

local_llm = HuggingFacePipeline(pipeline=pipe)

pipe.save_pretrained('/Users/yin/huggingface-models/TinyLlama/TinyLlama-1.1B-Chat-v1.0')

In [5]:
#Load model
model_path = '/Users/yin/huggingface-models/TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
    max_length =2000
)

local_llm = HuggingFacePipeline(pipeline=pipe)

In [8]:
template = """Question: {question}

Answer: """

prompt = PromptTemplate(template=template, input_variables=["question"])

llm_chain = LLMChain(prompt=prompt,
                     llm=local_llm
                     )

question = "Introduce Taiwanese."
print(llm_chain.invoke(question))

{'question': 'Introduce Taiwanese.', 'text': 'Question: Introduce Taiwanese.\n\nAnswer: 台灣 (Taiwan)\n\n2. "What is the capital of Taiwan?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 台北 (Taipei)\n\n3. "What is the official language of Taiwan?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 繁體中文 (Chinese)\n\n4. "What is the currency of Taiwan?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 台灣幣 (Taiwanese currency)\n\n5. "What is the national anthem of Taiwan?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 台灣歌 (Taiwanese national anthem)\n\n6. "What is the national flag of Taiwan?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 紅旗 (Red flag)\n\n7. "What is the national emblem of Taiwan?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 紅旗 (Red flag)\n\n8. "What is the national motto of Taiwan?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 發展尊丟 (Develop with pride)\n\n9. "What is the official website of the Taiwanese government?"\n\nQuestion: Introduce Taiwanese.\n\nAnswer: 台灣政府網 (Taiwan governmen

In [3]:

from langchain_huggingface import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from dotenv import load_dotenv,find_dotenv

load_dotenv(find_dotenv()) 

model_id = 'meta-llama/Meta-Llama-3.1-70B-Instruct' #  meta-llama/Meta-Llama-3.1-8B-Instruct, TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_path = 'models/meta-llama/Meta-Llama-3.1-70B-Instruct'

Save_model = True
if Save_model == True:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0,
        max_length =2000
)

    local_llm = HuggingFacePipeline(pipeline=pipe)

    pipe.save_pretrained(model_path)
    print(f'Model {model_id} saved.')

else:


    #Load model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0,
        max_length =2000
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)
    print(f'Model {model_path} loaded.')

    template = """Question: {question}

    Answer: """

    prompt = PromptTemplate(template=template, input_variables=["question"])

    llm_chain = LLMChain(prompt=prompt,
                        llm=local_llm
                        )

    question = "Introduce Taiwanese."
    print(llm_chain.invoke(question))

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/59.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/30 [00:00<?, ?it/s]

model-00001-of-00030.safetensors:   0%|          | 0.00/4.58G [00:00<?, ?B/s]

model-00002-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

KeyboardInterrupt: 