# Running an LLM on a Local Machine (M1 Pro Mac with 16GB RAM & 1TB SSD)

## Download the model from Hugging Face
- [🤗 Hugging Face - fastchat-t5-3b-v1.0](https://huggingface.co/lmsys/fastchat-t5-3b-v1.0/tree/main)

In [1]:
!pip install 'langchain[llms]' huggingface-hub langchain transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os 
from huggingface_hub import hf_hub_download

In [None]:
# Hugging Face access token, read from the HUGGING_FACE_API_KEY environment variable
# (None when the variable is not set).
HUGGING_FACE_API_TOKEN = os.environ.get("HUGGING_FACE_API_KEY")

In [4]:
# Hub repository id of the model, and the individual files to fetch from it
model_name = "lmsys/fastchat-t5-3b-v1.0"
filenames = [
    "pytorch_model.bin",
    "added_tokens.json",
    "config.json",
    "generation_config.json",
    "special_tokens_map.json",
    "spiece.model",
    "tokenizer_config.json",
]

In [5]:
# Fetch each required file into the local Hugging Face cache and print the path
# that hf_hub_download returns for it.
for filename in filenames:
    downloaded_model_path = hf_hub_download(
        repo_id=model_name,
        filename=filename,
        # `token` is the current name of the deprecated `use_auth_token` argument.
        # The value comes from HUGGING_FACE_API_TOKEN (the name defined above) and
        # may be None when the environment variable is not set.
        token=HUGGING_FACE_API_TOKEN,
    )
    print(downloaded_model_path)

/Users/0xnrous/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024/pytorch_model.bin
/Users/0xnrous/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024/added_tokens.json
/Users/0xnrous/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024/config.json
/Users/0xnrous/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024/generation_config.json
/Users/0xnrous/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024/special_tokens_map.json
/Users/0xnrous/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024/spiece.model
/Users/0xnrous/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024/tokenizer_config.json
/U

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

# Load tokenizer and seq2seq model by repo id (resolved from the local cache when already downloaded).
tokenizer = AutoTokenizer.from_pretrained(model_name, legacy=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# An Apple-silicon Mac has no CUDA devices, so an integer index such as `device=1`
# points at an accelerator that does not exist. Use the Metal ("mps") backend when
# PyTorch reports it as available and fall back to the CPU otherwise.
# NOTE(review): if the model does not fit in GPU-visible memory, set device = "cpu".
device = "mps" if torch.backends.mps.is_available() else "cpu"

# NOTE: this rebinds the name `pipeline` from the factory function to the pipeline
# instance. The name is kept because the next cell calls `pipeline(...)`; re-running
# this cell is safe because the import above restores the factory first.
pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=1000,
)

In [None]:
# Sanity-check the pipeline built in the previous cell with a simple factual question;
# the returned value is displayed as the cell output.
pipeline("What is the capital of Egypt?")

--------

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

In [None]:
# Load the same model through LangChain so it can be combined with a reusable prompt template.
# - Generation settings go in `pipeline_kwargs`, which LangChain forwards to the
#   transformers pipeline. `model_kwargs` is forwarded to `from_pretrained`, and the
#   original string value "0" was not a valid temperature anyway.
# - `do_sample=False` requests deterministic (greedy) decoding, which is what a
#   temperature of 0 was meant to achieve.
# - `use_auth_token` is not a field of HuggingFacePipeline, so it is no longer passed.
#   NOTE(review): the repo downloaded above appears to be public; if a token is ever
#   required, supply it through `model_kwargs` instead.
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text2text-generation",
    pipeline_kwargs={"max_length": 1000, "do_sample": False},
)

In [None]:
# Prompt skeleton: fixed instructions plus a `{question}` placeholder that is filled in per call.
template = """
You are a friendly chatbot assistant that response conversationally to users' question. 
Keep the answer short, unless specifically asked by the user to elaborate on something. 

Question: {question}

Answer:"""

# The keyword is lowercase `template`; `Template=` leaves the required field unset.
Prompt = PromptTemplate(template=template, input_variables=["question"])
# Chain that renders the prompt with the question and sends it to the local LLM.
llm_chain = LLMChain(prompt=Prompt, llm=llm)

In [None]:
# Conversation helper: send one question through the chain and print the exchange
def ask(question):
    """Run `question` through `llm_chain` and print the question and the answer.

    The chain result is indexed by its 'question' and 'text' keys; the two
    values are printed with an empty line between them. Nothing is returned.
    """
    outcome = llm_chain(question)
    print(outcome['question'])
    print()
    print(outcome['text'])

In [None]:
# create timer context manager to make sure the conversation doesn't run for too long
import time 


class TimerError(Exception):
    """Raised by Timer when the timed block ran longer than its allowed budget."""


class Timer:
    """Context manager that measures the wall-clock duration of a ``with`` block.

    The limit is checked only when the block exits. The block itself is never
    interrupted, so an overrun is reported after the fact rather than prevented.

    Args:
        seconds: Maximum allowed duration in seconds. ``None`` (the default)
            disables the limit, so ``Timer()`` only reports the elapsed time.

    Attributes set by the context manager:
        start: ``time.perf_counter()`` reading taken on entry.
        end: ``time.perf_counter()`` reading taken on exit.
        interval: Elapsed seconds, ``end - start``.

    Raises:
        TimerError: on exit, when a limit is set, the block finished without
            raising, and ``interval`` is greater than ``seconds``.
    """

    def __init__(self, seconds=None):
        self.seconds = seconds

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start

        # args is (exc_type, exc_value, traceback) when invoked by `with`.
        exc_type = args[0] if args else None
        if exc_type is not None:
            # The body already failed: let its exception propagate instead of
            # replacing it with a TimerError.
            return

        if self.seconds is not None and self.interval > self.seconds:
            raise TimerError(f"Timer for {self.seconds} seconds exceeded")
        print(f"Timer: {self.interval} seconds")

In [None]:
# Give the timer an explicit budget in seconds; TimerError is raised after the
# answer is printed if the call took longer than that.
with Timer(seconds=120):
    ask("What is the capital of Egypt?")

In [None]:
# Give the timer an explicit budget in seconds; TimerError is raised after the
# answer is printed if the call took longer than that.
with Timer(seconds=120):
    ask("Describe some famous landmarks in Egypt?")

### Check that the model still runs with Wi-Fi disabled

In [None]:
from utils import check_connectivity, toggle_wifi

# Print the connectivity status, ask the helper to switch Wi-Fi "off", pause briefly,
# then print the status again.
# NOTE(review): both helpers live in the local `utils` module, which is not shown here;
# presumably 0.7 s is enough for the change to take effect. Confirm.
status_before = check_connectivity()
print(status_before)

toggle_wifi("off")
time.sleep(0.7)

status_after = check_connectivity()
print(status_after)