# Loading local Huggingface models

From https://python.langchain.com/docs/integrations/llms/huggingface_pipelines


In [None]:
%pip install --upgrade --quiet  transformers --quiet

In [None]:
!pip install langchain langchain_community

In [None]:
import langchain_community

In [None]:
# this version works on the public servers without an API token, but is slower
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Build a LangChain LLM from the "gpt2" model for the text-generation task,
# limiting each completion to 100 newly generated tokens.
hf = HuggingFacePipeline.from_model_id(
    task="text-generation",
    model_id="gpt2",
    pipeline_kwargs=dict(max_new_tokens=100),
)

In [None]:
from langchain.prompts import PromptTemplate

# The question that gets substituted into the prompt below.
question = "How much is that doggy in the window?"

# Question/answer prompt with a single {question} placeholder.
template = "Question: {question}\n\nAnswer: "
prompt = PromptTemplate.from_template(template)

# Pipe the formatted prompt into the model wrapper `hf`.
chain = prompt | hf

print(chain.invoke(dict(question=question)))

# Using Huggingface endpoints

From https://python.langchain.com/docs/integrations/llms/huggingface_endpoint

Huggingface serverless inference is described here:

https://huggingface.co/docs/api-inference/index

This explains how to get an API token from your account, and how to construct the model URL. It also gives code for using a model, but we will instead use LangChain to interface to the Hugging Face API.

You can set up your own dedicated endpoint to which you deploy a model, giving better availability than the public endpoints. There is a cost:

https://huggingface.co/inference-endpoints/dedicated

UI for starting and stopping and configuring endpoints is here:

https://ui.endpoints.huggingface.co/angusroberts/endpoints


Notes on tokens:

- Protected endpoint: seems to need your own token.
- Public endpoint: still needs a token, but any token will do.




In [None]:
# get a token: https://huggingface.co/docs/api-inference/quicktour#get-your-api-token

from getpass import getpass
import os

# Prompt for the token with getpass instead of hard-coding it in the notebook.
HUGGINGFACEHUB_API_TOKEN = getpass()

# Export the token as an environment variable; Langchain reads it from there when needed.
os.environ.update(HUGGINGFACEHUB_API_TOKEN=HUGGINGFACEHUB_API_TOKEN)

In [None]:
# This example uses the Hugging Face Inference API directly (no LangChain)
import requests

API_URL = "https://api-inference.huggingface.co/models/gpt2"
headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}


def query(payload):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    The response body is returned as-is, so an error reported by the API
    comes back as the decoded JSON rather than raising here.
    """
    # A timeout keeps the cell from hanging forever if the server never answers.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    return response.json()


# The Inference API expects the prompt text under the "inputs" key.
data = query({"inputs": "When it rains it "})
print(data)

In [None]:
!pip install langchain langchain_community
import langchain_community
from langchain_community.llms import HuggingFaceEndpoint

In [None]:
%pip install --upgrade --quiet huggingface_hub

In [None]:
# URL of the inference endpoint passed to HuggingFaceEndpoint further down.
# NOTE(review): this looks specific to one account's deployment — replace it
# with the URL of your own endpoint; confirm before running.
endpoint_url = "https://j278zkynwm0b3dky.eu-west-1.aws.endpoints.huggingface.cloud"

In [None]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

In [None]:
# Question/answer prompt with a single {question} placeholder.
template = "Question: {question}\n\nAnswer: "
prompt = PromptTemplate.from_template(template)

# Example question to substitute into the prompt.
question = "How much is that doggy in the window? "

In [None]:


# Using the free model endpoint, which has limited models
#
# repo_id = "openai-community/gpt2"
# llm = HuggingFaceEndpoint(
#    endpoint_url="https://api-inference.huggingface.co/models/" + repo_id,
#    task="text-generation",
#    model_kwargs={"max_length": 128, "temperature": 0.1}
#)

# Using the paid for model endpoint, which can host a wider range of models
#llm = HuggingFaceEndpoint(
#    endpoint_url=endpoint_url,
#    task="text-generation",
#    temperature=0.1,
#    model_kwargs={"max_length": 128}
#)


# TRY THIS NEXT:
# Point the LLM wrapper at `endpoint_url` with explicit generation parameters.
llm = HuggingFaceEndpoint(
    endpoint_url=endpoint_url,
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
)
# Use .invoke() rather than calling the LLM object directly: the direct-call
# form is deprecated in LangChain, and .invoke() matches how the chain is run
# in the local-model section of this notebook.
llm.invoke("What did foo say about bar?")


#llm_chain = LLMChain(prompt=prompt, llm=llm)
#print(llm_chain.run(question))