In [30]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from transformers import AutoTokenizer
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from huggingface_hub import notebook_login

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
qa_datase_name = "databricks/databricks-dolly-15k"
loader = HuggingFaceDatasetLoader(qa_datase_name, "context")
data = loader.load()

In [5]:
data[0]

Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'})

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 150)

In [7]:
docs = text_splitter.split_documents(data)

In [8]:
docs[0]

Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'})

In [9]:
model_path = "sentence-transformers/all-MiniLM-l6-v2"

In [10]:
embeddings = HuggingFaceEmbeddings(
    model_name = model_path,
    model_kwargs = {'device': 'cpu'},
    encode_kwargs = {'normalize_embeddings': False}
)

In [11]:
text = "This is a test sentence"
query_result = embeddings.embed_query(text)
query_result[:3]

[0.07155241072177887, 0.0684802308678627, 0.006603313609957695]

In [17]:
db = FAISS.from_documents(docs, embeddings)

In [20]:
db.save_local("faiss_index")

In [12]:
db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [13]:
question = "Who is Thomas Jefferson?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

"Thomas Jefferson (April 13, 1743 \u2013 July 4, 1826) was an American statesman, diplomat, lawyer, architect, philosopher, and Founding Father who served as the third president of the United States from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first United States secretary of state under George Washington and then the nation's second vice president under John Adams."


In [14]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [16]:
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text-generation",
    pipeline_kwargs={"temperature": 0.7, "max_new_tokens": 512},
    device=0
)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [17]:
llm.invoke("Chinese energy")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


"Chinese energy sector.\n\n2. China's Belt and Road Initiative (BRI): China's BRI is a massive infrastructure development project that aims to connect Asia, Europe, and Africa through a network of roads, railways, ports, and other infrastructure projects. The BRI has been criticized for its lack of transparency and its potential to exacerbate debt traps for developing countries.\n\n3. China's trade policies: China's trade policies have been criticized for their protectionist nature and their potential to harm global trade and economic growth. Some argue that China's trade policies have led to a trade imbalance and have contributed to global economic imbalances.\n\n4. China's environmental policies: China's environmental policies have been criticized for their lack of transparency and their potential to harm the environment. Some argue that China's environmental policies have led to pollution and have contributed to global environmental challenges.\n\n5. China's human rights record: Chi

In [18]:
retriever = db.as_retriever()

In [67]:
prompt_template = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Keep the answer as concise as possible. 
{context}
Question: {question}
Helpful Answer:"""

prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

In [68]:
qa_llm = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    chain_type_kwargs={'prompt':prompt}
)

In [76]:
question = "What is the latest trend for solar investments in China?"
result = qa_llm.run({"query":question})
print(result)




Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Keep the answer as concise as possible. 
"The Sun Belt has seen substantial population growth since post-World War II from an influx of people seeking a warm and sunny climate, a surge in retiring baby boomers, and growing economic opportunities. The advent of air conditioning created more comfortable summer conditions and allowed more manufacturing and industry to locate in the Sun Belt. Since much of the construction in the Sun Belt is new or recent, housing styles and design are often modern and open. Recreational opportunities in the Sun Belt are often not tied strictly to one season, and many tourist and resort cities, such as Fort Lauderdale, Gulf Shores, Houston, Las Vegas, Los Angeles, Miami, Myrtle Beach, New Orleans, Orlando, Palm Springs, Phoenix, San Antonio, San Diego, Tampa, and Tucson support a tourist indust

In [75]:
# call
result['result'][3682:]

"Helpful Answer: The latest trend for solar investments in China is the government's push for renewable energy, which has led to a surge in investment in the sector. The country has set a target of generating 100 GW of solar power by 2020, and the government has provided subsidies and tax breaks to encourage investment in the sector. In 2018, China's solar capacity grew by 11.5 GW, making it the world's largest solar market. The country is also investing in wind power, with a target of generating 100 GW of wind power by 2020. The government has also introduced policies to promote the use of energy-efficient technologies and to encourage the development of smart grids."

In [78]:
# run function
result[result.index('Helpful Answer'):]

"Helpful Answer: The latest trend for solar investments in China is the government's push for renewable energy, which has led to a surge in investment in the sector. The country has set a target of generating 100 GW of solar power by 2020, and the government has provided subsidies and tax breaks to encourage investment in the sector. In 2018, China's solar capacity grew by 11.5 GW, making it the world's largest solar market. The country is also investing in wind power, with a target of generating 100 GW of wind power by 2020. The government has also introduced policies to promote the use of energy-efficient technologies and to encourage the development of smart grids."

In [87]:
system_prompt = """
You are a helpful specialist. Answer the question below.
Keep the answer as concise as possible.
"""
question = "What is the latest trend for solar investments in China?"

query = f"question: {question}"
prompt = PromptTemplate.from_template(system_prompt+"{content}")
model_chain = LLMChain(llm=llm, prompt=prompt)
result = model_chain.run({query})
result



"\nYou are a helpful specialist. Answer the question below.\nKeep the answer as concise as possible.\n{'question: What is the latest trend for solar investments in China?'}\n{'answer: The latest trend for solar investments in China is the government's push for renewable energy, which has led to a surge in investment in the sector. According to a report by BloombergNEF, China's solar capacity grew by 33% in 2020, with the government's target of 100 GW of installed capacity by 2025 set to be met. The report also noted that the country's solar investment is expected to reach $100 billion by 2025.'}"

In [88]:
print(result)


You are a helpful specialist. Answer the question below.
Keep the answer as concise as possible.
{'question: What is the latest trend for solar investments in China?'}
{'answer: The latest trend for solar investments in China is the government's push for renewable energy, which has led to a surge in investment in the sector. According to a report by BloombergNEF, China's solar capacity grew by 33% in 2020, with the government's target of 100 GW of installed capacity by 2025 set to be met. The report also noted that the country's solar investment is expected to reach $100 billion by 2025.'}


In [92]:
result[result.index('answer: ')+len('answer: '):]

"The latest trend for solar investments in China is the government's push for renewable energy, which has led to a surge in investment in the sector. According to a report by BloombergNEF, China's solar capacity grew by 33% in 2020, with the government's target of 100 GW of installed capacity by 2025 set to be met. The report also noted that the country's solar investment is expected to reach $100 billion by 2025.'}"

Ответы ллмки для сравнения:
1. RAG
2. No RAG

The latest trend for solar investments in China is the government's push for renewable energy, which has led to a surge in investment in the sector. The country has set a target of generating 100 GW of solar power by 2020, and the government has provided subsidies and tax breaks to encourage investment in the sector. In 2018, China's solar capacity grew by 11.5 GW, making it the world's largest solar market. The country is also investing in wind power, with a target of generating 100 GW of wind power by 2020. The government has also introduced policies to promote the use of energy-efficient technologies and to encourage the development of smart grids.

"The latest trend for solar investments in China is the government's push for renewable energy, which has led to a surge in investment in the sector. According to a report by BloombergNEF, China's solar capacity grew by 33% in 2020, with the government's target of 100 GW of installed capacity by 2025 set to be met. The report also noted that the country's solar investment is expected to reach $100 billion by 2025.

Можно заметить, что начало у них одинаковое, но при этом они различаются. Ответ с RAG дает больше информации на основе контекста, поданного ему в промпт. Контекст содержит лишь один документ(по-моему по дефолту retriever выбирает топ 1 документ, количество доков я не устанавливал.) Второй же ответ ссылается на источник информации: BloombergNEF. RAG штука довольно таки мощная, но с ней генерация идет дольше. Возможно из-за обработки большего количества токенов(вероятнее всего). Но ее главный плюс - fine-tuning модели, используя лишь набор документов в какой-то базе знаний значителен(я имею ввиду быстроту). Единственное есть проблема с выходом генерации, в том плане, что он выводит еще и запрос сделанный модели. Но тут уже скорее всего я накосячил, хз, т.к. если запускать модель без пайплайна(chain), то он выводит чисто ответ.

Модель: https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0

Документы для БЗ: https://huggingface.co/datasets/databricks/databricks-dolly-15k

Эмбеддинги: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2