In [1]:
%%capture
!pip install faiss-gpu langchain langchain-community langchain-openai langchain-huggingface
!curl http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json > dev_distractor.json

In [2]:
# Chat model used by every chain in this notebook (OpenAI).
# A local Ollama model can be used instead:
#   from langchain_ollama import ChatOllama
#   llm = ChatOllama(model="phi3.5")
from langchain_openai import ChatOpenAI
import os
from getpass import getpass

# Do not paste the key into the notebook: reuse OPENAI_API_KEY when it is already
# set in the environment, otherwise ask for it interactively (input is not echoed).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")

# max_tokens is an integer cap on the length of each completion.
llm = ChatOpenAI(model="gpt-4o-mini-2024-07-18", max_tokens=256)

In [3]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings

In [4]:
import pandas as pd
import json
from tqdm import tqdm
# HotpotQA preprocessing: every context paragraph becomes one retrievable text
# ("Title: ...\nSentences: 0. ... 1. ..."); identical paragraphs shared by several
# questions are stored once, together with the ids of the questions that use them.
# The path is relative so it matches the file written by the download cell
# regardless of where the notebook runs.
df = pd.read_json('dev_distractor.json')
all_paragraphs = dict()

for list_of_paragraphs, q_id in zip(df['context'], df['_id']):
    for paragraph in list_of_paragraphs:
        title, sentences = paragraph[0], paragraph[1]
        # Prefix each sentence with its index inside the paragraph.
        paragraph_main_text = ' '.join(f"{idx}. {sentence}" for idx, sentence in enumerate(sentences))
        text = f"Title: {title}\nSentences: {paragraph_main_text}"
        all_paragraphs.setdefault(text, []).append(q_id)

# Parallel lists: documents[i] is a paragraph text, metadata[i] the question ids it belongs to.
documents = list(all_paragraphs)
metadata = list(all_paragraphs.values())
print(f"Количество строк в датасете: {len(df)}")
print(f"Количество документов: {len(documents)}")

Количество строк в датасете: 7405
Количество документов: 66635


In [5]:
from langchain_core.documents import Document

# Wrap every paragraph text in a LangChain Document so it can be indexed.
docs = [Document(paragraph_text) for paragraph_text in documents]

# Embedding model settings: which sentence-transformers checkpoint to load, the
# device it is placed on (index 0 - presumably the first GPU, confirm for your
# runtime) and encoding options (vectors are not normalised).
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device=0)
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Build the FAISS vector store from the paragraph documents, then expose it as a
# retriever that returns the top 20 hits for each query.
db = FAISS.from_documents(documents=docs, embedding=embeddings)
retriever = db.as_retriever(search_kwargs=dict(k=20))

In [7]:
import logging
# logging.basicConfig(level=logging.INFO, force=True)

# Prompt 1: split a multi-hop question into numbered sub-questions.
# Later sub-questions refer to earlier answers as "#1", "#2", ...
decompose_template = """
Decompose the following question into multiple simple questions.
Decomposed questions must be concrete, self-sufficient and very short.
If the question can not be decomposed, just repeat the original question.
AVOID ANY CONVERSATIONAL ANSWERS
Example:
Original Question: What administrative territorial entity is the owner of Ciudad Deportiva located?
Decomposed Questions: 1. Who is the owner of Ciudad Deportiva?
2. Where is #1 located?

Original Question: Who is the child of the founder of the company that distributed the film UHF?
Decomposed Questions: 1. What company distributed the film UHF?
2. Who founded #1?
3. Who is #2's child?

Actual Data:
Original Question: {question}
Decomposed Questions:
"""

# Prompt 2: answer one sub-question from retrieved context and cite the source.
# Doubled braces are literal braces after formatting. The schema line must render
# as a list of objects with quoted keys, because the reply is parsed as JSON and
# its "source" entries are read as dicts.
rag_template = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the CURRENT QUESTION.
If you don't know the answer, just say "I DON'T KNOW".
Use ten words maximum and keep the answer concise and concrete
Main question: {main_question}
Previous sub-questions: {prev_questions}

CURRENT QUESTION: {subquestion}
Context: {context}
Reply with valid json with following schema:
{{"answer": str, "source": [{{"source_title": str, "source_sentence_id": int}}]}}
"""

# Prompt 3: combine the answered sub-questions into the final short answer.
final_answer_template = """
You are an assistant for question-answering tasks.
Use the following sub-questions and corresponding answers to answer the MAIN QUESTION.
If you don't know the answer, just say "I DON'T KNOW".
Use ten words maximum and keep the answer concise and concrete.
Previous sub-questions: {answers}
MAIN QUESTION: {original_question}
Avoid any conversational answers.
"""

# Compile the three raw template strings into LangChain prompt objects.
decompose_prompt, rag_prompt, final_answer_prompt = (
    PromptTemplate.from_template(template_text)
    for template_text in (decompose_template, rag_template, final_answer_template)
)

In [8]:
# One pipeline per stage, each of the form: prompt -> chat model -> plain string.
decompose_chain = (
    decompose_prompt
    | llm
    | StrOutputParser()
)

rag_chain = rag_prompt | llm | StrOutputParser()

final_answer_chain = (
    final_answer_prompt
    | llm
    | StrOutputParser()
)

import re
# Greedy single-line match from the first "{" to the last "}"; needs at least one
# character between the braces.
leave_only_json = re.compile(r"(\{.+\})")

In [9]:
# Canonical source field -> pattern used to recognise that field among the
# (lower-cased) keys of a model reply: anything containing "title", and anything
# ending in "id".
keys_mapping = dict(
    source_title=re.compile(r'title'),
    source_sentence_id=re.compile(r'id$'),
)

In [15]:
def _split_subquestions(decomposition):
    """Split the raw decomposition reply into candidate sub-question lines."""
    # If the model looped and started another "Original Question" example, drop
    # that tail. find() returns -1 when the marker is absent, and slicing with -1
    # would silently cut the last character, so only slice on a real hit.
    cut = decomposition.lower().find('original question')
    if cut != -1:
        decomposition = decomposition[:cut]
    # Very short lines are treated as noise and dropped.
    return [line for line in decomposition.split("\n") if len(line) > 10]


def _parse_rag_answer(raw_answer):
    """Extract the answer text and the cited sources from one RAG reply.

    Returns:
        (answer_text, sources): ``sources`` maps a source title to a sentence id
        and is empty when the reply cites nothing that can be parsed.
    """
    sources = dict()
    # Collapse whitespace so a JSON object spread over several lines still fits
    # the single-line pattern.
    found = leave_only_json.search(' '.join(raw_answer.split()))
    if found is None:
        # Plain-text reply: just drop surrounding quotes and spaces.
        return raw_answer.strip('" '), sources

    json_text = found.group(1)
    try:
        payload = json.loads(json_text)
        answer_text = payload['answer']
    except (json.JSONDecodeError, KeyError, TypeError):
        # Not valid JSON, or no "answer" field: fall back to the bare text.
        return json_text.strip('{}[],;:" '), sources

    cited = payload.get('source')
    if isinstance(cited, list):
        for entry in cited:
            if not isinstance(entry, dict):
                continue
            # The model does not always use the exact key names, so map whatever
            # keys it produced onto the canonical ones via keys_mapping.
            cleaned = dict()
            for canonical_key, key_pattern in keys_mapping.items():
                for key in entry:
                    if key_pattern.search(key.lower()):
                        cleaned[canonical_key] = entry[key]
            try:
                sources[cleaned['source_title']] = int(cleaned['source_sentence_id'])
            except (KeyError, ValueError, TypeError):
                # Incomplete or malformed citation: best effort, skip it.
                pass
    return answer_text, sources


def multi_hop_qa(original_question, max_iterations=6):
    """Answer a multi-hop question by decomposing it and answering step by step.

    The question is split into sub-questions; each one is answered from retrieved
    context, and the collected question/answer pairs are then used to produce the
    final answer.

    Args:
        original_question: The question to answer.
        max_iterations: Maximum number of sub-questions that are processed.

    Returns:
        (final_answer, supporting_idx): the final answer string and a dict mapping
        cited source titles to a sentence id. Being a dict, it keeps only one
        sentence id per title (the last one cited).
    """
    decomposition = decompose_chain.invoke({"question": original_question})
    # An unusable decomposition falls back to answering the question as a whole.
    decomposed_questions = _split_subquestions(decomposition) or [original_question]

    logging.info(f"Decomposed_questions: {decomposed_questions}")

    answers = []
    context_for_final_answer = []
    supporting_idx = dict()

    # Sub-questions are numbered from 1 so the labels agree with the "#1", "#2"
    # references that the decomposition prompt asks the model to use.
    for number, question in enumerate(decomposed_questions[:max(max_iterations, 0)], start=1):
        # Remove a leading "1." / "2)" style list marker only; a question that
        # itself begins with a number (e.g. a year) is left intact.
        question = re.sub(r'^\s*\d+\s*[.)]\s*', '', question).strip()

        # Seed retrieval with the latest answer actually obtained, if any.
        if answers:
            composed_question = f"Based on the answer '{answers[-1]}', {question}"
        else:
            composed_question = question
        logging.info(f"--- Question: {question}")

        context = retriever.invoke(composed_question)

        rag_answer = rag_chain.invoke(
            {
                "subquestion": question,
                "context": context,
                "main_question": original_question,
                "prev_questions": "\n".join(context_for_final_answer),
            }
        )

        # Unanswered sub-questions contribute nothing to the final context.
        if "I DON'T KNOW" in rag_answer.upper():
            continue

        logging.info(f"--- RAG answer: {rag_answer}")
        answer_text, sources = _parse_rag_answer(rag_answer)
        supporting_idx.update(sources)

        answers.append(answer_text)
        context_for_final_answer.append(f"Question {number}: {question}\nAnswer {number}: {answer_text}")

        logging.info(f"--- Answer: {answer_text}")

    final_answer = final_answer_chain.invoke({
        "original_question": original_question,
        "answers": "\n".join(context_for_final_answer)
    })
    return final_answer, supporting_idx

In [16]:
from tqdm import tqdm

In [12]:
# Show the first question of the dev set.
first_question = df['question'][0]
print(first_question)

Were Scott Derrickson and Ed Wood of the same nationality?


In [13]:
# Smoke test: run the whole pipeline on the first dev question.
sample_result = multi_hop_qa(df['question'][0])
print(sample_result)

('Yes, both are American.', {'Ed Wood': 0, 'Nationality': 0})


In [17]:
# Evaluate the pipeline on the first N_EVAL_QUESTIONS dev questions and store the
# predictions under the keys 'answer' and 'sp' (supporting facts), each keyed by question id.
N_EVAL_QUESTIONS = 1000

# Ids and questions are sliced the same way so each id stays paired with its own
# question, and the progress bar total follows the actual number of rows.
eval_ids = df['_id'][:N_EVAL_QUESTIONS]
eval_questions = df['question'][:N_EVAL_QUESTIONS]

result = {'answer': dict(), 'sp': dict()}
for q_id, question_text in tqdm(zip(eval_ids, eval_questions), total=len(eval_questions)):
    final_answer, supporting_idx = multi_hop_qa(question_text)

    result['answer'][q_id] = final_answer
    # Supporting facts are stored as [title, sentence_id] pairs.
    result['sp'][q_id] = [list(pair) for pair in supporting_idx.items()]

with open('result.json', 'w') as f:
    json.dump(result, f)

100%|██████████| 1000/1000 [1:42:21<00:00,  6.14s/it]


In [18]:
# Id of the first dev question.
df['_id'].iloc[0]

'5a8b57f25542995d1e6f1371'

In [19]:
# Predicted answer for the first dev question (its `_id` is hardcoded here).
result['answer']['5a8b57f25542995d1e6f1371']

'Yes, both were American.'

In [20]:
# Predicted supporting facts ([title, sentence_id] pairs) for the same question.
result['sp']['5a8b57f25542995d1e6f1371']

[['Scott Derrickson', 0], ['Ed Wood', 0], ['Nationality', 0]]

In [21]:
# Reference answer for the first dev question.
df['answer'].iloc[0]

'yes'

In [24]:
# Reference supporting facts for the first dev question.
df['supporting_facts'].iloc[0]

[['Scott Derrickson', 0], ['Ed Wood', 0]]

In [27]:
# Sanity check: how many answers and supporting-fact lists were stored.
# Only the value of the last expression is displayed by the notebook.
len(result['answer'])
len(result['sp'])

1000