In [1]:
import os
from dotenv import load_dotenv

load_dotenv()  # to make OPENAI_API_KEY and LANGCHAIN_API_KEY available

True

In [9]:
# Basics of vectorisation and embedding

import numpy as np

import tiktoken
from langchain_openai import OpenAIEmbeddings

embd = OpenAIEmbeddings(model="text-embedding-3-small")

question = "What kind of pets do I like?"
document = "My favorite pet is cat"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string"""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

print(num_tokens_from_string(question, "cl100k_base"))

query_result = embd.embed_query(question)
document_result = embd.embed_query(document)

print(len(query_result))

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity:", similarity)

8

In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import Chroma

PDF_DOCS_PATH = "./pdf_docs"
PDF_DOCS_CHROMA_PATH = "./chroma_data"
EMBEDDING_MODEL = "text-embedding-3-small"

loader = PyPDFDirectoryLoader(PDF_DOCS_PATH)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=200, chunk_overlap=40
)

documents = text_splitter.split_documents(docs)

vectorestore = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(model=EMBEDDING_MODEL),
    persist_directory=PDF_DOCS_CHROMA_PATH
)

# retriever = vectorestore.as_retriever()

In [21]:
retriever = vectorestore.as_retriever(search_kwargs={"k": 2})

In [22]:
docs = retriever.get_relevant_documents("Who owns 'DenKoridZen' company?")

In [23]:
len(docs)

2

In [27]:
docs

[Document(page_content='Denis Korolev is a highly qulified software engineer with wide range of skills:\n- Web-development\n- AI-development\n- Edtech-development\n- System Design\nHis favorite Programming Language is Python. \nHe lives in Georgia since 2023. Recently he founded an Interior Design Agency «DenKoridzeN» \nwhich provides a high quality B2B service for clients from all over the world. \n«DenKoridzeN» agency got famouse for the interior-remodeling project of the yacht «Dona Xenia».\nThe project was finished in May of 2024. The cost of the project is $2450000.\nAgency has 22 employees. Denis is planning that his agency have 40 employees till the end of \n2025.', metadata={'page': 0, 'source': 'pdf_docs/dk_story.pdf'}),
 Document(page_content='Ilya Pischalnikov is mega businessman who owns quite a few companies. This is only few of the \ncompanies that he is running:\n- «Notishop»\n- «TouchIP»\n- «SuperBot72»\n- «DevTrix&Co»\n- «IPGazMyas»\nIlya owns two yachts: «Dona Xenia» 

In [30]:
# post-processing
def format_docs(documents):
    return "\n---\n".join(doc.page_content for doc in docs)

In [31]:
formatted_docs = format_docs(docs)
formatted_docs

'Denis Korolev is a highly qulified software engineer with wide range of skills:\n- Web-development\n- AI-development\n- Edtech-development\n- System Design\nHis favorite Programming Language is Python. \nHe lives in Georgia since 2023. Recently he founded an Interior Design Agency «DenKoridzeN» \nwhich provides a high quality B2B service for clients from all over the world. \n«DenKoridzeN» agency got famouse for the interior-remodeling project of the yacht «Dona Xenia».\nThe project was finished in May of 2024. The cost of the project is $2450000.\nAgency has 22 employees. Denis is planning that his agency have 40 employees till the end of \n2025.\n---\nIlya Pischalnikov is mega businessman who owns quite a few companies. This is only few of the \ncompanies that he is running:\n- «Notishop»\n- «TouchIP»\n- «SuperBot72»\n- «DevTrix&Co»\n- «IPGazMyas»\nIlya owns two yachts: «Dona Xenia» and «Nordev». All yachts are considerered to be a middle size \nyachts: «Nordev» has a length of 65.5

In [3]:
# Generation

from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'))]


In [11]:
# LangChain Expression Language (LCEL) chain = prompt | llm | ... | ...

chain = prompt | llm

In [15]:
chain.invoke({"context": docs, "question": "When remodeling of 'Dona Xenia' was finished?"})

AIMessage(content="The remodeling of 'Dona Xenia' was finished in May of 2024.", response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 224, 'total_tokens': 242}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-a9bef3cc-66d4-4cb8-a3e5-9bf8d52ddef7-0', usage_metadata={'input_tokens': 224, 'output_tokens': 18, 'total_tokens': 242})

In [17]:
# one of the options for RAG-promts from LangChain
from langchain import hub

prompt_hub_rag = hub.pull("rlm/rag-prompt")  # pulling from HUB

In [18]:
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [19]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'))])

In [5]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

retriever = vectorestore.as_retriever(search_kwargs={"k": 1})

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("How long the Dadashevich case was?")

'The Dadashevich case lasted for 3 years.'

In [7]:
########################################### 
#   Multi Query: Different Perspectives   #
###########################################

retriever = vectorestore.as_retriever()

template_p = """You are an AI assistant. Your task is to generate five different versions of the given user question to
retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal
is to help the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""

prompt_perspectives = ChatPromptTemplate.from_template(template_p)

generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

In [55]:
# My list comprehension practice 
al = [['Hey', 'take', 'your', 'mama', 'out'], ['With', 'one', 'head', 'light'], ['Bye', 'bye', 'miss', 'American', 'Pie']]
bl = [doc.upper() for song in al for doc in song]
cl = [[doc.upper() for doc in song] for song in al]
print(bl)
print(cl)

['HEY', 'TAKE', 'YOUR', 'MAMA', 'OUT', 'WITH', 'ONE', 'HEAD', 'LIGHT', 'BYE', 'BYE', 'MISS', 'AMERICAN', 'PIE']
[['HEY', 'TAKE', 'YOUR', 'MAMA', 'OUT'], ['WITH', 'ONE', 'HEAD', 'LIGHT'], ['BYE', 'BYE', 'MISS', 'AMERICAN', 'PIE']]


In [9]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Unique union of retrieved documents"""
    # Flattened list of lists, and convert each document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    unique_docs = list(set(flattened_docs))
    return [loads(doc) for doc in unique_docs]

question = "Give the year of start and year of finish of the longest of Denis Borodin's jobs/projects?"

In [10]:
retrieval_chain = generate_queries | retriever.map() | get_unique_union

In [53]:
docs = retrieval_chain.invoke({"question": question})

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
  warn_beta(


In [54]:
docs

[Document(page_content='Ilya Pischalnikov is mega businessman who owns quite a few companies. This is only few of the \ncompanies that he is running:\n- «Notishop»\n- «TouchIP»\n- «SuperBot72»\n- «DevTrix&Co»\n- «IPGazMyas»\nIlya owns two yachts: «Dona Xenia» and «Nordev». All yachts are considerered to be a middle size \nyachts: «Nordev» has a length of 65.5 meters, and «Dona Xenia» is 73 meters long. The design of \nyachts interior was made by Georgian design agency «DenKoridzeN»\nIlya takes 15 th place in Gorbes Rating List with the net worth of $2.5 billions.', metadata={'page': 0, 'source': 'pdf_docs/ip_story.pdf'}),
 Document(page_content='Denis Borodin is a great lawer  and is best known for his legal aid in the D.Dadashevich case.\nHe received his law degree from the University of Irkutsk, and after taking time off of being a \nlawyer to pursue other businesses, he reactivated his lawyer’s license following Mr. Dadashevich’s \narrest in 2020.\nDue to the complexity of D.Dadashe

In [11]:
# RAG

from operator import itemgetter

template_1 = """Answer the following question based on this context:

{context}

Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(template_1)

final_rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")}
    | rag_prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

  warn_beta(


'Denis Borodin started his career as a lawyer in 2020 and finished the D.Dadashevich case in 2023, which lasted for 3 years.'

In [12]:
final_rag_chain.invoke({"question": "Does Denis Korolev somehow cooperates with Ilya Pischalnikov?"})

"Based on the provided context, Denis Korolev, a highly qualified software engineer, founded an interior design agency called «DenKoridzeN» which provided interior remodeling services for Ilya Pischalnikov's yacht «Dona Xenia». Therefore, it can be inferred that Denis Korolev does cooperate with Ilya Pischalnikov in providing interior design services for his yacht."

In [13]:
final_rag_chain.invoke({"question": "How old is Roman Bazhenov - a young hockey player from Irkutsk Region?"})

'Roman Bazhenov is approximately 16 years old, as he was born in 2009.'

In [14]:
final_rag_chain.invoke({"question": "Does Nikolay have chance to use skills obtained during his study at university at work?"})

"Yes, Nikolay has the chance to use the skills obtained during his study at university at work. He graduated with a Bachelor's Degree in AeroSpace technology and started his IT career as a Java Developer, eventually transitioning to DevOps. His education and skills in technology and IT have allowed him to excel in his career and take on roles that utilize his knowledge and expertise."

In [21]:
###########################
#   RAG-Fusion: Related   #
###########################

template_2 = """You are a helpful assistant that generates multiple search queries based on a single input query.\n
Generate multiple search queries related to: {question}\n
Output (4 queries):"""

prompt_rag_fusion = ChatPromptTemplate.from_template(template_2)

generate_queries = (
    prompt_rag_fusion
    | llm
    | StrOutputParser()
    |(lambda x: x.split("\n"))
)

In [22]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """Reciprocal_rank_fusion that takes multiple lists of ranked 
    documents and an optional parameter k used in the RRF formula"""

    fused_scores = {}  # to hold fused scores for each unique document

    for docs in results:
        for rank, doc in enumerate(docs):  # iterate through each document in the list with its rank (position in the list)
            doc_str = dumps(doc)
            if doc_str not in fused_scores:  # if not in fused_scores then add it with initial score of 0
                fused_scores[doc_str] = 0
            
            previous_score = fused_scores[doc_str]  # retrieve the current score of the doc if any
            
            fused_scores[doc_str] += 1 / (rank + 1)  # update the score of the doc with formula: 1 / (rank + 1)

    # sort the docs based on their fused scores in DESC order to get the final reranked results
    reranked_results = [
        (loads(doc), score) for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    return reranked_results  # list of tuples like (doc, fused_score)

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": "Does Nikolay have chance to use skills obtained during his study at university at work?"})

In [23]:
print(len(docs))
print(docs)

6
[(Document(page_content='Nikolay Nikitin graduated from Moscow Institute of Physics and Technology in 2023 with \nBachelor`s Degree in AeroSpace technology.\nNikolay has started his IT-career two years before graduation as a Java Developer at Sberbank. \nAfter six month of working as backend developer he decided to switch to the position of DevOps at \nthe same company.\nThat was a good decision because new position gave Nikolay a higher salary and less stress.\nSince then the most of his daily workload consists of supervision of K8S clusters and orchestration \nof Docker containers. In may of 2024 Nikolay has completed an “AWS Technical Essentials“ \ncourse which allowes him to work as AWS certified specialist.\nNikolay spends his free time playing video games. His favourite video game is “Honkai: Star Rail“.', metadata={'page': 0, 'source': 'pdf_docs/nn_story.pdf'}), 4.0), (Document(page_content='Denis Korolev is a highly qulified software engineer with wide range of skills:\n- Web

In [24]:
question_3 = "Does Nikolay have chance to use skills obtained during his study at university at work?"

template_3 = """Answer the following question based on the context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template_3)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question_3})

'Yes, Nikolay has the chance to use the skills obtained during his study at university at work. He graduated with a Bachelor\'s Degree in AeroSpace technology, and although he started his IT career as a Java Developer and then transitioned to DevOps, his daily workload now consists of supervision of K8S clusters and orchestration of Docker containers, which are related to his technical background in AeroSpace technology. Additionally, he completed an "AWS Technical Essentials" course, which further enhances his skills and knowledge in the IT field.'

In [25]:
final_rag_chain.invoke({"question": "Which of Ilya's yachts is more awesome and why?"})

'Based on the information provided, the yacht "Dona Xenia" owned by Ilya Pischalnikov is considered more awesome because it is longer at 73 meters compared to the "Nordev" yacht which is 65.5 meters long. Additionally, the interior design of "Dona Xenia" was done by the Georgian design agency "DenKoridzeN," which adds to its appeal and luxury.'

In [26]:
final_rag_chain.invoke({"question": "Provide Firstname and Lastname of all people from Irkutsk Region based on the information provided?"})

'Based on the information provided, the only person mentioned who is associated with Irkutsk Region is Denis Borodin.'

In [49]:
###################
#  Decomposition  #
###################

template_4 = """You are a helpful assistant that generates multiple sub-questions to an input question.\n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answered in isolation.\n
Generate multiple search queries related to: {question}\n
Output (3 queries):"""

prompt_decomposition = ChatPromptTemplate.from_template(template_4)

generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

question = "How to define that person is associated with specific region/state?"
questions = generate_queries_decomposition.invoke({"question": question})

print(questions)

["1. What are the common indicators or characteristics that can help determine a person's association with a specific region or state?", "2. How can one identify a person's connection to a particular region or state based on their cultural background or language spoken?", "3. Are there any specific methods or tools available to verify a person's affiliation with a particular region or state?"]


In [50]:
template_5 = """Here is the question you need to answer:

\n---\n {question} \n---\n

Here is any available background question + answer pairs:

\n---\n {q_a_pairs} \n---\n

Here is additional context relevant to the question:

\n---\n {context} \n---\n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template_5)

In [51]:
def format_qa_pair(question, answer):
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    return formatted_string.strip()


q_a_pairs = ""
for q in questions:
    rag_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question"), "q_a_pairs": itemgetter("q_a_pairs")}
        | decomposition_prompt
        | llm
        | StrOutputParser()
    )
    answer = rag_chain.invoke({"question": q, "q_a_pairs": q_a_pairs})
    q_a_pair = format_qa_pair(q, answer)
    q_a_pairs = q_a_pairs + "\n---\n" + q_a_pair

print(answer)

Based on the provided context and background information, there are several methods and tools that can be used to verify a person's affiliation with a particular region or state:

1. **Location of Residence**: One of the most straightforward methods is to verify where the person currently lives or has lived for a significant amount of time. For example, Denis Korolev lives in Georgia since 2023, indicating his affiliation with that region.

2. **Educational Background**: Another method is to look at where the person studied or received their degree from. For instance, Nikolay Nikitin graduated from Moscow Institute of Physics and Technology, suggesting a connection to Moscow.

3. **Professional Activities**: The type of work the person is involved in can also provide clues about their affiliation with a specific region. For example, Denis Borodin started his own law office in 2024, which could be linked to the region where he practices law.

4. **Cultural References**: Any mentions of 

In [52]:
final_answer = rag_chain.invoke(
    {
        "question": "Name all people who are associated with Irkutsk Region based on the information provided. Explain reasons for each person in 2-3 sentences?",
        "q_a_pairs": q_a_pairs
    }
)
print(final_answer)

1. Denis Borodin is associated with Irkutsk Region because he received his law degree from the University of Irkutsk. Additionally, he reactivated his lawyer's license after the arrest of Mr. Dadashevich in 2020, which led to the successful outcome of the case and the establishment of his own law office in 2024.

2. Pavel Bazhenov is associated with Irkutsk Region as he is a professional coach at the "ERMAK" hockey club in Angarsk. He started his career as a professional hockey player in the "First Hockey League" as part of the "ERMAK" hockey team, which indicates his strong connection to the region's sports community.

3. There are no individuals directly associated with Irkutsk Region based on the provided information.


In [53]:
###############
#  Step Back  #
###############

from langchain_core.prompts import FewShotChatMessagePromptTemplate
examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "What can the members of The Police do?"
    },
    {
        "input": "Jan Sindel was born in what country?",
        "output": "What is Jan Sindel's personal history?"
    }
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a 
            more generic step-back question, which is easier to answer. Here is a few examples:"""
        ),
        few_shot_prompt,
        ("user", "{question}")
    ]
)

In [54]:
generate_queries_step_back = prompt | llm | StrOutputParser()
question = "How to define that person is associated with specific region/state?"
generate_queries_step_back.invoke({"question": question})

"How can someone's regional affiliation be determined?"

In [56]:
from langchain_core.runnables import RunnableLambda

response_prompt_template = """You are an expert at world knowledge. I am going to ask you a question. Your response should be
comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# {normal_context}
# {step_back_context}

# Original Question: {question}
# Answer:"""

response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

chain = (
    {
        "normal_context": RunnableLambda(lambda x: x["question"]) | retriever,
        "step_back_context": generate_queries_step_back | retriever,
        "question": lambda x: x["question"]
    }
    | response_prompt
    | llm
    | StrOutputParser()
)

chain.invoke({"question": question})

'To define that a person is associated with a specific region or state, several factors can be considered:\n\n1. **Residence**: One of the most straightforward ways to determine a person\'s association with a specific region or state is by their current residence. This can be established through official documents such as driver\'s licenses, utility bills, or property ownership records.\n\n2. **Workplace**: The location of a person\'s workplace can also indicate their association with a particular region or state. This can be determined through employment records, business registrations, or professional profiles.\n\n3. **Education**: The educational institutions a person has attended can provide clues about their association with a specific region or state. Alumni records or graduation certificates can confirm this connection.\n\n4. **Family Ties**: Family members or relatives residing in a particular region can also associate a person with that area. This can be established through fa

In [None]:
##########
#  HyDE  #
##########