In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from PyPDF2 import PdfReader
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForQuestionAnswering
import faiss
import torch

In [13]:
base_path = r'C:\Users\vivek_pankaj\Desktop\DAP\dap_app_v1.0\src\files'
file_paths = [os.path.join(base_path, file) for file in os.listdir(base_path) if file.endswith('.pdf')]

In [14]:
class FAISSModel:
    def __init__(self, model_name, file_paths):
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.data_process(file_paths)
        self.build_index(self.split_texts)

    
    def splitter(self, text):
        character_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ".", " ", ""],
            chunk_size=400,
            chunk_overlap=100
        )
        sentence_splitter = SentenceTransformersTokenTextSplitter(
            tokens_per_chunk=256,
            chunk_overlap=50
        )

        character_split_texts = character_splitter.split_text(text)
        token_split_texts = []
        for character_split_text in character_split_texts:
            token_split_texts.extend(sentence_splitter.split_text(character_split_text))
        
        self.split_texts = token_split_texts
    
    def data_process(self, file_paths):
        total_text = '\n\n'.join([self.extract_pdf(file_path) for file_path in file_paths])
        self.splitter(total_text)

    def build_index(self, split_texts):
        embeddings = self.generate_embeddings(split_texts)
        self.index = faiss.IndexFlatIP(self.model.config.hidden_size)
        self.index.add(embeddings)
    
    def get_best_match(self, query):
        query_embeddings = self.generate_embeddings([query])
        D, I = self.index.search(query_embeddings, 1)
        return self.split_texts[I[0][0]]

    def get_k_matches(self, query, k):
        query_embeddings = self.generate_embeddings([query])
        D, I = self.index.search(query_embeddings, k)
        return [self.split_texts[i] for i in I[0]]
    
    def generate_embeddings(self, chunks):
        inputs = self.tokenizer(chunks, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings.numpy()
    
    def check_pdf_exists(self, file_path: str) -> bool:
        '''
        Check if the file exists and is a pdf file
        '''
        if os.path.isfile(file_path) and file_path.lower().endswith('.pdf'):
            return True
        return False


    def extract_pdf(self, file_path: str) -> str | None:
        '''
        Extract text from a pdf file
        '''
        if self.check_pdf_exists(file_path):
            with open(file_path, 'rb') as file:
                pdf = PdfReader(file)
                text = ''
                for page in pdf.pages:
                    text += page.extract_text()
                return text
        return None

In [15]:
faiss_model = FAISSModel('sentence-transformers/all-MiniLM-L6-v2', file_paths)



In [16]:
query = 'Did base salaries increase from fiscal year 2022 to fiscal year 2023?'
print(faiss_model.get_k_matches(query, 5))

['fiscal year 2023 compared with fiscal year 2022 interest and dividends income increased due to higher yields, offset in part by lower portfolio balances. interest expense decreased due to a decrease in outstanding long - term debt due to debt maturities. net recognized gains on investments', 'june 30, 2023 changes in fair value recorded in other comprehensive income commercial paper level 2 $ 16, 589 $ 0 $ 0 $ 16, 589 $ 12, 231 $ 4, 358 $ 0 certificates of deposit level 2 2, 701 0 0 2, 701 2, 657 44 0', 'in order to manage our costs in a dynamic, competitive environment, in fiscal year 2023 we announced that base salaries of salaried employees would remain at fiscal year 2022 levels. pay increases continue to be available for rewards - eligible hourly and eq uivalent employees. we will continue our practice of investing in stock for all rewards - eligible employees,', 'segment revenue and operating income were as follows during the periods presented : ( in millions ) year ended june 

In [21]:
import json
import requests

In [23]:
os.environ['ARLI_API_KEY'] = 'e8ad995f-a8c6-4814-b068-5e65be41c605'

In [24]:
def process_arli_response(response: str) -> list:
    k = list(response.split('data: '))
    k = filter(lambda x: x != '', k)
    arr = []
    for obj in k:
        try:
            arr.append(json.loads(obj))
        except json.JSONDecodeError:
            continue
    return ''.join([obj['choices'][0]['text'] for obj in arr])

def generate_prompt(user_input: str, system_msg: str) -> str:
    return f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

def arli_pipeline(user_input: str, system_msg: str = "You are a useful AI assistant") -> str:
    url = "https://api.arliai.com/v1/completions"

    payload = json.dumps({
    "model": "Meta-Llama-3.1-8B-Instruct",
    "prompt": generate_prompt(user_input=user_input, system_msg=system_msg),
    "repetition_penalty": 1.1,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 40,
    "max_tokens": 1024,
    "stream": True
    })
    headers = {
    'Content-Type': 'application/json',
    'Authorization': f"Bearer {os.environ['ARLI_API_KEY']}"
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    return process_arli_response(response.text)

In [76]:
os.environ['TOGETHER_API_KEY'] = '5ddfdebe1a39c353e040a6e8de725209be9022fea728ff075707488ad8423fef'

In [79]:
from together import Together
def together_rag_pipeline(query: str, context: str) -> str:
    client = Together()

    prompt = f"""
    Answer the following question based on the given context:
    {context}
    Question: {query}
    Answer:
    """
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "system", "content": "You are a useful AI assistant"},
                  {"role": "user", "content": prompt}]
    )

    return completion.choices[0].message.content


In [77]:
from together import Together

client = Together()

completion = client.chat.completions.create(
  model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
  messages=[{"role": "user", "content": "What are the top 3 things to do in New York?"}],
)


In [53]:
completion.choices[0].message.content

"New York City is a vibrant and diverse metropolis with countless attractions and activities to suit all interests. Here are three top things to do in New York:\n\n1. **Visit the Statue of Liberty and Ellis Island**: Take a ferry to Liberty Island to see the iconic Statue of Liberty up close and learn about its history and significance. You can also visit the Ellis Island Immigration Museum to explore the history of immigration in the United States.\n\n2. **Explore the Metropolitan Museum of Art**: The Met, as it's commonly known, is one of the world's largest and most famous museums, with a collection that spans over 5,000 years of human history. From ancient Egyptian artifacts to modern and contemporary art, the Met has something for everyone.\n\n3. **Walk across the Brooklyn Bridge**: Take in the stunning views of the Manhattan skyline and the East River by walking across the iconic Brooklyn Bridge. You can also stop at the Brooklyn Bridge Park for a picnic or a stroll along the wat

In [67]:
class LLMV2:
    def _init__(self):
        pass
    
    def response(self, query: str, context: str) -> str:
        prompt = f"""Context: {context} \n\nQuery: {query} Answer:"""
        return arli_pipeline(user_input=prompt, system_msg="You are a useful AI assistant which reads the context and answers the query")
    

In [68]:
context_lst = ["There are 1000 people working in Merilytics.", "Merilytics is a data analytics company.", "Merilytics is based in Hyderabad."]
llm = LLMV2()
context = "\n\n".join(context_lst)
print(llm.response("How many people work in Merilytics?", context))

According to the given context, there are 1000 people working in Merilytics.


In [70]:
print(llm.response("What type of company is Merilytics?", context))

Merilytics is a data analytics company.


In [51]:
def query_transformation(query: str) -> str:
    prompt = f"""
    Rephrase the question to better structure so that it returns better answers. Return two queries in json format. Do not use abbreviations or acronyms. Do not change the meaning of the question.
    At the start of the json, provide <JSON> and at the end provide </JSON>
    \n\n\n
    Question: {query}
    \n\n\n
    Answer:
    """
    print(prompt)
    system_msg = "You are a useful AI assistant"
    return arli_pipeline(user_input=prompt, system_msg=system_msg)

In [44]:
def statement_assumption(query: str) -> str:
    prompt = f"""
    Generate a hypothetical answer to the following query in one or two sentences.
    \n\n
    Query: {query}
    \n\n
    Answer:
    """
    system_msg = "You are a useful AI assistant"
    return arli_pipeline(user_input=prompt, system_msg=system_msg)

In [53]:
query = "Did base salaries increase from fiscal year 2022 to fiscal year 2023?"
k = query_transformation(query)
# print(statement_assumption(query))


    Rephrase the question to better structure so that it returns better answers. Return two queries in json format. Do not use abbreviations or acronyms. Do not change the meaning of the question.
    At the start of the json, provide <JSON> and at the end provide </JSON>
    



    Question: Did base salaries increase from fiscal year 2022 to fiscal year 2023?
    



    Answer:
    


In [39]:
query = "According to our internal data analysis, base salaries increased by an average of 4.5% across various industries and sectors from fiscal year 2022 to fiscal year 2023, driven primarily by inflation adjustments and market rate changes. However, certain high-growth fields such as technology and healthcare saw even more substantial increases, averaging around 7-8% during this period."
print(faiss_model.get_k_matches(query, 5))

['in order to manage our costs in a dynamic, competitive environment, in fiscal year 2023 we announced that base salaries of salaried employees would remain at fiscal year 2022 levels. pay increases continue to be available for rewards - eligible hourly and eq uivalent employees. we will continue our practice of investing in stock for all rewards - eligible employees,', 'fiscal year 2023 compared with fiscal year 2022 interest and dividends income increased due to higher yields, offset in part by lower portfolio balances. interest expense decreased due to a decrease in outstanding long - term debt due to debt maturities. net recognized gains on investments', 'increased to 67. 0 million. • linkedin revenue increased 10 %. • dynamics products and cloud services revenue increased 16 % driven by dynamics 365 growth of 24 %. • server products and cloud services revenue increased 19 % driven by azure and other cloud services growth of 29 %. • windows original equipment manufacturer licensing

In [54]:
print(k)

<JSON>

{
  "query1": "What is the average annual salary for employees as of the end of fiscal year 2022 compared to the same metric for fiscal year 2023?",
  "query2": "Were there any noticeable changes in the frequency or amount of base salary increases given to employees during the transition from fiscal year 2022 to fiscal year 2023?"
}

</JSON>


In [57]:
def parse_query_transformation(response: str) -> dict:
    start_tag = "<JSON>"
    end_tag = "</JSON>"
    response = response.strip()
    start = response.find(start_tag)
    end = response.find(end_tag)
    if start == -1 or end == -1:
        return {}
    substring = response[start + len(start_tag):end].strip()
    return json.loads(substring)

In [59]:
c = statement_assumption(query)

In [63]:
queries = parse_query_transformation(k)
queries['query3'] = c

In [64]:
print(queries)

{'query1': 'What is the average annual salary for employees as of the end of fiscal year 2022 compared to the same metric for fiscal year 2023?', 'query2': 'Were there any noticeable changes in the frequency or amount of base salary increases given to employees during the transition from fiscal year 2022 to fiscal year 2023?', 'query3': 'According to our hypothetical data, base salaries for most industries increased by an average of 4.5% from fiscal year 2022 to fiscal year 2023, with some sectors experiencing even higher growth rates due to market demand and inflationary pressures. This upward trend was driven primarily by efforts to retain top talent and keep pace with rising living costs.'}


In [71]:
from collections import Counter
def get_big_context(queries: list[str]) -> str:
    context_lst = []
    for query in queries:
        context_lst.extend(faiss_model.get_k_matches(query, 5))
    
    counter = Counter(context_lst)
    ret_context = ''
    best_n_context = counter.most_common(5)
    for context, _ in best_n_context:
        ret_context += context + '\n\n'
    return ret_context
        

In [82]:
big_context = get_big_context(list(queries.values()))

In [84]:
k = together_rag_pipeline(query=query, context=big_context)

In [85]:
print(k)

No, base salaries of salaried employees remained at fiscal year 2022 levels in fiscal year 2023.


In [86]:
def advanced_rag_pipeline(query: str) -> str:
    transformed_queries = query_transformation(query)
    queries = parse_query_transformation(transformed_queries)
    statement_assumption_query = statement_assumption(query)
    queries['query3'] = statement_assumption_query
    big_context = get_big_context(list(queries.values()))
    return together_rag_pipeline(query=query, context=big_context)

In [87]:
class RAG2:
    def __init__(self):
        pass

    def query_transformation(self, query):
        prompt = f"""
        Rephrase the question to better structure so that it returns better answers. Return two queries in json format. Do not use abbreviations or acronyms. Do not change the meaning of the question.
        At the start of the json, provide <JSON> and at the end provide </JSON>
        \n\n\n
        Question: {query}
        \n\n\n
        Answer:
        """
        system_msg = "You are a useful AI assistant"
        return self.arli_pipeline(user_input=prompt, system_msg=system_msg)
    
    def arli_pipeline(self, user_input: str, system_msg: str = "You are a useful AI assistant") -> str:
        url = "https://api.arliai.com/v1/completions"

        payload = json.dumps({
        "model": "Meta-Llama-3.1-8B-Instruct",
        "prompt": self.generate_prompt(user_input=user_input, system_msg=system_msg),
        "repetition_penalty": 1.1,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "max_tokens": 1024,
        "stream": True
        })
        headers = {
        'Content-Type': 'application/json',
        'Authorization': f"Bearer {os.environ['ARLI_API_KEY']}"
        }

        response = requests.request("POST", url, headers=headers, data=payload)
        return self.process_arli_response(response.text)
    
    def process_arli_response(self, response: str) -> list:
        k = list(response.split('data: '))
        k = filter(lambda x: x != '', k)
        arr = []
        for obj in k:
            try:
                arr.append(json.loads(obj))
            except json.JSONDecodeError:
                continue
        return ''.join([obj['choices'][0]['text'] for obj in arr])
    
    def parse_query_transformation(self, response: str) -> dict:
        start_tag = "<JSON>"
        end_tag = "</JSON>"
        response = response.strip()
        start = response.find(start_tag)
        end = response.find(end_tag)
        if start == -1 or end == -1:
            return {}
        substring = response[start + len(start_tag):end].strip()
        return json.loads(substring)
    
    def statement_assumption(self, query: str) -> str:
        prompt = f"""
        Generate a hypothetical answer to the following query in one or two sentences.
        \n\n
        Query: {query}
        \n\n
        Answer:
        """
        system_msg = "You are a useful AI assistant"
        return self.arli_pipeline(user_input=prompt, system_msg=system_msg)
    
    def get_big_context(self, queries: list[str]) -> str:
        context_lst = []
        for query in queries:
            context_lst.extend(faiss_model.get_k_matches(query, 5))

        counter = Counter(context_lst)
        ret_context = ''
        best_n_context = counter.most_common(5)
        for context, _ in best_n_context:
            ret_context += context + '\n\n'
        return ret_context
    
    def advanced_rag_pipeline(self, query: str) -> str:
        transformed_queries = self.query_transformation(query)
        queries = self.parse_query_transformation(transformed_queries)
        statement_assumption_query = self.statement_assumption(query)
        queries['query3'] = statement_assumption_query
        big_context = self.get_big_context(list(queries.values()))
        return self.together_rag_pipeline(query=query, context=big_context)
    
    def generate_prompt(self, user_input: str, system_msg: str) -> str:
        return f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    
    def together_rag_pipeline(self, query: str, context: str) -> str:
        client = Together()

        prompt = f"""
        Answer the following question based on the given context:
        {context}
        Question: {query}
        Answer:
        """
        completion = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
            messages=[{"role": "system", "content": "You are a useful AI assistant"},
                    {"role": "user", "content": prompt}]
        )

        return completion.choices[0].message.content
    
    def response(self, query: str) -> str:
        return advanced_rag_pipeline(query)

In [88]:
rag = RAG2()
print(rag.response("Who are the competitors to Microsoft in middleware services?"))


    Rephrase the question to better structure so that it returns better answers. Return two queries in json format. Do not use abbreviations or acronyms. Do not change the meaning of the question.
    At the start of the json, provide <JSON> and at the end provide </JSON>
    



    Question: Who are the competitors to Microsoft in middleware services?
    



    Answer:
    
According to the given context, the competitors to Microsoft in middleware services are Java vendors.
