In [1]:
import torch
from langchain_community.document_loaders import PyMuPDFLoader,DirectoryLoader
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Gather the PDFs matched by the glob under ./data/ and parse them with PyMuPDFLoader.
loader = DirectoryLoader(
    './data/',
    glob='./*.pdf',
    loader_cls=PyMuPDFLoader,
)
docs = loader.load()

In [3]:
# Cut the loaded documents into chunks of at most 800 characters with a
# 160-character overlap, trying the separators in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=160,
    separators=['\n\n', '\n', '.', ' '],
)
chunks = text_splitter.split_documents(docs)
len(chunks)

558

In [4]:
# Embed the chunks with the BGE-small model, index them in FAISS, and write
# the index to the 'Vector DB Index' folder.
embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')
vector_db = FAISS.from_documents(documents=chunks, embedding=embeddings)
vector_db.save_local(folder_path='Vector DB Index')

In [5]:
MODEL = 'Qwen/Qwen2.5-1.5B-Instruct'
SEED = 42

# The pipeline below samples (do_sample=True, temperature=0.4). Seeding torch
# here lets a Restart & Run All regenerate the same answers, at least on the
# same hardware/software stack.
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map='auto',
    dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# Generation settings are fixed once here; format_prompt only passes the prompt.
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.4,
    max_new_tokens=512,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
)

Device set to use cuda:0


In [6]:
# Use the GPU for reranking when one is available, otherwise fall back to the
# CPU so the notebook still runs on machines without CUDA.
rerank_device = 'cuda' if torch.cuda.is_available() else 'cpu'
rerank_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=rerank_device)

In [7]:
def format_prompt(question, fetch_k=25, top_k=5):
    """Answer ``question`` from the indexed documents (retrieve, rerank, generate).

    Pulls ``fetch_k`` candidate chunks from ``vector_db``, scores every
    (question, chunk) pair with ``rerank_model``, keeps the ``top_k`` highest
    scoring chunks, and sends ALL of them to ``pipe`` as citation-tagged
    context in a single prompt.

    Args:
        question: The question to answer.
        fetch_k: Number of chunks requested from the vector store before
            reranking (default 25).
        top_k: Number of reranked chunks placed in the prompt (default 5).

    Returns:
        The text generated by ``pipe`` (the prompt itself is not echoed,
        because ``return_full_text=False``). If the vector store returns no
        chunks, the fixed string 'No information is present in the document'
        is returned without calling the reranker or the model.
    """
    initial_search = vector_db.similarity_search(query=question, k=fetch_k)
    if not initial_search:
        # Nothing to rerank or cite: reply the way the prompt rules require.
        return 'No information is present in the document'

    pairs = [[question, doc.page_content] for doc in initial_search]
    scores = rerank_model.predict(pairs)
    # Sort on the score only, so the documents themselves are never compared.
    ranked = sorted(zip(scores, initial_search), key=lambda pair: pair[0], reverse=True)
    final_results = [doc for _, doc in ranked[:top_k]]

    # Build the context from every selected chunk BEFORE prompting; the model
    # is called exactly once, after the whole context has been assembled.
    context = _build_context(final_results)

    prompt=f'''You are an Experienced Financial Analyst.Your Job is to answer to the question using the CONTEXT provided
         CRITICAL:You must Always cite the source in the format [Doc:filename | Page:X]  Example: [Doc:XYZ.pdf | Page:23] 
         
         IMPORTANT RULES:
         1.The answer must not exceed 500 Words
         2.You must always cite the sources as instructed in the format [Doc:filename | Page:X] 
         3.You must always stick to the information provied in the document 
         4.Make sure you are factually accurate
         5.If the information is not present then say 'No information is present in the document' 
         
         Question:{question}

         Context:{context}

         Answer:'''

    response = pipe(prompt, return_full_text=False)
    return response[0]['generated_text']


def _build_context(docs):
    """Render ``docs`` as citation-tagged passages joined by a '-' separator line.

    Each passage starts with ``[Doc:<file name> | Page:<n>]``: the file name is
    the basename of the ``source`` metadata entry ('unknown' when absent) and
    ``n`` is the ``page`` metadata entry plus one (a missing entry counts as 0).
    """
    passages = []
    for doc in docs:
        filename = os.path.basename(doc.metadata.get('source', 'unknown'))
        page_num = doc.metadata.get('page', 0) + 1
        passages.append(f'[Doc:{filename} | Page:{page_num}]\n{doc.page_content}')
    return '\n\n-\n\n'.join(passages)

In [8]:
# Generate first, then display, so the answer stays available for inspection.
answer = format_prompt('What is a VC Focused Prompt?Give an example of it')
print(answer)

 A VC Focussed Prompt refers to a strategy or model designed specifically for venture capital firms. It involves generating investment decisions based on factors unique to this industry, such as the need for early-stage funding, the importance of network effects, and the potential for rapid growth. This approach aims to capture the specific dynamics and opportunities within the venture capital sector.

An example of a VC focused prompt could be one that prioritizes investments in startups with strong networks, particularly those operating in technology sectors. Such a prompt might consider metrics like the number of connections between founders and key stakeholders, the size of the team, and past success rates among similar ventures. The goal would be to identify companies that have the best chance of achieving significant market impact through their innovative products or services, even if they lack substantial financial backing at the outset. This type of prompt would likely involve 

In [9]:
# Generate first, then display, so the answer stays available for inspection.
answer = format_prompt('What are Infra Coins?')
print(answer)

 InfraCoins are highly functional cryptocurrency assets that facilitate transactions on blockchain networks. They serve as "digital gas" for these networks, enabling users to pay transaction fees and power smart contracts. The two main categories of Infracoins include Ethereum's version (3a) and other competing platforms like Cardano (3b) and Polkadot (3c). These coins play a crucial role in expanding blockchain technology beyond just Bitcoin by offering more advanced functionalities and capabilities. Their classification reflects their status as foundational elements within the broader ecosystem of decentralized finance and blockchain development. [Doc:CryptocurrencyUnifiedMarketStructurePaper.pdf |Page:29]

[Doc:CoinDesk.com | Page:N/A] CoinDesk defines InfraCoins as “cryptocurrencies used to fuel the network’s infrastructure.” This definition aligns well with the context provided, emphasizing how Infra coins act as essential resources for running and maintaining blockchain networks.

In [10]:
# Generate first, then display, so the answer stays available for inspection.
answer = format_prompt('Explain the complexity of financial decisions?')
print(answer)

 The complexity of making financial decisions has been identified as a significant barrier to effective financial management. This concept is crucial because it highlights how various factors can complicate the process of managing money, leading to potential errors and misjudgments.

The proposed framework by Lee et al. (2019) offers a method to quantify this complexity, focusing specifically on computational demands rather than other common barriers like inattention due to cognitive overload or limited financial knowledge. By applying this framework to binary choice scenarios such as whether to use a credit card, they demonstrate that complexity plays a pivotal role in these decisions.

Their research indicates that complexity influences behavior more profoundly among individuals with high stakes involved in their financial decisions. Specifically, those carrying balances face greater challenges due to higher risk exposure when making credit card usage decisions. These findings unders

In [11]:
# Generate first, then display, so the answer stays available for inspection.
answer = format_prompt('Explain utility tokens , also what is the difference between Blockchain 1.0,Blockchain 2.0 and Blockchain 3.0?')
print(answer)

Utility tokens are crypto-assets that aim to provide utility or functionality within a particular ecosystem, platform, or project, according to the context provided. They are characterized by their intent to offer practical benefits rather than speculative gains. The term "utility token" is distinct from "broken tokens," which refers to tokens that may be fraudulent or misused.

The main differences between Blockchain versions can be summarized as follows:

- **Blockchain 1.x**: This version typically represents early blockchain implementations without smart contracts. It was primarily focused on peer-to-peer networks for secure transactions but lacked advanced features like decentralized applications (dApps), consensus mechanisms, and complex functionalities.

- **Blockchains 2.x**: These blockchains incorporate more sophisticated technologies such as smart contracts, improved scalability solutions, and enhanced security protocols. They support dApps, enabling developers to build comp