In [1]:
import torch
from langchain_community.document_loaders import PyMuPDFLoader,DirectoryLoader
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Gather the PDFs under ./data/ that match the glob and parse each one with PyMuPDFLoader.
loader = DirectoryLoader(
    './data/',
    glob='./*.pdf',
    loader_cls=PyMuPDFLoader,
)
docs = loader.load()

In [3]:
# Split the loaded documents into 800-character chunks with a 160-character
# overlap, trying the separators in the order listed.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ' '],
    chunk_size=800,
    chunk_overlap=160,
)
chunks = text_splitter.split_documents(docs)

# Last expression of the cell: shows how many chunks were produced.
len(chunks)

558

In [4]:
# Embed every chunk with the BGE small English model, build a FAISS index
# from the result, and persist that index to the 'Vector DB Index' folder.
embeddings = HuggingFaceEmbeddings(
    model_name='BAAI/bge-small-en-v1.5',
)
vector_db = FAISS.from_documents(chunks, embeddings)
vector_db.save_local('Vector DB Index')

In [5]:
# Checkpoint used for answer generation.
MODEL = 'Qwen/Qwen2.5-1.5B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Weights are loaded in bfloat16; device placement is delegated to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map='auto',
    dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# Sampling text-generation pipeline with repetition controls and a
# 512-token cap on newly generated text.
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.4,
    max_new_tokens=512,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
)

Device set to use cuda:0


In [6]:
# Cross-encoder used to re-score (question, passage) pairs after the vector search.
# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not fail on machines without CUDA.
rerank_model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [7]:
def format_prompt(question, initial_k=25, top_n=5):
    """Answer ``question`` from the indexed documents (retrieve, rerank, generate).

    Steps:
      1. Fetch ``initial_k`` candidate chunks from ``vector_db``.
      2. Score every (question, chunk) pair with ``rerank_model`` and keep the
         ``top_n`` highest-scoring chunks.
      3. Build one context string in which every chunk is preceded by a
         ``[Doc:filename | Page:N]`` header.
      4. Send the prompt to ``pipe`` and return only the newly generated text.

    Args:
        question: The user question, as a string.
        initial_k: Number of candidates requested from the vector search.
        top_n: Number of reranked chunks placed in the prompt.

    Returns:
        The generated answer string, or the fixed message
        'No information is present in the document' when the vector search
        returns no candidates.

    Relies on the notebook-level globals ``vector_db``, ``rerank_model`` and ``pipe``.
    """
    initial_search=vector_db.similarity_search(query=question,k=initial_k)
    if not initial_search:
        # Nothing retrieved: skip the reranker and the LLM and answer with the
        # same fallback sentence the prompt asks the model to use.
        return 'No information is present in the document'

    pairs=[[question,doc.page_content] for doc in initial_search]
    scores=rerank_model.predict(pairs)
    # Highest reranker score first; sort on the score only so documents are never compared.
    scored_results=sorted(zip(scores,initial_search),key=lambda x:x[0],reverse=True)
    final_results=[doc for _,doc in scored_results[:top_n]]

    context_list=[]
    for doc in final_results:
        file_path=doc.metadata.get('source','unknown')
        filename=os.path.basename(file_path)
        # Stored page number is shifted by one for display
        # (presumably the loader's page index is zero-based - confirm).
        page_num=doc.metadata.get('page',0) + 1
        header=f'[Doc:{filename} | Page:{page_num}]'
        context_list.append(f'{header}\n{doc.page_content}')

    # Join AFTER the loop so every selected chunk reaches the model, and pass the
    # joined text (not the Python list repr) into the prompt.
    context='\n\n-\n\n'.join(context_list)

    prompt=f'''You are an Experienced Financial Analyst.Your Job is to answer to the question using the CONTEXT provided
         CRITICAL:You must Always cite the source in the format [Doc:filename | Page:X]  Example: [Doc:XYZ.pdf | Page:23] 
         
         IMPORTANT RULES:
         1.The answer must not exceed 500 Words
         2.You must always cite the sources as instructed in the format [Doc:filename | Page:X] 
         3.You must always stick to the information provied in the document 
         4.Make sure you are factually accurate
         5.If the information is not present then say 'No information is present in the document' 
         
         Question:{question}

         Context:{context}

         Answer:'''

    response=pipe(prompt,return_full_text=False)
    return response[0]['generated_text']

In [8]:
# Demo query: run the full retrieve -> rerank -> generate flow and print the answer text.
print(format_prompt('What is a VC Focused Prompt?Give an example of it'))

 A VC Focussed Prompt refers to a strategy or approach used by venture capital firms to select investments based on certain criteria or preferences. This could include factors such as the potential for growth, market size, competitive advantage, and alignment with the firm's investment goals. The term "VC Focused" suggests that this approach is specifically tailored towards venture capital activities.

An example of a VC focused prompt might be selecting investments based solely on their potential for rapid growth and strong market traction. This type of prompt would likely involve analyzing various metrics related to startup success rates, revenue projections, customer acquisition costs, and exit strategies. By focusing on these key indicators, venture capitalists can identify promising startups that align well with their investment objectives, thereby increasing the likelihood of successful exits and overall portfolio performance. In essence, a VC-focused prompt aims to streamline th

In [9]:
# Demo query: short definitional question answered from the indexed PDFs.
print(format_prompt('What are Infra Coins?'))

 InfraCoins refer to highly functional cryptocurrency assets that facilitate transactions on various blockchain platforms. They serve as essential components for utilizing these networks, enabling the execution of digital contracts, and serving as collateral. The classification of Infracoins into two main categories—Blockchain 2.x (such as Ethereum) and other Blockchain 1.0 alternatives or competitors like Cardano or Polkadot—is based on their role within expanding blockchain technology beyond Bitcoin's scope. This categorization reflects the effort to broaden the application of blockchain technology across different sectors and industries. No additional context was provided regarding specific details about Infra coins.

In summary, InfraCoins are crucial for operating and enhancing decentralized systems by providing necessary functionalities such as transaction facilitation and contract execution. Their classification into distinct subcategories helps differentiate between leading tec

In [10]:
# Demo query: open-ended explanatory question.
print(format_prompt('Explain the complexity of financial decisions?'))

 The complexity of making financial decisions can be understood by examining how much computational effort they require. This concept has gained recognition as a significant factor affecting sound financial decision-making. However, there isn't a current theoretical framework specifically designed to characterize this complexity.

In our study, we developed a framework to quantify the complexity involved in financial decisions through their computational demands. Applying this framework to binary choices like whether to use a credit card, we found that complexity plays a crucial role in these decisions. Specifically, we observed that people tend to make errors more frequently when faced with complex options compared to simpler ones. Interestingly, we discovered that the burden of complexity is particularly heavy on individuals who carry balances on their cards, which means they face higher stakes due to potential interest charges.

Our findings indicate that while attention and financi

In [11]:
# Demo query: two-part question (a definition plus a comparison) in a single call.
print(format_prompt('Explain utility tokens , also what is the difference between Blockchain 1.0,Blockchain 2.0 and Blockchain 3.0?'))

 Utility tokens are cryptographic assets designed to offer functional benefits within a particular ecosystem, platform, or project, distinct from "broken tokens." According to the context, true utility tokens align with previous SEC guidance on utility token offerings (UFOs). They lack investment potential, serve a consumptive purpose, and are promoted accordingly.

In contrast, blockchain technology has evolved through three distinct phases:

- **Blockchain 1.x**: This era began with Bitcoin and Ethereum's initial implementations. It was characterized by its focus on decentralization, security, and limited scalability issues. The primary goal was to create a decentralized digital currency system that could operate independently of traditional financial institutions.

- **Blockchains 2.x**: Emerging after the first wave, these blockchains aimed at addressing the limitations of Blockchains 1 by improving scalability, transaction speed, and interoperability. Innovations like Proof-of-Sta