In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import HuggingFaceHub
import pinecone
from dotenv import load_dotenv
import os

directory = 'D:/Project/llm/data/'
txt_books = os.listdir(directory)


  from tqdm.autonotebook import tqdm


In [2]:
directory = 'data/'
txt_books = os.listdir(directory)
txt_books

['Module 10_Trading Systems.pdf',
 'Module 1_Introduction to Stock Markets.pdf',
 'Module 2_Technical Analysis.pdf',
 'Module 3_Fundamental Analysis.pdf',
 'Module 4_Futures Trading.pdf',
 'Module 5_Options-Theory-for-Professional-Trading.pdf',
 'Module 6_Option Strategies.pdf',
 'Module 7_Markets & Taxation.pdf',
 'Module 8_Currency and Commodity Futures.pdf',
 'Module 9_Risk Management & Trading Psychology.pdf',
 'Module11_Personal-Finance.pdf',
 'R G Hagstrom - The Warren Buffett Way  2nd Edition.pdf',
 'Stock investing for Dummies.pdf']

In [13]:
class ChatBot:
    def __init__(self):
        load_dotenv()

        directory = 'D:/Project/llm/data/'
        txt_books = os.listdir(directory)

        documents = []
        for book in txt_books:
            loader = PyPDFLoader(os.path.join(directory, book))
            documents.extend(loader.load_and_split())

        embeddings = HuggingFaceEmbeddings()

        pinecone.init(
            api_key=os.getenv('PINECONE_API_KEY'),
            environment='gcp-starter'
        )

        index_name = "langchain-demo"

        if index_name not in pinecone.list_indexes():
            pinecone.create_index(name=index_name, metric="cosine", dimension=768)
            docsearch = Pinecone.from_documents(documents, embeddings, index_name=index_name)
        else:
            docsearch = Pinecone.from_existing_index(index_name, embeddings)

        repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
        llm = HuggingFaceHub(
            repo_id=repo_id, model_kwargs={"temperature": 0.8, "top_p": 0.8, "top_k": 50},
            huggingfacehub_api_token=os.getenv('HUGGINGFACE_API_KEY')
        )

        from langchain.schema.runnable import RunnablePassthrough
        from langchain.schema.output_parser import StrOutputParser
        from langchain.prompts import PromptTemplate

        template = """
        You are a financial advisor. These humans will ask you questions about finance. Use the following piece of context to answer the question. 
        If you don't know the answer, just say you don't know. 
        Answer the questions in such detail that even a beginner can understand it.
        Always provide the full response without cutting in-between.

        Context: {context}
        Question: {question}
        Answer: 

        """

        prompt = PromptTemplate(template=template, input_variables=["context", "question"])

        self.rag_chain = (
            {"context": docsearch.as_retriever(), "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )


In [14]:
bot = ChatBot()
input_txt = input("Ask me anything: ")
result = bot.rag_chain.invoke(input_txt)
print(result)


        You are a financial advisor. These humans will ask you questions about finance. Use the following piece of context to answer the question. 
        If you don't know the answer, just say you don't know. 
        Answer the questions in such detail that even a beginner can understand it.
        Always provide the full response without cutting in-between.

        Context: [Document(page_content='stock. Needless to say, this is highly counter intuitive and it takes years of \ninves tment practice to internalize this fact.  \nAnyway, moving ahead the best source to get information related to the business is \nthe company’s website and its annual report. We need to study at least the last 5 \nyear annual report to understand how the company i s evolving across business \ncycles.  \n \n13.3 – Understanding the Business  \nAs a first step towards understanding the business, we need to make a list of \nquestions for which we need to find answers to. Do note, the answers to all these

In [7]:
pinecone.list_indexes()

['langchain-demo']

In [8]:
directory = 'D:/Project/llm/data/'
txt_books = os.listdir(directory)

documents = []
# for book in txt_books:
loader = PyPDFLoader(os.path.join(directory, 'Module 3_Fundamental Analysis.pdf'))
    # documents.append(loader.load_and_split())

loader = loader.load_and_split()

loader

[Document(page_content='Module 3  \nFundamental Analysis', metadata={'source': 'D:/Project/llm/data/Module 3_Fundamental Analysis.pdf', 'page': 0}),
 Document(page_content='Module 3  — Fundamental Analysis  \nChapter 1  \nIntroduction to Fundamental Analysis  \n83 \n \n1.1 – Overview  \nFundamental Analysis (FA) is a holistic approach to study a business. When an \ninvestor wishes to invest in a business for the long term (say 3 – 5 years) it becomes \nextremely essential to understand the business from various perspectives. It is \ncritical for an investor to separate the daily short term noise in the stock prices and \nconcentrate on the u nderlying business performance. Over the long term, the stock \nprices of a fundamentally strong company tend to appreciate, thereby creating \nwealth for its investors.  \nWe have many such examples in the Indian market. To name a few, one can think of \ncompanies  such as Infosys Limited, TCS Limited, Page Industries, Eicher Motors, \nBosch India

In [9]:
embeddings = HuggingFaceEmbeddings()
# pinecone.create_index(name='langchain-demo', metric="cosine", dimension=768)
docsearch = Pinecone.from_documents(loader, embeddings, index_name= 'langchain-demo')
docsearch

<langchain_community.vectorstores.pinecone.Pinecone at 0x1d58ad96050>

In [12]:
docsearch.as_retriever().invoke("Stocks")

[Document(page_content='stock. Needless to say, this is highly counter intuitive and it takes years of \ninves tment practice to internalize this fact.  \nAnyway, moving ahead the best source to get information related to the business is \nthe company’s website and its annual report. We need to study at least the last 5 \nyear annual report to understand how the company i s evolving across business \ncycles.  \n \n13.3 – Understanding the Business  \nAs a first step towards understanding the business, we need to make a list of \nquestions for which we need to find answers to. Do note, the answers to all these \nquestions can be found ou t by reading through the company’s annual report and \nwebsite.  \nHere are a bunch of questions that I think helps us in our quest to understand the \nbusiness. I have discussed the rationale behind each question.  \nSl \nNo Question  Rational behind the question  \n1 What does the company do?  To get a basic understanding of the business  \n2 Who are 