In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Load the book PDF; the result is kept in `documents` for the cells below.
loader = PyPDFLoader(
    "../data/The Alchemy of Finance, Reading the Mind of the Market.pdf"
)
documents = loader.load()

In [3]:
import re
def preprocess_text(text):
    """Lowercase ``text`` and drop characters outside a small allow-list.

    Characters that are kept: word characters (letters, digits,
    underscore), whitespace, and the punctuation ``$ % . , " ' ! ? ( )``.
    Every other character is deleted.

    Each run of tab characters is then collapsed into a single space.
    Deleting the tabs outright would glue tab-separated words together
    ("a<TAB>b" -> "ab").

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    lowered = text.lower()
    # Allow-list filter: anything not matched by the character class is removed.
    allowed_only = re.sub(r'[^\w\s\$\%\.\,\"\'\!\?\(\)]', '', lowered)
    # Collapse tab runs to one space so neighbouring words stay separated.
    return re.sub(r'\t+', ' ', allowed_only)

# Clean the text of every loaded document in place.
for document in documents:
    document.page_content = preprocess_text(document.page_content)

In [4]:
from langchain.text_splitter import CharacterTextSplitter
# Split on newlines with chunk_size=1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

In [7]:
# Number of chunks produced by the splitter.
len(docs)

912

In [8]:
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings


# Embedding model, run on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cpu"},
)

# Index the chunks in a local, in-memory Qdrant collection; force_recreate=True
# is passed so the collection is rebuilt on each run of this cell.
# NOTE(review): the collection is named "msft_data" although the indexed source
# is a book PDF -- consider renaming once nothing depends on the name.
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    location=":memory:",
    collection_name="msft_data",
    force_recreate=True,
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
# Sanity-check retrieval: request the three closest chunks for a sample question.
query = "what is capital?"
found_docs = qdrant.similarity_search_with_score(query, k=3)

# Show only the first hit, which is a (Document, score) pair.
print(found_docs[0])

(Document(page_content='three elements the interest rate differential, the exchange rate \ndifferential, and the capital appreciation in local currency. since \nthe third element varies from case to case we can propose the \nfollowing general rule speculative capital is attracted by rising \nexchange rates and rising interest rates.', metadata={'source': '../data/The Alchemy of Finance, Reading the Mind of the Market.pdf', 'page': 76, '_id': '0c03ab0ba5174f37842444f815675c31', '_collection_name': 'msft_data'}), 0.5814092135454125)


In [13]:
def format_docs(query, k=1):
    """Return the text of the ``k`` best-matching chunks for ``query``.

    Runs a scored similarity search against the module-level ``qdrant``
    store and joins the ``page_content`` of every hit with a blank line.
    The similarity scores are discarded.

    Args:
        query: Search text passed to the vector store.
        k: Number of hits to request. Defaults to 1, the value that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        The hits' page contents separated by blank lines, or an empty
        string when the search returns no hits.
    """
    hits = qdrant.similarity_search_with_score(query, k=k)
    # Each hit is a (document, score) pair; only the document text is used.
    return "\n\n".join(document.page_content for document, _score in hits)

In [17]:
from gpt4all import GPT4All


# Instantiate the model through the gpt4all bindings. With allow_download=True
# the recorded run below fetched the ~4 GB weights file.
model = GPT4All(
    model_name="mistral-7b-instruct-v0.1.Q4_0.gguf",
    n_threads=4,
    allow_download=True,
)

Downloading: 100%|██████████| 4.11G/4.11G [12:14<00:00, 5.60MiB/s] 
Verifying: 100%|██████████| 4.11G/4.11G [01:56<00:00, 35.4MiB/s]


In [47]:
from langchain_community.llms import GPT4All

In [72]:
# `GPT4All` here is the LangChain wrapper imported in the preceding cell; that
# import rebinds the name previously imported from the `gpt4all` package.
# Only top_k is overridden (5; 40 was the noted alternative). The other knobs
# tried during tuning -- max_tokens, n_threads, temp, top_p, n_batch, seed --
# are not passed, so the wrapper's own defaults apply.
llm = GPT4All(
    model="mistral-7b-instruct-v0.1.Q4_0.gguf",
    top_k=5,
    allow_download=True,
    verbose=True,
)

In [73]:
from langchain.prompts import PromptTemplate
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain

In [74]:
# Prompt text for the RAG chain. It has two placeholders, {context} and
# {question}, and tells the model to admit when it does not know the answer
# and to keep the answer short.
template = """Use the following pieces of information to answer the user's question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: {context}
    Question: {question}

    Only return the helpful answer below and nothing else. Try to make it short. Maximum of 500 words.
    Helpful answer:
    """

In [75]:
# Wrap the template and declare the two variables it expects.
rag_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# NOTE(review): this handler list is created but not passed to the LLM or the
# chain in this cell -- confirm whether streaming output was intended.
callbacks = [StreamingStdOutCallbackHandler()]

# verbose=True makes the chain print the formatted prompt when it runs.
llm_chain = LLMChain(llm=llm, prompt=rag_prompt, verbose=True)

In [77]:
# Ask the question end to end: retrieve context for it, then run the chain.
query = "What is capital?"
resp = llm_chain.invoke(
    input={"question": query, "context": format_docs(query)},
)
print(resp["text"])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of information to answer the user's question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: three elements the interest rate differential, the exchange rate 
differential, and the capital appreciation in local currency. since 
the third element varies from case to case we can propose the 
following general rule speculative capital is attracted by rising 
exchange rates and rising interest rates.
    Question: What is capital?

    Only return the helpful answer below and nothing else. Try to make it short. Maximum of 500 words.
    Helpful answer:
    [0m

[1m> Finished chain.[0m

    Capital refers to the financial resources that a business or individual has available for investment in order to generate income or grow their assets over time. In the context of international trade, capital can also refer to the amoun