In [197]:
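# Dependencies (uncomment to install into the kernel's environment):
# !pip install "pymilvus[model]"
# !pip install ollama
# !pip install langchain
# The imports below also need the LangChain community/text-splitter packages
# and PyMuPDF (used by PyMuPDFLoader):
# !pip install langchain-community langchain-text-splitters pymupdf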

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from pymilvus import MilvusClient, model

from ollama import Client

In [267]:
def text_split_documents(documents):
    """Break loaded documents into overlapping, character-counted chunks.

    Args:
        documents: The documents to split, passed straight through to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The result of ``split_documents`` using chunks of at most 1000
        characters (measured with ``len``) with a 100-character overlap.
    """
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 100,
        "length_function": len,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(documents)
    return chunks

def load_and_split_documents(file_path):
    """Load a PDF and return its chunk texts alongside a label for each chunk.

    Args:
        file_path: Path of the PDF handed to ``PyMuPDFLoader``.

    Returns:
        A ``(document_content, document_source_names)`` tuple of two parallel
        lists: the text of every chunk, and a label of the form
        ``"<file name> (pg. <page>)"`` built from the chunk's ``source`` and
        ``page`` metadata.
    """
    pages = PyMuPDFLoader(file_path).load()
    chunks = text_split_documents(pages)

    document_content = []
    document_source_names = []
    for piece in chunks:
        document_content.append(piece.page_content)
        # Keep only the last '/'-separated component of the source path.
        file_name = piece.metadata['source'].split('/')[-1]
        document_source_names.append(file_name + f" (pg. {piece.metadata['page']})")

    return document_content, document_source_names

def create_load_collection(collection_name, document_content):
    """Open the local Milvus database for ``collection_name``, populating it on first use.

    The database file is ``<collection_name>.db`` and the collection inside it
    is named ``<collection_name>_collection``. If that collection already
    exists it is loaded as-is; otherwise every entry of ``document_content``
    is embedded and inserted with its list index as the row ``id``.

    Args:
        collection_name: Base name used for both the database file and the
            collection.
        document_content: List of chunk texts to embed and store when the
            collection has to be created.

    Returns:
        The ``MilvusClient`` connected to the database.
    """
    client = MilvusClient(f"{collection_name}.db")
    full_collection_name = f"{collection_name}_collection"

    # Check for the collection derived from the argument rather than a fixed
    # name, so the existence test, load, create and insert all agree.
    if full_collection_name in client.list_collections():
        print(f'Using existing db: {collection_name}')
        client.load_collection(collection_name=full_collection_name)
        return client

    print('Creating new vector db for documents...')

    # Only build the embedding model when there is something to embed.
    embedding_fn = model.DefaultEmbeddingFunction()
    vectors = embedding_fn.encode_documents(document_content)

    client.create_collection(
        collection_name=full_collection_name,
        dimension=embedding_fn.dim,
    )

    # The row id is the chunk's position, which callers use to look up the
    # matching source label.
    data = [
        {"id": i, "vector": vector, "text": text}
        for i, (vector, text) in enumerate(zip(vectors, document_content))
    ]
    client.insert(collection_name=full_collection_name, data=data)

    print('Created db!')

    return client

def query_collection(questions, client, top_k=5, collection_name="aapl_10k_collection"):
    """Embed the questions and search a Milvus collection for the closest chunks.

    Args:
        questions: List of question strings to embed as queries.
        client: ``MilvusClient`` used to run the search.
        top_k: Maximum number of hits to return per question.
        collection_name: Full name of the collection to search. Defaults to
            ``"aapl_10k_collection"``, the name this function previously
            always used.

    Returns:
        The result of ``client.search``, requesting the ``id`` and ``text``
        fields for each hit.
    """
    embedding_fn = model.DefaultEmbeddingFunction()
    query_vectors = embedding_fn.encode_queries(questions)

    return client.search(
        collection_name=collection_name,
        data=query_vectors,
        limit=top_k,
        output_fields=["id", "text"],
    )

def generate_llm_response(query_response, document_source_names, questions):
    """Ask the local LLM each question, grounded in its retrieved chunks.

    Args:
        query_response: One iterable of hits per question, in the same order
            as ``questions``. Each hit is read as ``hit['id']`` and
            ``hit['entity']['text']``.
        document_source_names: Sequence of source labels indexed by hit id.
        questions: The question strings; paired positionally with
            ``query_response``.

    Returns:
        A list with one string per answered question, formatted as
        ``"Answer: <model reply>\\nSources: <label>, <label>, ..."``.
    """
    # One client serves every question; there is no need to rebuild it per loop.
    client = Client(host='http://localhost:11434')

    llm_responses = []
    for hits, question in zip(query_response, questions):
        source_ids = [hit['id'] for hit in hits]
        # Join the chunks as plain text so the model does not receive a Python
        # list repr (brackets, quotes and escaped newlines).
        context = "\n\n".join(hit['entity']['text'] for hit in hits)

        system_prompt = f"""
    You are a financial analyst who has extensive knowledge of financial markets and 
    specialize in understanding SEC filings. You are only given the following chunks of context to 
    answer any questions:

    {context}"""

        response = client.chat(model='llama3', messages=[
            {
                'role': 'system',
                'content': system_prompt,
            },
            {
                'role': 'user',
                'content': question,
            }
        ])

        # str.join avoids the dangling ', ' left by appending a separator
        # after every label.
        sources_str = ', '.join(document_source_names[idx] for idx in source_ids)

        llm_responses.append(
            "Answer: " + response['message']['content'] + '\nSources: ' + sources_str
        )

    return llm_responses

def generate_completion(file_path, questions):
    """Run the whole pipeline for one PDF and print an answer per question.

    Loads and chunks the PDF, opens (or builds) the ``aapl_10k`` vector
    database, retrieves chunks for every question, asks the LLM, and prints
    each answer under a numbered banner.

    Args:
        file_path: Path of the PDF to load.
        questions: List of question strings to answer.

    Returns:
        None. Results are printed.
    """
    collection_name = 'aapl_10k'

    print('Loading and splitting your doc...')
    chunk_texts, chunk_sources = load_and_split_documents(file_path)
    print('Doc loaded!\n')

    vector_db = create_load_collection(collection_name, chunk_texts)

    print('Querying db...')
    search_hits = query_collection(questions, vector_db)

    print('Generating reponse...\n')
    answers = generate_llm_response(search_hits, chunk_sources, questions)

    # Banner numbering is 1-based.
    for q_count, answer in enumerate(answers, start=1):
        print(f'------------------------------------------------Question {q_count}:------------------------------------------------\n')
        print(answer)
        print('\n')

In [271]:
# Questions to run against the filing.
questions = [
    "What was Apple's net income this year?",
    "Did Apple experience any significant losses?",
    "Did Apple pay out any dividends this year?",
]

# Replace the placeholder path with the location of the PDF before running.
generate_completion('INSERT PDF PATH HERE', questions)