In [None]:
#Mount Google Drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json

# Load the JSON data from the file
with open('/content/drive/My Drive/finqa/train.json') as f:
    data = json.load(f)

# Initialize a new list to store cleaned instances
cleaned_data = []

# Iterate through each instance and concatenate 'pre_text' and 'post_text' into 'text'
for instance in data:
    concatenated_text = instance.get("pre_text", []) + instance.get("post_text", [])

    cleaned_instance = {
        "text": concatenated_text,  # Concatenate pre_text and post_text
        "table": instance.get("table"),
        "filename": instance.get("filename")
    }
    cleaned_data.append(cleaned_instance)

# Save the cleaned data to a new JSON file
with open('/content/drive/My Drive/finqa/clean/train_cleaned.json', 'w') as f:
    json.dump(cleaned_data, f, indent=4)

print("Cleaned data with concatenated text saved to 'cleaned_train.json'")


Cleaned data with concatenated text saved to 'cleaned_train.json'


In [None]:
with open('/content/drive/My Drive/finqa/test.json') as f:
    data = json.load(f)

# Initialize a new list to store cleaned instances
cleaned_data = []

# Iterate through each instance and concatenate 'pre_text' and 'post_text' into 'text'
for instance in data:
    concatenated_text = instance.get("pre_text", []) + instance.get("post_text", [])

    cleaned_instance = {
        "text": concatenated_text,  # Concatenate pre_text and post_text
        "table": instance.get("table"),
        "filename": instance.get("filename")
    }
    cleaned_data.append(cleaned_instance)

# Save the cleaned data to a new JSON file
with open('/content/drive/My Drive/finqa/clean/test_cleaned.json', 'w') as f:
    json.dump(cleaned_data, f, indent=4)

print("Cleaned data with concatenated text saved to 'cleaned_test.json'")

Cleaned data with concatenated text saved to 'cleaned_test.json'


In [None]:
#Installing Llama-index
!pip install llama-index



In [None]:
#turning the data in the files into Document objects

from llama_index.core import Document

with open('/content/drive/My Drive/finqa/clean/train_cleaned.json') as f:
    data = json.load(f)

documents = []
for ele in data:
    metadata = {"table":str(ele.get("table", []))[:950], "filename":ele.get("filename", "unknown")} #using table and filename as metadata
    for text_chunk in ele.get("text", []):
        doc = Document(text=text_chunk, metadata=metadata)
        documents.append(doc)

In [None]:
len(documents)

151725

In [None]:
#installing libraries required for Qdrant vector storage and embedding libraries for embedding text

%pip install llama-index-vector-stores-qdrant llama-index-readers-file llama-index-embeddings-fastembed llama-index-llms-openai
%pip install llama-index-embeddings-huggingface



In [None]:
#importing relevant libraries for embedding and Qdrant Vector store

import logging
import sys
import os

import qdrant_client
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings


In [None]:
len(documents)

151725

In [None]:
#initialising Qdrant client

client = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    location=":memory:"
    # otherwise set Qdrant instance address with:
    # url="http://:"
    # otherwise set Qdrant instance with host and port:
    # host="localhost",
    # port=6333
    # set API KEY for Qdrant Cloud
    # api_key=""
)

In [None]:
#Initialising the embedding model.

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
#indexing. adding all of the document objects that have been embedded into the vector database
vector_store = QdrantVectorStore(client=client, collection_name="collection")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    show_progress=True
)

Parsing nodes:   0%|          | 0/151725 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]



Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/173 [00:00<?, ?it/s]

In [None]:
#importing library for retriver
from llama_index.core.retrievers import VectorIndexRetriever

In [None]:
#installing libraries to use groq platform.
! pip install llama_index_llms_groq



In [None]:
from llama_index.llms.groq import Groq
from pprint import pprint
llm = Groq(model="llama3-8b-8192", api_key="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") #initialising llm
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2") #using sentence transformers embedding model
retriever = VectorIndexRetriever(index=index, embed_model=embed_model, similarity_top_k=5) #initialising retirever that will be used to fetch relevant documents

def run(query):
    nodes = retriever.retrieve(query)
    text_data = "" #data will be stored in a string that the llm will read
    meta = str(nodes[0].metadata)
    for node in nodes: #concatenating text and metadata into a single string that the llm can read and use as context
        text_data += node.text
        if str(node.metadata) not in meta:
            meta+=str(node.metadata)

    context = text_data + meta
    pprint(context) # printing the context to see where the data is coming from

    return llm.complete(prompt.format(question=query, context=context))  # The llm uses the prompt template to understand the question and the data the retirever is giving it to answer from




In [None]:
#prompt is used as a template for the llm to give answers with.
prompt = """
You are an expert financial assistant who answers questions about financial documents.
Given the question:
{question}

and the context:
{context}


answer the question comprehensively. If the context is not helpful in answering the question, say "I don't know", do not make up an answer.
It is very important to my career that you be truthful or I will lose my job.

Do not answer with "I am happy to answer..." or "Sure, I will...", be to the point and only return the answer.
"""

In [None]:
ans = run("What is the total fair value of the vested stocks of abiomed inc in 2010, 2011 and 2012")
ans.text

('1913527 54 the total fair value of restricted stock , restricted stock units '
 ', and performance shares that vested during the years ended december 31 , '
 '2012 , 2011 and 2010 , was $ 20.9 million , $ 11.6 million and $ 10.3 '
 'million , respectively .the total fair value of time-vested restricted stock '
 'units vested during 2009 , 2008 and 2007 was $ 29535 , $ 26674 and $ 3392 , '
 'respectively .{\'table\': "[[\'\', \'number of shares\', '
 "'weightedaveragegrant datefair value'], ['outstanding at december 31 2011', "
 "'1432610', '$ 57'], ['granted', '1073798', '54'], ['vested', '-366388 ( "
 "366388 )', '55'], ['cancelled', '-226493 ( 226493 )', '63'], ['outstanding "
 'at december 31 2012\', \'1913527\', \'54\']]", \'filename\': '
 '\'CME/2012/page_107.pdf\'}{\'table\': "[[\'\', \'stock units\', \'weighted '
 "average grant date fair value'], ['balance at october 1', '1570329', '$ "
 "69.35'], ['granted', '618679', '62.96'], ['distributed', '-316839 ( 316839 "
 ")', '60.3

'The total fair value of the vested stocks of Abiomed Inc in 2010, 2011, and 2012 is $10.3 million, $11.6 million, and $20.9 million, respectively.'

In [None]:
#Mount Google Drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# reading pdf files and converting them into Document objects
import pdfplumber
import re
from llama_index.core import Document

def pdf_to_sentence_documents(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        documents = [] # storing all documents in this list
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                # Split text into sentences using regex
                sentences = re.split(r'(?<=[.!?]) +', text) # each sentence is a single document
                for j, sentence in enumerate(sentences):
                    if sentence.strip():  # Ignore empty sentences
                        metadata = {   # metadata is the page number and the sentence number
                            "page_number": i + 1,
                            "sentence_number": j + 1,
                        }
                        documents.append(Document(text=sentence.strip(), metadata=metadata))
    return documents

# Example usage
pdf_path = '/content/drive/My Drive/finqa/AESL.pdf'  # Replace with your PDF path
sentence_documents = pdf_to_sentence_documents(pdf_path)

# Inspect the first few sentence documents
for doc in sentence_documents[:5]:
    print(doc.text, doc.metadata)


Energy Solutions
Cooling Solutions {'page_number': 1, 'sentence_number': 1}
“I have always been guided
by striving to show
the best that I could” {'page_number': 2, 'sentence_number': 1}
“I have always been guided
by striving to show
the best that I could” {'page_number': 3, 'sentence_number': 1}
Mountains symbolise trust
and truthfulness, strength
and stillness, constancy and
courage. {'page_number': 4, 'sentence_number': 1}
In this publication, we
have drawn our inspiration from
the mountains, and paid our
humble homage to the world's
highest mountains.
Mount Everest, the world's highest mountain {'page_number': 4, 'sentence_number': 2}


In [None]:
len(documents)

593

In [None]:
#initialising the client for Qdrant
clientpdf = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    location=":memory:"

)

In [None]:
#indexing. adding all of the document objects that have been embedded into the vector database
vector_store_pdf = QdrantVectorStore(client=client, collection_name="collection")
storage_context_pdf = StorageContext.from_defaults(vector_store=vector_store_pdf)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context_pdf,
    show_progress=True
)

Parsing nodes:   0%|          | 0/593 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/593 [00:00<?, ?it/s]

In [None]:
#using Groq platform to host the LLM
from llama_index.llms.groq import Groq
from pprint import pprint
llm = Groq(model="llama3-8b-8192", api_key="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") #initialising the LLM
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2") #using the sentence-transformers embedding model
retriever = VectorIndexRetriever(index=index, embed_model=embed_model, similarity_top_k=5) #initialising the retriever

def run(query):
    nodes = retriever.retrieve(query)
    text_data = ""
    meta = str(nodes[0].metadata)
    for node in nodes:  #concatenating data for the llm to use
        text_data += node.text
        if str(node.metadata) not in meta:
            meta+=str(node.metadata)

    context = text_data + meta
    pprint(context) #printing context to see where the data is coming from

    return llm.complete(prompt.format(question=query, context=context)) #LLM will use the  prompt to answer the query using the context that has been concatenated with documents the retriever has fetched from the vector store




In [None]:
prompt = """
You are an expert financial assistant who answers questions about financial documents.
Given the question:
{question}

and the context:
{context}


answer the question comprehensively. If the context is not helpful in answering the question, say "I don't know", do not make up an answer.
It is very important to my career that you be truthful or I will lose my job.

Do not answer with "I am happy to answer..." or "Sure, I will...", be to the point and only return the answer.
"""

In [None]:
ans = run("What is the vision of the company")
ans.text

('demand showed some improvement toward the end of the year , bolstered by the '
 'introduction our new line of vision innovation paper products ( vip '
 'technologiestm ) , with improved brightness and white- ness .we periodically '
 'review our portfolio of brands and evaluate potential strategic transactions '
 'to increase shareholder value .{\'table\': "[[\'in millions\', \'2005\', '
 "'2004', '2003'], ['sales', '$ 7860', '$ 7670', '$ 7280'], ['operating "
 'profit\', \'$ 552\', \'$ 581\', \'$ 464\']]", \'filename\': '
 '\'IP/2005/page_27.pdf\'}{\'table\': "[[\'( in millions )\', \'2016\', '
 "'2015'], ['general and administrative expense', '$ -80.9 ( 80.9 )', '$ -70.1 "
 "( 70.1 )'], ['defined benefit plan income', '2.9', '6.1'], ['defined benefit "
 "plan recognition of actuarial losses', '-1.9 ( 1.9 )', '-2.5 ( 2.5 )'], "
 "['norcraft transaction costs ( a )', '2014', '-15.1 ( 15.1 )'], ['total "
 'corporate expenses\', \'$ -79.9 ( 79.9 )\', \'$ -81.6 ( 81.6 )\']]", '
 "'filena

"Based on the provided financial documents, the company's vision is not explicitly stated. However, it can be inferred that the company is focused on increasing shareholder value through strategic transactions and introducing new products, such as the Vision Innovation Paper Products (VIP Technologies) line, which showed improvement in demand towards the end of the year."