## All imports

In [1]:
from langchain_core.documents import Document
from langchain_community.llms import Ollama
from langchain_text_splitters import RecursiveCharacterTextSplitter     #
# from langchain.document_loaders.pdf import PyPDFDirectoryLoader         # For loading pdf from directories
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_core.document_loaders import Blob
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS      ## I am using this instead of Chroma because I only have FAISS installed currently
from langchain.prompts import ChatPromptTemplate

import pypdf
from pypdf import PdfReader
from typing import List


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


## Just to test if the model is properly working or not

In [25]:
llm = Ollama(model="llama3.1")
llm.invoke("Apple launched its first iphone in the year")

"2007. The iPhone was announced by Steve Jobs, Apple's co-founder and CEO at the time, on January 9, 2007, and it went on sale to the public on June 29, 2007."

## Part - 1

### 1. Loading the documents and splitting them

In [2]:
def load_documents(data_path: str):
    """This Document loader will load the pdf documents and return the text part of the documents"""
    document_loader = PyPDFDirectoryLoader(data_path) # There is an option to "extract_images" from pdf and then process them 
    return document_loader.load()


def split_documents(documents: List[Document]):
    """ The text splitter will split the documents such that there exists some over lap between the split docs,
    chunks are in Document format Document(metadata={'source': ..., 'page': ...}, page_content= ' ')"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


In [3]:
documents = load_documents("")
chunks = split_documents(documents)

In [22]:
documents[0].page_content

' \n \n \n \nRef No: KIVL/BSE/SEC/ 786  \n \n                                                                                                                                                  September  19, 202 4  \n \nDepartment of Corporate Services  \nBSE Limited  \nFloor 25, PJ Towers Dalal Street,  \nMumbai - 400001  \n \n \nDear Sirs,  \n \nSub: Correction of inadvertent  error in the Book Closure Intimation in the Board Meeting Outcome of \nthe Company dated 2nd September 2024  \n \nRef: Intimation dtd.02.09.2024 (Ref No: KIVL/BSE/SEC/780) about Book Closure Intimation in the \nBoard Meeting Outcome of the Company dated 2nd September 2024  \n \nWith reference to the above captioned subject please note that there was an inadvertent error in the \nBook Closure Intimation .  We request you to  note the dates of book closure  are from ( 24th September \n2024  to 29th September 2024 )   instead of   (22nd September 2024 to 29th September 2024 ) as \nmistakenly mentioned in the  Board 

In [4]:
chunks

[Document(metadata={'source': '787af145-da18-4133-bb5b-5f12a382f7c5.pdf', 'page': 0}, page_content='Ref No: KIVL/BSE/SEC/ 786  \n \n                                                                                                                                                  September  19, 202 4  \n \nDepartment of Corporate Services  \nBSE Limited  \nFloor 25, PJ Towers Dalal Street,  \nMumbai - 400001  \n \n \nDear Sirs,  \n \nSub: Correction of inadvertent  error in the Book Closure Intimation in the Board Meeting Outcome of \nthe Company dated 2nd September 2024  \n \nRef: Intimation dtd.02.09.2024 (Ref No: KIVL/BSE/SEC/780) about Book Closure Intimation in the \nBoard Meeting Outcome of the Company dated 2nd September 2024  \n \nWith reference to the above captioned subject please note that there was an inadvertent error in the \nBook Closure Intimation .  We request you to  note the dates of book closure  are from ( 24th September \n2024  to 29th September 2024 )   instead of  

### 2. Embeddings

### 2.1 Choosing an embeddings model and overall llm model

In [10]:
embedding_function = OllamaEmbeddings(
    model="llama3.1",
)
llm = Ollama(model="llama3.1")

### 2.2 Creation of embeddings

In [11]:
embeddings = embedding_function.embed_documents([chunk.page_content for chunk in chunks])

### 3. Selecting suitable embedding vector database

In [9]:
vectordb = FAISS.from_documents(documents=chunks,
                      embedding=embedding_function)
# vectordb.save_local("vectorstore_fr")

ResponseError: 404 page not found

In [12]:
vectordb = FAISS.load_local(folder_path="vectorstore_fr", embeddings=embedding_function, allow_dangerous_deserialization=True)
# FAISS.load_local()
# FAISS.load_local()

In [16]:
vectordb = FAISS.from_embeddings(text_embeddings=zip([chunk.page_content for chunk in chunks], embeddings), embedding=embedding_function)

In [13]:
res = vectordb.similarity_search("kings infra Ventures", k=4)
res

[Document(metadata={'source': '100fee26-23bb-4ca6-873d-19ea3960542f.pdf', 'page': 1}, page_content='Regd. Office: B-29, EEIE Stage II, Balanagar, \nHyderabad - 500 037, Telangana, INDIA  \nPhone: +91-40-23079310,11,12,13, Email: info@lokeshmachines.com \nWebsite: www.lokeshmachines.com, CIN: L29219TG1983PLC004319  \n \n \n \n \nUnits:  Balanagar, Bonthapally, Medcha l, Toopran, Ranjangaon-Pune. \nAnnexure-A \n \nDisclosures under Regulation 30 and Schedule III of  the SEBI Listing Regulations read with SEBI \nCircular No. SEBI/HO/CFD/CFD-PoD-1/P/CIR/2023/123 dated July 13, 2023  \n \nS. No. Particulars Details \na)  Name of the regulatory or licensing \nauthority Department for Promotion of Industry and Internal \nTrade, New Delhi. \nb)  Brief details of the approval/license \nobtained/ withdrawn/ surrendered Licence in Form VII for the manufacture of Fixed / \nTowed Heavy Machine Guns of caliber above 12.7mm and up to 30mm.\n \nc)  Impact/relevance of such \napproval/license to the li

In [34]:
chunks

[Document(metadata={'source': '787af145-da18-4133-bb5b-5f12a382f7c5.pdf', 'page': 0}, page_content='Ref No: KIVL/BSE/SEC/ 786  \n \n                                                                                                                                                  September  19, 202 4  \n \nDepartment of Corporate Services  \nBSE Limited  \nFloor 25, PJ Towers Dalal Street,  \nMumbai - 400001  \n \n \nDear Sirs,  \n \nSub: Correction of inadvertent  error in the Book Closure Intimation in the Board Meeting Outcome of \nthe Company dated 2nd September 2024  \n \nRef: Intimation dtd.02.09.2024 (Ref No: KIVL/BSE/SEC/780) about Book Closure Intimation in the \nBoard Meeting Outcome of the Company dated 2nd September 2024  \n \nWith reference to the above captioned subject please note that there was an inadvertent error in the \nBook Closure Intimation .  We request you to  note the dates of book closure  are from ( 24th September \n2024  to 29th September 2024 )   instead of  

### Retrieval

In [30]:
res

[Document(metadata={}, page_content='Regd. Office: B-29, EEIE Stage II, Balanagar, \nHyderabad - 500 037, Telangana, INDIA  \nPhone: +91-40-23079310,11,12,13, Email: info@lokeshmachines.com \nWebsite: www.lokeshmachines.com, CIN: L29219TG1983PLC004319  \n \n \n \n \nUnits:  Balanagar, Bonthapally, Medcha l, Toopran, Ranjangaon-Pune. \nSeptember 19, 2024 \n  \nTo, \nBSE Limited Department of Corporate Services Floor 25, PJ Towers, Dalal Street Mumbai- 400001   Scrip Code: 532740  To, \nNational Stock Exchan ge of India Limited \nListing Department Plot No. C/1, G Block, Exchange Plaza, Bandra Kurla Complex, Bandra(E), Mumbai- 400051  Company Code: LOKESHMACH  \n Dear Sir/Madam,  Sub: Disclosure pursuant to Regulation 30 of  SEBI (Listing Obligations and Disclosure \nRequirements) Regulations, 2015  In compliance with Regulation 30 of the SEBI (L isting Obligations and Disclosure Requirements) \nRegulations, 2015 (“SEBI Listing Re gulations”), we hereby inform you that an intimation has 

In [15]:
template = """You are a financial advisor and have knowledge to read and understand financial announcements. So answer the question based only on the following context:
            {context}
            Question:{question}"""
prompt = ChatPromptTemplate.from_template(template=template)
prompt
            

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='You are a financial advisor and have knowledge to read and understand financial announcements. So answer the question based only on the following context:\n            {context}\n            Question:{question}'), additional_kwargs={})])

In [23]:
prompt = template.replace("{context}", documents[0].page_content ).replace("{question}", "what is the date of book closure for kings infra")
prompt

'You are a financial advisor and have knowledge to read and understand financial announcements. So answer the question based only on the following context:\n             \n \n \n \nRef No: KIVL/BSE/SEC/ 786  \n \n                                                                                                                                                  September  19, 202 4  \n \nDepartment of Corporate Services  \nBSE Limited  \nFloor 25, PJ Towers Dalal Street,  \nMumbai - 400001  \n \n \nDear Sirs,  \n \nSub: Correction of inadvertent  error in the Book Closure Intimation in the Board Meeting Outcome of \nthe Company dated 2nd September 2024  \n \nRef: Intimation dtd.02.09.2024 (Ref No: KIVL/BSE/SEC/780) about Book Closure Intimation in the \nBoard Meeting Outcome of the Company dated 2nd September 2024  \n \nWith reference to the above captioned subject please note that there was an inadvertent error in the \nBook Closure Intimation .  We request you to  note the dates of book c

In [27]:
llm.invoke(prompt)

'Based on the context provided, the correct dates of book closure for Kings Infra Ventures Limited are:\n\n**24th September 2024 to 29th September 2024**\n\nThese dates supersede the previously mentioned incorrect dates (22nd September 2024 to 29th September 2024) in the Board Meeting outcome dated 2nd September 2024.'

## Reading a result report

In [11]:
from unstructured.partition.pdf import partition_pdf

ImportError: cannot import name 'open_filename' from 'pdfminer.utils' (/Users/balakrishnareddy/anaconda3/envs/llm_v1/lib/python3.12/site-packages/pdfminer/utils.py)

In [9]:
PROJECT_ID = "YOUR_PROJECT_ID"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# For Vector Search Staging
GCS_BUCKET = "YOUR_BUCKET_NAME"  # @param {type:"string"}
GCS_BUCKET_URI = f"gs://{GCS_BUCKET}"

### Using Pymupdf for reading table data from pdf.

In [28]:
data_url = "https://www.bseindia.com/xml-data/corpfiling/AttachLive/bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf"

In [36]:
import requests
import pymupdf 

In [32]:

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

response = requests.get(data_url, headers=headers)

In [33]:
response.status_code

200

In [40]:
with open("bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf", 'wb') as f: f.write(response.content)

In [41]:
doc = pymupdf.open("bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf")

In [51]:
for i in doc:
    tabs = i.find_tables()
    print(f"{len(tabs.tables)} table(s) on {i}")
    if len(tabs.tables) > 0:
        print([type(j) for j in tabs])
    # if len(tabs.tables) > 0:
    #     for tab in tabs:
    #         display(tab.to_pandas())


0 table(s) on page 0 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
3 table(s) on page 1 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
[<class 'pymupdf.table.Table'>, <class 'pymupdf.table.Table'>, <class 'pymupdf.table.Table'>]
1 table(s) on page 2 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
[<class 'pymupdf.table.Table'>]
0 table(s) on page 3 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
2 table(s) on page 4 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
[<class 'pymupdf.table.Table'>, <class 'pymupdf.table.Table'>]
1 table(s) on page 5 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
[<class 'pymupdf.table.Table'>]
1 table(s) on page 6 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
[<class 'pymupdf.table.Table'>]
1 table(s) on page 7 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
[<class 'pymupdf.table.Table'>]
1 table(s) on page 8 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
[<class 'pymupdf.table.Table'>]
0 table(s) on page 9 of bba9ac32-1e95-4c4a-a1fb-7ca9742c180e.pdf
1 table(s) on page 10 of bba9ac32

In [53]:
table_data = doc[6].find_tables()

In [62]:
table_data.tables[0].to_pandas()

Unnamed: 0,"TATA CONSULTANCY SERVICES LIMITED\nRegistered Office: 9th Floor, Nirmal Building, Nariman Point, Mumbai 400 021\nCIN: L22210MH1995PLC084781\nTel: +91 22 6778 9595 e-mail: investor.relations@tcs.com Website: www.tcs.com\nAudited Consolidated Interim Statement of Financial Results\n( crore)",Col1,Col2,Col3,Col4,Col5,Col6
0,,Three months ended,,,Six months ended,,Year ended
1,,"September 30,\n2024","June 30,\n2024","September 30,\n2023","September 30,\n2024","September 30,\n2023","March 31,\n2024"
2,Revenue from operations\nOther income\nTOTAL I...,"64,259\n729","62,613\n962","59,692\n1,006","1,26,872\n1,691","1,19,073\n2,403","2,40,893\n4,422"
3,,64988,63575,60698,128563,121476,245315
4,,"36,654\n3,230\n162\n1,266\n7,644","36,416\n2,151\n173\n1,220\n7,384","35,123\n462\n159\n1,263\n8,361","73,070\n5,381\n335\n2,486\n15,028","70,271\n968\n322\n2,506\n17,090","1,40,131\n3,702\n778\n4,985\n32,764"
5,,48956,47344,45368,96300,91157,182360
6,,"16,032\n-","16,231\n-","15,330\n-","32,263\n-","30,319\n-","62,955\n958"
7,,"16,032\n4,078\n(1)","16,231\n4,290\n(164)","15,330\n3,955\n(5)","32,263\n8,368\n(165)","30,319\n7,823\n(4)","61,997\n15,864\n34"
8,,4077,4126,3950,8203,7819,15898
9,,11955,12105,11380,24060,22500,46099


## Scratch

In [5]:
chunks

[Document(metadata={'source': '787af145-da18-4133-bb5b-5f12a382f7c5.pdf', 'page': 0}, page_content='Ref No: KIVL/BSE/SEC/ 786  \n \n                                                                                                                                                  September  19, 202 4  \n \nDepartment of Corporate Services  \nBSE Limited  \nFloor 25, PJ Towers Dalal Street,  \nMumbai - 400001  \n \n \nDear Sirs,  \n \nSub: Correction of inadvertent  error in the Book Closure Intimation in the Board Meeting Outcome of \nthe Company dated 2nd September 2024  \n \nRef: Intimation dtd.02.09.2024 (Ref No: KIVL/BSE/SEC/780) about Book Closure Intimation in the \nBoard Meeting Outcome of the Company dated 2nd September 2024  \n \nWith reference to the above captioned subject please note that there was an inadvertent error in the \nBook Closure Intimation .  We request you to  note the dates of book closure  are from ( 24th September \n2024  to 29th September 2024 )   instead of  