In [1]:
from pdf2image import convert_from_path
from dotenv import load_dotenv
load_dotenv()
import os
doc_path = os.getenv('pdf_coop')
pages = convert_from_path(doc_path)

In [2]:
import cv2
import numpy as np

def deskew(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.bitwise_not(gray)
    coords = np.column_stack(np.where(gray > 0))
    angle = cv2.minAreaRect(coords)[-1]
    
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    return rotated

In [3]:
import pytesseract

def extract_text_from_image(image):
    text = pytesseract.image_to_string(image)
    return text

In [4]:
extracted_texts = []

for page in pages:
    # Step 2: Preprocess the image (deskew)
    preprocessed_image = deskew(np.array(page))

    # Step 3: Extract text using OCR
    text = extract_text_from_image(preprocessed_image)
    extracted_texts.append(text)

In [5]:
from pprint import pprint
pprint(extracted_texts[5])

('GAMME\n'
 '\n'
 'JUNIOR SAVINGS ACCOUNT\n'
 'PRODUCT OVERVIEW:\n'
 '\n'
 'This is specifically crafted for children aged\n'
 "0-14 years. It is established in the child's name\n"
 'but is typically managed by parents or guard-\n'
 'ians until the child reaches an age where\n'
 'he/she can independently oversee their\n'
 'finances.\n'
 '\n'
 'Key Features\n'
 '\n'
 'e\n'
 '\n'
 'The account is registered in the\n'
 "child's name, setting the founda-\n"
 'tion for their financial future.\n'
 'Bears an interest rate of 7.175%,\n'
 'fostering the growth of their sav-\n'
 'ings.\n'
 '\n'
 'Facilitates financial independence\n'
 'by enabling withdrawals when the\n'
 'junior accountholder reaches the\n'
 'age of youth.\n'
 '\n'
 'Children demonstrating indepen-\n'
 'dent income, who initiate their\n'
 'account, enjoy unrestricted access\n'
 'to their deposits.\n'
 '\n'
 'Receive a Birr 100 credit incentive\n'
 'when the average six-month\n'
 'deposit level reaches Birr 30,000\n'
 'and above

In [6]:
import re

def clean_text(text):
    # Step 1: Remove hyphenated line breaks (e.g., 'guard-\n ians' -> 'guardians')
    cleaned_text = re.sub(r'-\n\s*', '', text)
    
    # Step 2: Replace single newlines between lines with spaces
    cleaned_text = re.sub(r'(?<!\n)\n(?!\n)', ' ', cleaned_text)
    match = re.search(r'PAGE (\d+)', text)
    
    # Step 3: Remove "PRODUCT CATALOG" and everything after
    cleaned_text = re.sub(r'PRODUCT CATALOG.*$', '', cleaned_text, flags=re.DOTALL).strip()

    # Step 4: Capture the page number and split the text at "PAGE xx"

    
    if match:
        # print("match startes at: ",match.start())
        # Split into content and page sections
        content = cleaned_text[:match.start()].strip()
        page = match.group()
        # match_obj.group()
    else:
        content = cleaned_text
        page = None  # No page section found
    
    return content
# cleaned = clean_text(extracted_text[5])
full_text = []
for extracted_text in extracted_texts:
    full_text.append(clean_text(extracted_text))   
full_text


['PRODUCT\n\nCATALOG',
 'CONTENTS\n\nT. INTFOCGUCTION ....ccccccccocccccsccccecccsccccccccsccccoccccccccocccccsccoccsccsccccceccescccccccccee |\n\nVD BaCkQroUind) .....csssssssssssssssssssssnssssssessssssssssceseceeeeeeeeeeeeessssssnssnsnnusnusnummsssssssssssseseeeeeeeeeeeseeseessseesea ] V.2 ODJOCTIVE woieeesssssccccecceeeeeeeeeessnsssssssnsnssmmusmmssssssssssssseceeceeeeeceeessessessssssnsnsuunnunmsessssssseseceeeeees ] VB MOtHOGOlOGY onecessssssesssssssessssnssssccsssssnnessssecesssssnssssesessssssunssssesssssssunusseesessssssusesseceesssssuueeseeeeesses 1\n\n2. Banking Products and Services ...cccccccccccccssscccsscccccccccccccccccccecccccccces 2\n\n2.1 CONVENTIONAL BANKING PrOCUCTS ou. eeeeecsseesssssssseessesssneseessnescessssecsssssusesesssneeeeessneeesessse 2 De POSsit PrOCUCTS wecsssssssssssssessesessesnsssssssssesssssseeeeceeeseeeeeeeeessiisitimmmunnsnssssssssssssesee 3 Local currency deposit products 3 Demand Deposit ACCOUNTS ...seesssssssesessssseeeeessunsssssseeesssnusssss

In [11]:
clean_text(extracted_texts[5])

"GAMME\n\nJUNIOR SAVINGS ACCOUNT PRODUCT OVERVIEW:\n\nThis is specifically crafted for children aged 0-14 years. It is established in the child's name but is typically managed by parents or guardians until the child reaches an age where he/she can independently oversee their finances.\n\nKey Features\n\ne\n\nThe account is registered in the child's name, setting the foundation for their financial future. Bears an interest rate of 7.175%, fostering the growth of their savings.\n\nFacilitates financial independence by enabling withdrawals when the junior accountholder reaches the age of youth.\n\nChildren demonstrating independent income, who initiate their account, enjoy unrestricted access to their deposits.\n\nReceive a Birr 100 credit incentive when the average six-month deposit level reaches Birr 30,000 and above.\n\nBENEFITS\n\nEstablish a financial nest egg for children as they approach adulthood.\n\nInstill a savings culture from an early age, empowering parents to teach valuable

In [22]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_nomic import NomicEmbeddings
import numpy as np

# Initialize the embedding and FAISS components
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
index = faiss.IndexFlatL2(len(embeddings.embed_query(" ")))
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),  # In-memory docstore
    index_to_docstore_id={}
)

docstore_id = 0  # Start with the first document ID

# Loop through the list of extracted texts
for extracted_text in extracted_texts:
    cleaned_content = clean_text(extracted_text)  # Clean the text
    
    if cleaned_content:  
        content_embedding = embeddings.embed_query(cleaned_content)
        content_embedding_np = np.array([content_embedding], dtype=np.float32)
        
        vector_store.index.add(content_embedding_np)  
        vector_store.docstore._dict[docstore_id] = cleaned_content
        
        vector_store.index_to_docstore_id[docstore_id] = docstore_id
 
        docstore_id += 1


In [25]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
url = os.environ.get("base_url")

retriever = vector_store.as_retriever()
llm = ChatOpenAI(
                model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
                base_url=url,
                api_key="lm-studio"
            )
# Set up system prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
    
])

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
answer= rag_chain.invoke({"input": "what is gamme?"})
answer['answer']

ValueError: Could not find document for id 5, got GAMME

JUNIOR SAVINGS ACCOUNT PRODUCT OVERVIEW:

This is specifically crafted for children aged 0-14 years. It is established in the child's name but is typically managed by parents or guardians until the child reaches an age where he/she can independently oversee their finances.

Key Features

e

The account is registered in the child's name, setting the foundation for their financial future. Bears an interest rate of 7.175%, fostering the growth of their savings.

Facilitates financial independence by enabling withdrawals when the junior accountholder reaches the age of youth.

Children demonstrating independent income, who initiate their account, enjoy unrestricted access to their deposits.

Receive a Birr 100 credit incentive when the average six-month deposit level reaches Birr 30,000 and above.

BENEFITS

Establish a financial nest egg for children as they approach adulthood.

Instill a savings culture from an early age, empowering parents to teach valuable financial lessons. Convenience of fee-free transactions, making banking interactions hassle-free.

Unlock additional incentives provided by the bank, subject to specified criteria.

ELIGIBLE CANDIDATES:

This account caters to newborns, infants, children under the age of fifteen, and those who engage in work up to the age of fifteen.

ay

TARGET CUSTOMERS

& Newborn at Hospital

& Student at school and Kindergarten

& NGOs that work with kids

CROSS SELLING eto PRODUCTS

WW * Gudunfa Gamme Saving account.

* When they are of age, Dargago saving account.

REMARKS

It is a unique opportunity to pave the way for a child's financial success.

ValueError: Could not find document for id 5, got GAMME

JUNIOR SAVINGS ACCOUNT PRODUCT OVERVIEW:

This is specifically crafted for children aged 0-14 years. It is established in the child's name but is typically managed by parents or guardians until the child reaches an age where he/she can independently oversee their finances.

Key Features

e

The account is registered in the child's name, setting the foundation for their financial future. Bears an interest rate of 7.175%, fostering the growth of their savings.

Facilitates financial independence by enabling withdrawals when the junior accountholder reaches the age of youth.

Children demonstrating independent income, who initiate their account, enjoy unrestricted access to their deposits.

Receive a Birr 100 credit incentive when the average six-month deposit level reaches Birr 30,000 and above.

BENEFITS

Establish a financial nest egg for children as they approach adulthood.

Instill a savings culture from an early age, empowering parents to teach valuable financial lessons. Convenience of fee-free transactions, making banking interactions hassle-free.

Unlock additional incentives provided by the bank, subject to specified criteria.

ELIGIBLE CANDIDATES:

This account caters to newborns, infants, children under the age of fifteen, and those who engage in work up to the age of fifteen.

ay

TARGET CUSTOMERS

& Newborn at Hospital

& Student at school and Kindergarten

& NGOs that work with kids

CROSS SELLING eto PRODUCTS

WW * Gudunfa Gamme Saving account.

* When they are of age, Dargago saving account.

REMARKS

It is a unique opportunity to pave the way for a child's financial success.

In [50]:
import re

def find_page_match(text):
    # Define the regular expression pattern to find "PAGE xx"
    page_match = re.search(r'PAGE \d+', text)
    
    if page_match:
        print(page_match)  # This will output the match object
        return page_match
    else:
        print("No page number found")
        return None

# cleaned_text = re.sub(r'PRODUCT CATALOG.*$', '', cleaned_text, flags=re.DOTALL).strip()

# Find the page match
match_obj = find_page_match(extracted_text[5])

# If needed, access properties of the match object
if match_obj:
    print("Span:", match_obj.span())  # Output: (1663, 1670)
    print("Start:", match_obj.start())
    print("Matched text:", match_obj.group())  # Output: 'PAGE 04'

No page number found
