In [43]:
from pdf2image import convert_from_path
from dotenv import load_dotenv
load_dotenv()
import os
doc_path = os.getenv('pdf_coop')
pages = convert_from_path(doc_path)

In [44]:
import cv2
import numpy as np

def deskew(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.bitwise_not(gray)
    coords = np.column_stack(np.where(gray > 0))
    angle = cv2.minAreaRect(coords)[-1]
    
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    return rotated

In [45]:
import pytesseract

def extract_text_from_image(image):
    text = pytesseract.image_to_string(image)
    return text

In [46]:
extracted_texts = []

for page in pages:
    # Step 2: Preprocess the image (deskew)
    preprocessed_image = deskew(np.array(page))

    # Step 3: Extract text using OCR
    text = extract_text_from_image(preprocessed_image)
    extracted_texts.append(text)

In [47]:
from pprint import pprint
pprint(extracted_texts[5])

('GAMME\n'
 '\n'
 'JUNIOR SAVINGS ACCOUNT\n'
 'PRODUCT OVERVIEW:\n'
 '\n'
 'This is specifically crafted for children aged\n'
 "0-14 years. It is established in the child's name\n"
 'but is typically managed by parents or guard-\n'
 'ians until the child reaches an age where\n'
 'he/she can independently oversee their\n'
 'finances.\n'
 '\n'
 'Key Features\n'
 '\n'
 'e\n'
 '\n'
 'The account is registered in the\n'
 "child's name, setting the founda-\n"
 'tion for their financial future.\n'
 'Bears an interest rate of 7.175%,\n'
 'fostering the growth of their sav-\n'
 'ings.\n'
 '\n'
 'Facilitates financial independence\n'
 'by enabling withdrawals when the\n'
 'junior accountholder reaches the\n'
 'age of youth.\n'
 '\n'
 'Children demonstrating indepen-\n'
 'dent income, who initiate their\n'
 'account, enjoy unrestricted access\n'
 'to their deposits.\n'
 '\n'
 'Receive a Birr 100 credit incentive\n'
 'when the average six-month\n'
 'deposit level reaches Birr 30,000\n'
 'and above

In [48]:
import re

def clean_text(text):
    # Step 1: Remove hyphenated line breaks (e.g., 'guard-\n ians' -> 'guardians')
    cleaned_text = re.sub(r'-\n\s*', '', text)
    
    # Step 2: Replace single newlines between lines with spaces
    cleaned_text = re.sub(r'(?<!\n)\n(?!\n)', ' ', cleaned_text)
    match = re.search(r'PAGE (\d+)', text)
    
    # Step 3: Remove "PRODUCT CATALOG" and everything after
    cleaned_text = re.sub(r'PRODUCT CATALOG.*$', '', cleaned_text, flags=re.DOTALL).strip()

    # Step 4: Capture the page number and split the text at "PAGE xx"

    
    if match:
        # print("match startes at: ",match.start())
        # Split into content and page sections
        content = cleaned_text[:match.start()].strip()
        page = match.group()
        # match_obj.group()
    else:
        content = cleaned_text
        page = None  # No page section found
    
    return content
# cleaned = clean_text(extracted_text[5])
full_text = []
for extracted_text in extracted_texts:
    full_text.append(clean_text(extracted_text))   

    # print(key)

In [49]:
# Example dictionary
example_dict = {
    'content': "This is the content.",
    'page': '04'
}

# Loop through the dictionary to get keys and values
for key, value in example_dict.items():
    print(f"Key: {key}, Value: {value}")


Key: content, Value: This is the content.
Key: page, Value: 04


In [50]:
import re

def find_page_match(text):
    # Define the regular expression pattern to find "PAGE xx"
    page_match = re.search(r'PAGE \d+', text)
    
    if page_match:
        print(page_match)  # This will output the match object
        return page_match
    else:
        print("No page number found")
        return None

# cleaned_text = re.sub(r'PRODUCT CATALOG.*$', '', cleaned_text, flags=re.DOTALL).strip()

# Find the page match
match_obj = find_page_match(extracted_text[5])

# If needed, access properties of the match object
if match_obj:
    print("Span:", match_obj.span())  # Output: (1663, 1670)
    print("Start:", match_obj.start())
    print("Matched text:", match_obj.group())  # Output: 'PAGE 04'


No page number found


In [51]:

# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# splits = text_splitter.split_text(extracted_text)
# splits[0]