In [1]:
!pip -q install langdetect easyocr pytesseract
!sudo apt -q install tesseract-ocr tesseract-ocr-ara
!pip -q install langchain_google_genai

Reading package lists...
Building dependency tree...
Reading state information...
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-ara is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
import numpy as np
import cv2
from pytesseract import pytesseract
import easyocr
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance, ImageFilter
import time
import os

# Tell pytesseract where the Tesseract executable lives. `pytesseract` here is the
# `pytesseract.pytesseract` submodule imported above, which owns `tesseract_cmd`.
# The path is a Linux location (no `.exe`); presumably where the apt package
# installs the binary on this runtime -- adjust it when running elsewhere.
pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

class ExtractTextOCR:
    """Extract text from an image with either Tesseract or EasyOCR.

    English is always enabled next to the requested language, so mixed-language
    documents (e.g. an Arabic invoice with Latin digits/e-mail addresses) are read
    in one pass.

    Args:
        target_size: ``(width, height)`` handed to ``cv2.resize`` before OCR, or
            ``None`` to keep the input resolution. The default reproduces the
            historical fixed resize, which ignores the source aspect ratio.
        sharpness: Factor passed to ``ImageEnhance.Sharpness(...).enhance``.
    """

    def __init__(self, target_size=(1440, 2560), sharpness=1.5):
        # Tesseract command-line flags, forwarded verbatim to image_to_string
        self.config = r'--oem 3 --psm 6'

        # Preprocessing knobs (defaults match the previously hard-coded values)
        self.target_size = target_size
        self.sharpness = sharpness

        # Supported languages: name -> (tesseract code, easyocr code)
        self.languages = {
            'english': ('eng', 'en'),
            'french': ('fra', 'fr'),
            'spanish': ('spa', 'es'),
            'arabic': ('ara', 'ar'),
            'german': ('deu', 'de'),
            'italian': ('ita', 'it'),
        }

        # EasyOCR readers are costly to construct, so they are built once per
        # language combination and reused by later run() calls.
        self._easyocr_readers = {}

    @staticmethod
    def _to_gray(array):
        """Return a single-channel view/copy of an OpenCV-style image array."""
        if array.ndim == 2:
            # Already grayscale; cvtColor(BGR2GRAY) would raise on this input.
            return array
        if array.ndim == 3 and array.shape[2] == 1:
            return array[:, :, 0]
        if array.ndim == 3 and array.shape[2] == 4:
            return cv2.cvtColor(array, cv2.COLOR_BGRA2GRAY)
        return cv2.cvtColor(array, cv2.COLOR_BGR2GRAY)

    @staticmethod
    def _with_english(english_code, lang_code):
        """Return the OCR language list: English first, no duplicate entry."""
        if lang_code == english_code:
            return [english_code]
        return [english_code, lang_code]

    def _get_easyocr_reader(self, langs):
        """Return a cached easyocr.Reader for ``langs``, creating it on first use."""
        key = tuple(langs)
        if key not in self._easyocr_readers:
            self._easyocr_readers[key] = easyocr.Reader(list(langs), gpu=True)
        return self._easyocr_readers[key]

    def prepare(self, image, show=False):
        """Preprocess an image (grayscale, resize, sharpen) to help OCR.

        Args:
            image: A file path (str), an OpenCV array (``np.ndarray``; grayscale,
                BGR or BGRA), or a ``PIL.Image.Image``.
            show: When true, display the preprocessed image with matplotlib.

        Returns:
            The preprocessed grayscale image as a ``np.ndarray``.

        Raises:
            ValueError: If the input type is unsupported or the file cannot be read.
        """
        img = None

        # Handle if image is a file path
        if isinstance(image, str):
            img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # imread signals a missing/undecodable file by returning None
                raise ValueError(f"Could not read an image from path {image!r}.")

        # Handle if image is an OpenCV matrix (np.ndarray)
        elif isinstance(image, np.ndarray):
            img = self._to_gray(image)

        # Handle if image is a PIL image
        elif isinstance(image, Image.Image):
            img = np.array(image.convert('L'))  # Convert to grayscale using PIL

        # If the image is not provided or invalid, raise an error
        if img is None:
            raise ValueError("Invalid image input. Please provide an image file path, OpenCV image matrix, or PIL image.")

        # Resize to the configured (width, height). The default is a fixed size
        # that does not preserve aspect ratio; pass target_size=None to skip it.
        if self.target_size is not None:
            img = cv2.resize(img, tuple(self.target_size), interpolation=cv2.INTER_AREA)
        enhancer = ImageEnhance.Sharpness(Image.fromarray(img))
        img = np.array(enhancer.enhance(self.sharpness))  # Increase sharpness

        # Show the enhanced image
        if show:
            plt.imshow(img, cmap='gray')
            plt.title('Enhanced Image')
            plt.axis('off')
            plt.show()
        return img

    def run(self, image_path, language, tesseract=True, show_img=False, verbose=0):
        """Run OCR on an image and return the extracted text.

        Args:
            image_path: Anything ``prepare`` accepts (path, ndarray or PIL image).
            language: Case-insensitive key of ``self.languages``.
            tesseract: Use Tesseract when true, EasyOCR otherwise.
            show_img: Forwarded to ``prepare`` as ``show``.
            verbose: When truthy, print the extracted text.

        Returns:
            The recognised text as a single string.

        Raises:
            ValueError: If ``language`` is not supported or the image is invalid.
        """
        # Validate the language before doing any image work
        key = language.lower()
        if key not in self.languages:
            raise ValueError(f"Language '{language}' is not supported. Please choose from {list(self.languages)}.")
        tesseract_lang, easyocr_lang = self.languages[key]

        # Preprocess the image
        preprocessed_image = self.prepare(image_path, show=show_img)
        if tesseract:
            # Tesseract takes several languages joined by '+'
            lang_spec = '+'.join(self._with_english('eng', tesseract_lang))
            text = pytesseract.image_to_string(preprocessed_image, lang=lang_spec, config=self.config)
        else:
            reader = self._get_easyocr_reader(self._with_english('en', easyocr_lang))
            text = " ".join(reader.readtext(preprocessed_image, detail=0, paragraph=True))

        if verbose:
            print(f"Extracted Text ({'tesseract' if tesseract else 'easyocr'}):", text)
        return text

    def test(self):
        """Placeholder; currently does nothing."""
        pass

In [3]:
if __name__ == "__main__":
    # Demo run: OCR the sample image with Tesseract using the Arabic (+English)
    # models and print the raw text. `text` is left in the namespace for the
    # cells that follow.
    language = 'arabic'
    text_model = ExtractTextOCR()
    text = text_model.run(
        image_path=r"/content/img.png",
        language=language,
        tesseract=True,
    )
    print(text)

#1024

BILLED TO: Really Great Company
PAY TO: Avery Davis

123 Anywhere St., Any City

123-456-7890
Bank Really Great Bank
Account Name John Smith
BSB 000-000
\ccount Number OOOO OOOO
DESCRIPTION RATE HOURS AMOUNT
Content Plan S50, hr | $200.00
Copy Writing $50/hr 2 S1O0.00
Website Design 850) hr 5 $250.00
Website Development $100/ht 3 $300,00
SEO $50, hr | $200.00
Sub-Total $1,250.00
Package Discount (30%) $375.00
TOTAL $875.00
Payment is required within 4 business days of invoice date, Please send
remittance to hello@reallygreatsite.com
Thank vou for your business.



In [6]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from google.colab import userdata
from langchain_core.prompts import PromptTemplate
from langchain.schema import StrOutputParser, AIMessage

# Read the Gemini API key from the Colab secret store instead of hard-coding it.
google_api_key = userdata.get("GOOGLE_API_KEY_1")

# Prompt that turns the OCR text into CSV rows. It asks for *only* CSV so the
# response can be written to a file without stripping prose or code fences.
prompt = PromptTemplate(
    input_variables=['input'],
    template="""
    You are an assistant that extracts invoice line items from OCR text.
    From the text below, extract the Description, Rate, Hours, and Amount of every line item.
    Respond with CSV only: a header row (Description,Rate,Hours,Amount) followed by one row per item.
    Wrap any value that contains a comma in double quotes. Do not add explanations or Markdown code fences.

    Text:
    {input}
    """
)

# Sampling settings. They are spread into the constructor as keyword arguments:
# `temperature` is a model field, whereas the previous `generation_config=`
# keyword is not a constructor field, so the temperature was never applied.
parameters = {'temperature': 0}
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",
                             api_key=google_api_key, **parameters)

# prompt -> chat model -> plain string
chain = prompt | llm | StrOutputParser()

# `text` is the OCR output produced by the earlier OCR cell.
response = chain.invoke({'input': text})

In [7]:
import csv

data = response

# Keep only real content lines: drop blank lines and Markdown code fences
# (``` / ```csv) that chat models frequently wrap around tabular output.
lines = [
    line for line in data.strip().splitlines()
    if line.strip() and not line.strip().startswith('```')
]

# Parse with csv.reader rather than str.split(',') so quoted fields such as
# "$1,250.00" stay in one cell; skipinitialspace tolerates ", " separators.
data_rows = list(csv.reader(lines, skipinitialspace=True))

# Saving the data into a CSV file (explicit UTF-8: the OCR text may be non-ASCII)
csv_file_path = 'output_data.csv'
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data_rows)

print(f"Data saved to {csv_file_path} successfully.")

Data saved to output_data.csv successfully.
