In [25]:

import pytesseract
from PIL import Image, ImageEnhance
import pandas as pd
import re
import os
import cv2

# Set path to Tesseract
pytesseract.pytesseract.tesseract_cmd = "C:/Users/przem/anaconda3/Lib/site-packages/pytesseract/tesseract.exe"


In [26]:

def preprocess_image(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return Image.fromarray(image)

def extract_text_from_image(image_path):
    image = preprocess_image(image_path)
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(image, lang='pol', config=custom_config)
    lines = text.split('\n')
    if "Paragon fiskalny" in text:
        start_index = lines.index(next(line for line in lines if "Paragon fiskalny" in line))
        lines = lines[start_index + 1:]  # Extract everything that is under "Paragon fiskalny"
    return lines


In [27]:
def extract_amount(line):
    # Search for value after 'x'
    match = re.search(r'\bx\s*([\d,]+)', line)
    if match:
        kwota = match.group(1).replace(',', '.')
    else:
        # Search for value by number '1' or '|' if 'x' is not present
        match = re.search(r'\b1\s*([\d,]+)', line)
        if not match:
            match = re.search(r'\|\s*([\d,]+)', line)
        kwota = match.group(1).replace(',', '.') if match else ''
    
    # Add a comma after two digits from the right if no comma is found
    if kwota and '.' not in kwota:
        kwota = kwota[:-2] + '.' + kwota[-2:]
    
    return kwota


In [28]:
def extract_detail(line):
    # Search for a string from the beginning to the first encountered digit
    match = re.search(r'^\D+', line)
    if match:
        szczegol = match.group(0).strip()
    else:
        szczegol = ''
    return szczegol


In [29]:
def extract_quantity(line):
    digit_found = False
    space_found = False
    quantity = ""
    for char in line:
        if char.isdigit() and not digit_found:
            digit_found = True
        elif digit_found and not space_found:
            if char.isspace():
                space_found = True
        elif space_found:
            if char == 'x':
                break
            quantity += char
    quantity = quantity.replace('i', '1').replace('||', '1').replace('|', '1').strip()
    return quantity if quantity else '1'  # If nothing found, default to 1


In [30]:
def process_receipts_folder(folder_path):
    all_Data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg') or filename.endswith('.png'):
            image_path = os.path.join(folder_path, filename)
            lines = extract_text_from_image(image_path)
            # Separation of store name and date
            parts = filename.replace('.jpg', '').replace('.png', '').rsplit(' ', 1)
            firma = parts[0].strip()  # Extracting store name from file name
            data = parts[1].replace('_IMG', '').strip()  # Extracting date from filename
            for line in lines:
                kwota = extract_amount(line)
                szczegol = extract_detail(line)
                ilosc = extract_quantity(line)
                all_Data.append({
                    'Date': data,  
                    'Ammount': kwota, 
                    'Type': 'expenditure', 
                    'Category': '', 
                    'Constant/Variable': 'variable', 
                    'Detail': szczegol,
                    'Company': firma, 
                    'Quantity': ilosc,
                    'Full line': line.strip()
                })

    df = pd.DataFrame(all_Data)
    return df


folder_path = 'C:/Users/przem/Desktop/bilans/Paragony'
df = process_receipts_folder(folder_path)


columns_order = ['Date', 'Ammount', 'Type', 'Category', 'Constant/Variable', 'Detail', 'Company', 'Quantity', 'Full line']
df = df[columns_order]

# Filtering DataFrame
df_filtered = df[(df['Ammount'].apply(lambda x: len(str(x)) >= 4 and x != '')) & (df['Detail'] != 'BTIU') & (df['Detail'] != 'SPRZEDAŻ OPODATK. B')]

# Index reset
df_filtered.reset_index(drop=True, inplace=True)

# Copying filtered DataFrame to clipboard without headers and indexes
df_filtered.to_clipboard(index=False, header=False)

# Displaying the first 50 rows of a filtered DataFrame
df_filtered.head(50)


Unnamed: 0,Date,Ammount,Type,Category,Constant/Variable,Detail,Company,Quantity,Full line
0,20241206,164.84,expenditure,,variable,RĘCZKUCH. :,auchan,"1 28,99 — 8,998","RĘCZKUCH. : 116484 1 28,99 — 8,998"
1,20241206,4.27,expenditure,,variable,VEBAN KOSTK,auchan,1,"VEBAN KOSTK 318059 1 x4,27 4,278"
2,20241206,1.68,expenditure,,variable,SQH BUDY —,auchan,1,"SQH BUDY — 186633 1 x1,68 1,68"
3,20241206,5.58,expenditure,,variable,,auchan,AUCHAN : 275682 ],"505 AUCHAN : 275682 ] x5,58 5,588"
4,20241206,3.08,expenditure,,variable,JOGURT SKY,auchan,1,"JOGURT SKY 657987 1 x3,08 3,08€"
5,20241206,3.08,expenditure,,variable,JORURT SKY,auchan,1,"JORURT SKY 657987 1 x3,08 3,08C ."
6,20241206,1.68,expenditure,,variable,BUDY — DR,auchan,220190 1,"BUDY — DR 0 220190 1 x1,68. 1, 680"
7,20241206,1.68,expenditure,,variable,SŁ .CHWILA,auchan,1,"SŁ .CHWILA 220199€ 1 x1,68 1,6"
8,20241206,29.99,expenditure,,variable,OMIDOR DA,auchan,"1 29,99 9,","OMIDOR DA 693069 1 29,99 9,"
9,20241206,7.04,expenditure,,variable,DEVELEY,auchan,1,"DEVELEY 987854 1 x7,04 i ,04B"
