## Import modules

In [1]:
import json
import os
import re

import numpy as np
import pandas as pd

In [2]:
# Move to the repository root so the relative paths used below resolve.
# The guard makes the cell safe to re-run: without it every execution would
# climb one directory further up.
# NOTE(review): assumes the notebook is started from a folder named
# "notebooks" (the reference file path below starts with 'notebooks/') - confirm.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('./..')
os.getcwd()

'C:\\Users\\Michal\\Documents\\GitHub\\OCR-Receipts'

In [3]:
from scripts.content_detector import process_image
from scripts.content_detector import process_raw_content

## Load raw content

In [14]:
content_filepath = 'results/Paragon_2022-08-11_081131_300dpi/raw_content.txt'
ref_content_filepath = 'notebooks/Paragon_text.txt'

In [15]:
# Read the raw content and the reference content, both decoded as UTF-8
with open(content_filepath, encoding='utf-8') as raw_file:
    raw_content = raw_file.read()

with open(ref_content_filepath, encoding='utf-8') as ref_file:
    ref_content = ref_file.read()

In [16]:
# Print reference and raw lines side by side, each padded to 40 characters.
# zip() stops at the end of the shorter text.
ref_lines = ref_content.split('\n')
raw_lines = raw_content.split('\n')
for line_ref, line_raw in zip(ref_lines, raw_lines):
    print(f'{line_ref:<40}', f'{line_raw:<40}', sep=' | ')

BIEDRONKA “CODZIENNIE NISKIE CENY” 4884  | BIEDRONKA “CODZIENNIE NESKIE CENY” 4884 
02-690 WARSZAWA UL. BOKSERSKA 61         | 82-690 WARSZAWA UL. BOKSERSKA 64        
JERONIMO MARTINS POLSKA S.A.             | JERONIMO MARTINS POLSKA 5.A.            
62-025 KOSTRZYN UL.ŻNIWNA 5              | 62-025 KOSTRZYN UL. ZNIMNA 5            
                                         |                                         
NIP 7791011327 nr:185679                 | NIP 7794611327 nr 185679                
PARAGON FISKALNY                         | PARAGON FISKALNY                        
PassataSottGus700g D 1 x3,99 3,99D       | PassataSot tGus/08q b { «3,99 3,990     
Sos Madero 250ml B 1 x3,49 3,49B         | Sos Madera 250nt 5 1 «3,49 3,498        
PłNabłyszczFinish400 A 1 x13,99 13,99A   | PtNabtyszczFinishd00 A { x13,99 13, 99A 
Sól Kraft 2kg A 1 x5,95 5,95A            | Sol Kraft 2kg A { ¥5,95 5,954           
SprayPronto 250ml A 1 x8,99 8,99A        | SprayProntoZ50al A {1 ¥8,99 8,99A

In [17]:
# Keep the complete texts under *_all, then narrow raw_content / ref_content
# down to the segment that follows the first 'PARAGON FISKALNY' line.
raw_content_all = raw_content
ref_content_all = ref_content

body_marker = 'PARAGON FISKALNY\n'
raw_body = raw_content.split(body_marker)[1]
ref_body = ref_content.split(body_marker)[1]
raw_content, ref_content = raw_body, ref_body

## Process content

### Get shop name

In [18]:
# The shop name is looked up in the complete text (raw_content_all), not only
# in the receipt body.
# NOTE(review): `process_content` is not imported in any cell shown above
# (only `process_image` and `process_raw_content` are) - confirm the import,
# otherwise this raises NameError on a fresh kernel.
shopname = process_content.get_shop_name(raw_content_all)
shopname

'Biedronka'

### Get total sum

In [19]:
# The total is extracted from the receipt body (raw_content after the
# 'PARAGON FISKALNY' cut).
# NOTE(review): `process_content` is not imported in any cell shown above
# (only `process_image` and `process_raw_content` are) - confirm the import,
# otherwise this raises NameError on a fresh kernel.
total_sum = process_content.get_total_sum(raw_content)
total_sum

274.28

### Replace wrong characters

Often `1` is incorrectly recognized as `(` and `{`.

In [20]:
# Turn both look-alike characters back into the digit 1 in a single chain
content = raw_content.replace('(', '1').replace('{', '1')

Often `x` is incorrectly recognized as `«` and `¥`.

In [21]:
# Turn both look-alike characters back into the letter x
for wrong_char in ('«', '¥'):
    content = content.replace(wrong_char, 'x')

Often `-` is incorrectly recognized as `~`.

In [22]:
content = content.replace('~', '-')

Often `P` is incorrectly recognized as `?`.

In [23]:
content = content.replace('?', 'P')

In [24]:
for line_ref, line_raw, line in zip(ref_content.split('\n'), raw_content.split('\n'), content.split('\n')):
    print(f'{line_ref:<40}|{line_raw:<40}|{line:<40}')

PassataSottGus700g D 1 x3,99 3,99D      |PassataSot tGus/08q b { «3,99 3,990     |PassataSot tGus/08q b 1 x3,99 3,990     
Sos Madero 250ml B 1 x3,49 3,49B        |Sos Madera 250nt 5 1 «3,49 3,498        |Sos Madera 250nt 5 1 x3,49 3,498        
PłNabłyszczFinish400 A 1 x13,99 13,99A  |PtNabtyszczFinishd00 A { x13,99 13, 99A |PtNabtyszczFinishd00 A 1 x13,99 13, 99A 
Sól Kraft 2kg A 1 x5,95 5,95A           |Sol Kraft 2kg A { ¥5,95 5,954           |Sol Kraft 2kg A 1 x5,95 5,954           
SprayPronto 250ml A 1 x8,99 8,99A       |SprayProntoZ50al A {1 ¥8,99 8,99A       |SprayProntoZ50al A 11 x8,99 8,99A       
PiwoMahou0,33l A 2 x4,99 9,98A          |PiwoHahoud , 331 R 2 x4,99 9,58A        |PiwoHahoud , 331 R 2 x4,99 9,58A        
Napój Cola Zero 1l A 2 x4,99 9,98A      |Napoj Cola Zero (1 A 2 x4,99 9,98A      |Napoj Cola Zero 11 A 2 x4,99 9,98A      
OPUST -2,40                             |OPUST -2,40                             |OPUST -2,40                             
7,58A           

### Get products

In [26]:
# Split content string by new lines
content_split = content.split('\n')

In [27]:
# Drop lines that are exactly the empty string
content_split = [text_line for text_line in content_split if text_line != '']

In [28]:
# Group every product line with its discount lines.
# A line containing 'OPUST' and the line that follows it are appended to the
# entry of the preceding product, so a discounted product becomes a list of
# three strings and every other line a list of one string.
split_content_new = []

i = 0
while i < len(content_split):
    line = content_split[i]

    if 'OPUST' in line and split_content_new:
        # The discount line may be the very last line of the content; use an
        # empty string for the missing follow-up line instead of raising
        # IndexError, so the entry keeps its three-element shape.
        following_line = content_split[i+1] if i + 1 < len(content_split) else ''
        split_content_new[-1] += [line, following_line]
        i += 2

    else:
        # Regular line, or a discount line with no product before it
        # (there is no previous entry it could be attached to).
        split_content_new.append([line])
        i += 1

In [29]:
split_content_new

[['PassataSot tGus/08q b 1 x3,99 3,990'],
 ['Sos Madera 250nt 5 1 x3,49 3,498'],
 ['PtNabtyszczFinishd00 A 1 x13,99 13, 99A'],
 ['Sol Kraft 2kg A 1 x5,95 5,954'],
 ['SprayProntoZ50al A 11 x8,99 8,99A'],
 ['PiwoHahoud , 331 R 2 x4,99 9,58A'],
 ['Napoj Cola Zero 11 A 2 x4,99 9,98A', 'OPUST -2,40', '7,584'],
 ['BrefBriltSpring2x42g A 1 x10,99 10,994'],
 ['Gru Kon Luz B 1,040 x6,95 7,230'],
 ['Huszyna-Skarb 1,51 A 6 xt,59 9,548'],
 ['Hoda Nat LGaz 1,51 A 6 x1,89 11,34A'],
 ['Cebula tkg apak 0 1 3,49 3,490'],
 ['Brokut 500g szt 0 1 x4,49 4,490'],
 ['CZ0SNEK szt p 2 x1,79 3,580'],
 ['Winogrono jas luz D 0,496 x13,29 6,590', 'OPUST 3,42', '3,470'],
 ['Banan luz p> 9,762 x5,69 4,340'],
 ['Butkaseznsienlno/g D 6 x0,79 4,740'],
 ['PAPR CZER PL LUZ 0 0,784 x12,99 10,180', 'OPUST -5,4', '4,700'],
 ['Wafle Mix 400g 0 4 1,89 1,890'],
 ['Wafle Hix 100g B 1 1,89 1,890'],
 ['Jog Fruvitads 408g D 4 x2,28 11,400'],
 ['pap, pon, fasbhunz2y D 1 x4,94 4,940'],
 ['Ser Relikate 150g 0 1 x3,76 3, 760'],
 ['TAP

In [30]:
# Define regular expression patterns
# product_regex groups:
#   1 - product name (everything before the 1-2 character token that precedes the quantity)
#   2 - quantity, digits with an optional ',digits' part (group 3 is that optional part)
#   4 - unit price after an optional 'x'; a 't' is tolerated in the integer part
#   5 - total price: digits, ',' or '.', optional whitespace, up to two digits
product_regex = re.compile(r'(.+)\s+\S{1,2}\s+(\d+(,\d+)?) x?([t\d]+,\s?\d{2}) (\d+[,.]\s?\d{0,2})')
# discount_regex: 'OPUST ', an optional minus sign, then the amount as group 1
discount_regex = re.compile(r'OPUST [-]?(\d+[,.]?\s?\d{0,2})')
# total_product_price_regex: digits, ',' or '.', optional whitespace, up to two digits
total_product_price_regex = re.compile(r'\d+[,.]\s?\d{0,2}')

In [31]:
product_result = product_regex.match(split_content_new[0][0])
product_result

<re.Match object; span=(0, 34), match='PassataSot tGus/08q b 1 x3,99 3,99'>

In [32]:
product_result.groups()

('PassataSot tGus/08q', '1', None, '3,99', '3,99')

In [33]:
product_result.group(1)

'PassataSot tGus/08q'

In [40]:
# NOTE(review): this rebinds `discount_regex`, which an earlier cell already
# defined with a single capture group. Here the amount is split in two:
# group 1 is the part before the separator (',', '.' or space) and group 2 the
# part after it, so group(1) alone no longer holds the whole amount.
discount_regex = re.compile(r'OPUST -?(\w+)[,. ]+(\w+)')

In [41]:
discount_result = discount_regex.search(' OPUST 1 82')
discount_result

<re.Match object; span=(1, 11), match='OPUST 1 82'>

In [47]:
discount_result.groups()[0]

'1'

In [36]:
total_product_price_result = total_product_price_regex.match('9,580')
total_product_price_result

<re.Match object; span=(0, 4), match='9,58'>

In [37]:
total_product_price_result.group()

'9,58'

In [None]:
# Get dictionary with products
# Each entry of split_content_new is [product_line] or
# [product_line, discount_line, price_after_discount_line].
# Lines that do not match product_regex are skipped.

# Decimal comma, optionally surrounded by a space; compiled once, not per item
qty_price_regex = re.compile(r'\s?,\s?')

products = []
for item in split_content_new:
    result = product_regex.match(item[0])

    if result:
        product = {
            'name': result.group(1),
            'qty': qty_price_regex.sub('.', result.group(2)),
            'unit_price': qty_price_regex.sub('.', result.group(4)),
            'total_price': qty_price_regex.sub('.', result.group(5))
        }

        if len(item) > 1:
            # Get discount
            discount_match = discount_regex.search(item[1])
            if discount_match:
                # discount_regex is defined twice in this notebook: once with
                # the whole amount in one group and once split into two groups.
                # Joining all captured parts yields the full amount for both,
                # whereas group(1) alone would drop the decimals for the latter.
                discount = '.'.join(part for part in discount_match.groups() if part)
            else:
                # Keep the raw text: float() below fails on it and the user is
                # asked for the value instead of the cell crashing on None.
                discount = item[1]
            discount = qty_price_regex.sub('.', discount)

            # Get total price with discount
            final_price_text = item[2] if len(item) > 2 else ''
            final_price_match = total_product_price_regex.match(final_price_text)
            if final_price_match:
                final_price = final_price_match.group()
            else:
                # Same fallback as for the discount: let the user enter it
                final_price = final_price_text
            final_price = qty_price_regex.sub('.', final_price)

        else:
            discount = None
            final_price = product['total_price']

        product['total_discount'] = discount
        product['total_price_with_discount'] = final_price

        # Convert to numeric (every property except the name)
        for key in list(product.keys())[1:]:
            if product[key] is None:
                # No discount for this product - nothing to convert
                continue

            try:
                product[key] = float(product[key])
            except ValueError as e:
                print(f'\nError occurred for item: {item}')
                print(e)

                # Set correct value
                while True:
                    try:
                        value = float(input('Enter correct value: '))
                    except ValueError:
                        print('Wrong value! Enter correct number!')
                    else:
                        product[key] = value
                        break

        # Add product to list
        products.append(product)

In [None]:
products

In [None]:
products_df = pd.DataFrame(products)
products_df

In [None]:
products_df.iloc[:, -1].sum()

In [None]:
for item in products:
    print(f"{item['name']:<25} {item['qty']:<5} x{item['unit_price']:<10} {item['total_price']}")

In [None]:
len(products)

### Assemble contents

In [None]:
extracted_content = {
    'shop_name': shopname,
    'items': products,
    'total_sum': total_sum
}

In [None]:
with open('content.json', 'w') as f:
    json.dump(extracted_content, f, indent=4)

## Process extracted content

In [5]:
# Load the previously extracted content.
# The encoding is given explicitly (as for the text files read earlier) so the
# result does not depend on the platform's default locale encoding.
extracted_content_filepath = 'results/Paragon_2022-08-11_081131_300dpi/extracted_content.json'
with open(extracted_content_filepath, encoding='utf-8') as f:
    extracted_content = json.load(f)

In [6]:
extracted_content

{'content_filepath': 'C:\\Users\\Michal\\Documents\\GitHub\\OCR-Receipts\\results\\Paragon_2022-08-11_081131_300dpi\\raw_content.txt',
 'shop_name': 'Biedronka',
 'items': [{'name': 'PassataSot tGus/08q',
   'qty': 1.0,
   'unit_price': 3.99,
   'total_price': 3.99,
   'total_discount': None,
   'final_price': 3.99},
  {'name': 'Sos Madera 250nt',
   'qty': 1.0,
   'unit_price': 3.49,
   'total_price': 3.49,
   'total_discount': None,
   'final_price': 3.49},
  {'name': 'PtNabtyszczFinishd00',
   'qty': 1.0,
   'unit_price': 13.99,
   'total_price': 13.99,
   'total_discount': None,
   'final_price': 13.99},
  {'name': 'Sol Kraft 2kg',
   'qty': 1.0,
   'unit_price': 5.95,
   'total_price': 5.95,
   'total_discount': None,
   'final_price': 5.95},
  {'name': 'SprayProntoZ50al',
   'qty': 11.0,
   'unit_price': 8.99,
   'total_price': 8.99,
   'total_discount': None,
   'final_price': 8.99},
  {'name': 'PiwoHahoud , 331',
   'qty': 2.0,
   'unit_price': 4.99,
   'total_price': 9.58,
   

In [7]:
items_df = pd.DataFrame(extracted_content['items'])
items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
0,PassataSot tGus/08q,1.0,3.99,3.99,,3.99
1,Sos Madera 250nt,1.0,3.49,3.49,,3.49
2,PtNabtyszczFinishd00,1.0,13.99,13.99,,13.99
3,Sol Kraft 2kg,1.0,5.95,5.95,,5.95
4,SprayProntoZ50al,11.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.58,,9.58
6,Napoj Cola Zero 11,2.0,4.99,9.98,2.4,7.58
7,BrefBriltSpring2x42g,1.0,10.99,10.99,,10.99
8,Gru Kon Luz,1.04,6.95,7.23,,7.23
9,"Huszyna-Skarb 1,51",6.0,False,9.54,,9.54


### Check incorrectly extracted properties

In [8]:
# Check if there are items with incorrect properties.
# A property that needs fixing shows up as the boolean False. Comparing with
# `== False` would also flag legitimate zeros (0.0 == False is True), so the
# value's type is checked as well.
is_failed = items_df.apply(
    lambda column: column.map(
        lambda value: isinstance(value, (bool, np.bool_)) and not value
    ).astype(bool)
)
incorrect_items_df = items_df.loc[is_failed.any(axis=1)]
incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
9,"Huszyna-Skarb 1,51",6.0,False,9.54,,9.54
27,Pier z seren 400g,2.0,5.7,11.4,False,9.58


In [9]:
# Correct incorrect properties
# For every flagged row the user is asked for a replacement of each property
# whose value is the boolean False; the answers go into a copy of the rows.
props = items_df.columns[1:]

corrected_incorrect_items_df = incorrect_items_df.copy()

for i in incorrect_items_df.index:
    item = incorrect_items_df.loc[i]
    print(f'\n{item}')

    # Only a genuine boolean False marks a property to fix; `item == False`
    # would also select legitimate zeros because 0.0 == False is True.
    is_failed = item.map(
        lambda value: isinstance(value, (bool, np.bool_)) and not value
    ).astype(bool)
    incorrect_properties = item.loc[is_failed]

    for prop_name, _ in incorrect_properties.items():
        print(f'\nIncorrect value for property "{prop_name}"')

        # Get correct value
        value = process_content.string_to_float(input('Enter correct value: '))

        # Set correct value in new df
        corrected_incorrect_items_df.loc[i, prop_name] = value


name              Huszyna-Skarb 1,51
qty                              6.0
unit_price                     False
total_price                     9.54
total_discount                  None
final_price                     9.54
Name: 9, dtype: object

Incorrect value for property "unit_price"
Enter correct value: 1,59

name              Pier z seren 400g
qty                             2.0
unit_price                      5.7
total_price                    11.4
total_discount                False
final_price                    9.58
Name: 27, dtype: object

Incorrect value for property "total_discount"
Enter correct value: 1,82


In [10]:
corrected_incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
9,"Huszyna-Skarb 1,51",6.0,1.59,9.54,,9.54
27,Pier z seren 400g,2.0,5.7,11.4,1.82,9.58


In [11]:
# Set corrected df - update the reference df
items_df.update(corrected_incorrect_items_df[props])
items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
0,PassataSot tGus/08q,1.0,3.99,3.99,,3.99
1,Sos Madera 250nt,1.0,3.49,3.49,,3.49
2,PtNabtyszczFinishd00,1.0,13.99,13.99,,13.99
3,Sol Kraft 2kg,1.0,5.95,5.95,,5.95
4,SprayProntoZ50al,11.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.58,,9.58
6,Napoj Cola Zero 11,2.0,4.99,9.98,2.4,7.58
7,BrefBriltSpring2x42g,1.0,10.99,10.99,,10.99
8,Gru Kon Luz,1.04,6.95,7.23,,7.23
9,"Huszyna-Skarb 1,51",6.0,1.59,9.54,,9.54


In [12]:
# Check if incorrect properties still exist.
# Only a genuine boolean False counts; `== False` would also match legitimate
# zeros because 0.0 == False is True.
items_df.loc[
    items_df.apply(
        lambda column: column.map(
            lambda value: isinstance(value, (bool, np.bool_)) and not value
        ).astype(bool)
    ).any(axis=1)
]

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price


### Check prices

In [13]:
# Ask user if extracted total_sum is correct
total_sum = extracted_content['total_sum']
print(f'Extracted total sum is {total_sum}')

# Normalise the answer so that 'Y', 'N ' and similar are understood;
# anything other than 'n' keeps the extracted value, as before.
do_get_total_sum = input('Is it correct (y/n)?').strip().lower()
if do_get_total_sum == 'n':
    # Accept a decimal comma ('274,28') as well as a decimal point;
    # float() alone rejects the comma form.
    total_sum = float(input('Enter correct value: ').strip().replace(',', '.'))

Extracted total sum is 274.28


KeyboardInterrupt: Interrupted by user

In [None]:
# Check if calculated total equals extracted total.
# Compares against `total_sum`, i.e. the value the user confirmed or corrected
# in the prompt cell, and rounds the difference to two decimals because a sum
# of floats is rarely exactly equal to the expected amount.
diff = round(abs(items_df['final_price'].sum() - total_sum), 2)
if diff == 0:
    print('Calculated and extracted total sum are equal')
else:
    print('Calculated and extracted total sum are not equal')
    print(f'The difference is {diff:.2f}')

In [None]:
# Check if total_price-total_discount=final_price
# Only rows that have a discount are considered.
discount_items_df = items_df.loc[~items_df['total_discount'].isna()]

# Round the difference to two decimals before comparing: binary floating
# point subtraction can be off by a tiny amount, which would report correct
# rows as mismatches. The cast lets round() work in case the columns are still
# object dtype (they are converted to float only in a later cell).
expected_final_price = (
    discount_items_df['total_price'] - discount_items_df['total_discount']
).astype('float').round(2)

query = expected_final_price != discount_items_df['final_price']
incorrect_items_df = discount_items_df.loc[query]
incorrect_items_df

In [None]:
# Correct incorrect prices
# For each flagged row the user may re-enter total_price, total_discount and
# final_price (empty input keeps the current value). The row is asked again
# until total_price - total_discount, rounded to 2 decimals, equals final_price.
props = ['total_price', 'total_discount', 'final_price']

corrected_incorrect_items_df = incorrect_items_df.copy()

for i in incorrect_items_df.index:
    is_correct = False

    while not is_correct:
        item = incorrect_items_df.loc[i]
        print(f'\n{item[["name", *props]]}')

        values = {}
        for prop in props:
            print(f'\nProperty: {prop}, value: {item[prop]}')

            # Set new value or skip
            value = input('Enter new value or press enter to skip: ')
            if value == '':
                value = item[prop]
            else:
                value = process_content.string_to_float(value)
            values[prop] = value

        is_correct = round(values['total_price'] - values['total_discount'], 2) == values['final_price']
        if not is_correct:
            print('\nPrices were not set correctly. Try again... ')

    # Set correct value in new df
    corrected_incorrect_items_df.loc[i, props] = values

In [None]:
corrected_incorrect_items_df

In [None]:
# Set corrected df - update the reference df
items_df.update(corrected_incorrect_items_df[props])
items_df

In [None]:
items_df.dtypes

In [None]:
# Set correct dtypes
items_df[items_df.columns[1:]] = items_df[items_df.columns[1:]].astype('float')
items_df.dtypes

In [None]:
# Re-run the discount check: rows that have a discount and whose
# total_price - total_discount (rounded to 2 decimals) differs from final_price
has_discount = ~items_df['total_discount'].isna()
expected_final_price = round(items_df['total_price'] - items_df['total_discount'], 2)
query = expected_final_price != items_df['final_price']

incorrect_items_df = items_df.loc[query & has_discount]
incorrect_items_df

In [None]:
# Select rows where qty * unit_price (rounded to 2 decimals) differs from total_price
props = ('qty', 'unit_price', 'total_price')

expected_total_price = round(items_df['qty'] * items_df['unit_price'], 2)
query = expected_total_price != items_df['total_price']

incorrect_items_df = items_df.loc[query]
incorrect_items_df

In [None]:
# Correct incorrect prices
# For each flagged row the user may re-enter qty, unit_price and total_price
# (empty input keeps the current value). The row is asked again until
# qty * unit_price equals total_price and, for rows with a discount, until
# total_price - total_discount still equals the stored final_price.
corrected_incorrect_items_df = incorrect_items_df.copy()

for i in incorrect_items_df.index:
    while True:
        values = {}

        item = incorrect_items_df.loc[i]
        print(f'\n{item[["name", *props]]}')

        is_discount = not pd.isna(item['total_discount'])

        for prop in props:
            print(f'\nProperty: "{prop}", value: {item[prop]}')

            # Set new value or skip
            value = input('Enter new value or press enter to skip: ')

            value = process_content.string_to_float(value) if value != '' else item[prop]
            values[prop] = value

        is_correct = round(values['qty'] * values['unit_price'], 2) == values['total_price']
        if not is_correct:
            print('Prices were not set correctly. Try again... ')
            continue

        if is_discount:
            # Check if total_price-total_discount=final_price
            is_correct_2 = round(values['total_price'] - item['total_discount'], 2) == item['final_price']
            if not is_correct_2:
                # Actually ask again, as the message says; previously the loop
                # was left here and the inconsistent values were stored.
                print('Prices including discount were not set correctly. Try again... ')
                continue
        else:
            # Set value also for final_price
            values['final_price'] = values['total_price']

        break

    # Set correct value in new df
    corrected_incorrect_items_df.loc[i, list(values.keys())] = values

In [None]:
# Set corrected df - update the reference df
items_df.update(corrected_incorrect_items_df)
items_df

In [None]:
# Final check against `total_sum`, i.e. the value the user confirmed or
# corrected in the prompt cell, rather than the unverified extracted value.
round(items_df['final_price'].sum(), 2) == total_sum

In [None]:
r = items_df.to_json(orient='table', index=False)
parsed = json.loads(r)
print(json.dumps(parsed, indent=4))

In [None]:
parsed['data']

In [None]:
extracted_content['items']