## Import modules

In [2]:
import json
import os
import re

import numpy as np
import pandas as pd

In [3]:
# Move to the project root so that `scripts` can be imported and the relative
# paths used below resolve. The guard makes the cell safe to re-run: without it
# every execution climbs one directory higher.
if not os.path.isdir('scripts'):
    os.chdir('./..')
os.getcwd()

'C:\\Users\\Michal\\Documents\\GitHub\\OCR-Receipts'

In [4]:
from scripts.content_detector import process_image
from scripts.content_detector import process_content

## Load raw content

In [None]:
# Raw text recognised for one receipt (path is relative to the working directory)
content_filepath = 'results/Paragon_2022-08-11_081131_300dpi/raw_content.txt'
# Reference text that the raw content is compared against further below
ref_content_filepath = 'notebooks/Paragon_text.txt'

In [None]:
# Load the recognised text and the reference text; both files are read as UTF-8
with open(content_filepath, encoding='utf-8') as raw_file:
    raw_content = raw_file.read()

with open(ref_content_filepath, encoding='utf-8') as ref_file:
    ref_content = ref_file.read()

In [None]:
# Compare with reference: print reference and raw lines side by side.
# zip stops at the shorter text, so surplus lines of the longer one are not shown.
ref_lines = ref_content.split('\n')
raw_lines = raw_content.split('\n')
for line_ref, line_raw in zip(ref_lines, raw_lines):
    print(f'{line_ref:<40} | {line_raw:<40}')

In [None]:
# Get the main body of the receipt for further processing.
# The untouched texts are kept under *_all; the body is whatever follows the
# first 'PARAGON FISKALNY' header line (index 1 of the split).
# NOTE: re-running this cell splits the already-shortened text again.
raw_content_all = raw_content
ref_content_all = ref_content
raw_body = raw_content_all.split('PARAGON FISKALNY\n')[1]
ref_body = ref_content_all.split('PARAGON FISKALNY\n')[1]
raw_content, ref_content = raw_body, ref_body

## Process content

### Get shop name

In [None]:
# The shop name is looked up in the complete text (including everything before the header line)
shopname = process_content.get_shop_name(raw_content_all)
shopname

### Get total sum

In [None]:
# The total is looked up in the receipt body only (the text after the header line)
total_sum = process_content.get_total_sum(raw_content)
total_sum

### Replace wrong characters

Often `1` is incorrectly recognized as `(` and `{`.

In [None]:
# Start from the raw receipt body and turn every '(' and '{' into the digit 1
content = raw_content
for misread_char in ('(', '{'):
    content = content.replace(misread_char, '1')

Often `x` is incorrectly recognized as `«` and `¥`.

In [None]:
# Turn every '«' and '¥' into the letter x
for misread_char in ('«', '¥'):
    content = content.replace(misread_char, 'x')

Often `-` is incorrectly recognized as `~`.

In [None]:
# Replace every '~' with '-'
content = content.replace('~', '-')

Often `P` is incorrectly recognized as `?`.

In [None]:
# Replace every '?' with 'P' (this affects all question marks in the text)
content = content.replace('?', 'P')

In [None]:
# Show reference, raw and cleaned lines next to each other
# (zip stops at the shortest of the three texts)
reference_lines = ref_content.split('\n')
ocr_lines = raw_content.split('\n')
cleaned_lines = content.split('\n')
for line_ref, line_raw, line in zip(reference_lines, ocr_lines, cleaned_lines):
    print(f'{line_ref:<40}|{line_raw:<40}|{line:<40}')

In [None]:
# Write pre-processed content to file.
# The encoding is set explicitly to match how the inputs were read; without it
# the platform default is used, which may fail on or mangle non-ASCII characters.
with open('content.txt', 'w', encoding='utf-8') as f:
    f.write(content)

### Get products

In [None]:
# Split content string by new lines
# (the result is a list with one string per receipt line, empty strings included)
content_split = content.split('\n')

In [None]:
# Remove empty lines (only exact empty strings; whitespace-only lines are kept)
content_split = [text_line for text_line in content_split if text_line != '']

In [None]:
# Consider discount in split content.
# Every entry of split_content_new is a list that starts with one receipt line.
# A line containing 'OPUST' and the line right after it are appended to the
# entry of the preceding line.
split_content_new = []

i = 0
while i < len(content_split):
    line = content_split[i]

    if 'OPUST' in line and split_content_new:
        # Slicing (instead of indexing i+1) avoids an IndexError when the
        # 'OPUST' line is the last one; the entry is then one element shorter.
        split_content_new[-1] += content_split[i:i + 2]
        i += 2

    else:
        # Also reached by an 'OPUST' line with no preceding entry to attach to
        split_content_new.append([line])
        i += 1

In [None]:
# Inspect the grouped lines
split_content_new

In [None]:
# Define regular expression patterns.
# Each pattern is written as adjacent raw-string pieces, which Python joins into
# a single pattern string.

# Product line
product_regex = re.compile(
    r'(.+)'                   # group 1: product name
    r'\s+\S{1,2}\s+'          # one or two non-space characters surrounded by whitespace
    r'(\d+(,\d+)?)'           # group 2: quantity; group 3: optional ",digits" part
    r' x?'                    # a space and an optional "x"
    r'([t\d]+,\s?\d{2})'      # group 4: unit price (digits or "t", comma, two digits)
    r' '
    r'(\d+[,.]\s?\d{0,2})'    # group 5: total price
)

# Discount line: "OPUST", optional minus sign, group 1: the amount
discount_regex = re.compile(
    r'OPUST [-]?'
    r'(\d+[,.]?\s?\d{0,2})'
)

# Price: digits, comma or dot, optional space, up to two digits
total_product_price_regex = re.compile(
    r'\d+[,.]'
    r'\s?\d{0,2}'
)

In [None]:
# Try the product pattern on the first line of the first grouped entry
# (the result is None when the line does not match)
product_result = product_regex.match(split_content_new[0][0])
product_result

In [None]:
# All captured groups of the match above
product_result.groups()

In [None]:
# Group 1 is the leading "(.+)" part of the product pattern
product_result.group(1)

In [None]:
# Try the discount pattern on a sample where a space stands between the digits
# (presumably a decimal separator that was not recognised - confirm)
discount_result = discount_regex.search(' OPUST 1 82')
discount_result

In [None]:
# Group 1 is the amount that follows "OPUST" and the optional minus sign
discount_result.group(1)

In [None]:
# Try the price pattern on a sample with three digits after the comma;
# the pattern accepts at most two of them
total_product_price_result = total_product_price_regex.match('9,580')
total_product_price_result

In [None]:
# The matched text
total_product_price_result.group()

In [None]:
# Get dictionary with products.
# For every grouped entry whose first line matches product_regex, build a dict
# with name, qty, unit_price, total_price, total_discount and
# total_price_with_discount. Numeric fields are converted to float; if a value
# cannot be converted, the user is asked to type the correct one.

# Decimal comma, optionally surrounded by single spaces - compiled once, outside the loop
qty_price_regex = re.compile(r'\s?,\s?')

products = []
for item in split_content_new:
    result = product_regex.match(item[0])

    # Entries whose first line is not a product line are skipped
    if not result:
        continue

    product = {
        'name': result.group(1),
        'qty': qty_price_regex.sub('.', result.group(2)),
        'unit_price': qty_price_regex.sub('.', result.group(4)),
        'total_price': qty_price_regex.sub('.', result.group(5))
    }

    if len(item) > 1:
        # Get discount. If the pattern does not match, keep the raw text so the
        # numeric conversion below fails and asks for the correct value instead
        # of crashing on a None match.
        discount_match = discount_regex.search(item[1])
        discount = discount_match.group(1) if discount_match else item[1].strip()
        discount = qty_price_regex.sub('.', discount)

        # Get total price with discount (the line may be missing or unparseable)
        final_line = item[2] if len(item) > 2 else ''
        final_match = total_product_price_regex.match(final_line)
        final_price = final_match.group() if final_match else final_line.strip()
        final_price = qty_price_regex.sub('.', final_price)

    else:
        discount = None
        final_price = product['total_price']

    product['total_discount'] = discount
    product['total_price_with_discount'] = final_price

    # Convert to numeric (every key except the first one, 'name')
    for key in list(product.keys())[1:]:
        # No discount - nothing to convert
        if product[key] is None:
            continue

        try:
            product[key] = float(product[key])
        except ValueError as e:
            print(f'\nError occurred for item: {item}')
            print(e)

            # Set correct value
            while True:
                try:
                    value = float(input('Enter correct value: '))
                except ValueError:
                    print('Wrong value! Enter correct number!')
                else:
                    product[key] = value
                    break

    # Add product to list
    products.append(product)

In [None]:
# Inspect the parsed products
products

In [None]:
# One row per product, one column per dictionary key
products_df = pd.DataFrame(products)
products_df

In [None]:
# Sum of the last column, i.e. the key added last to each product dict
# ('total_price_with_discount')
products_df.iloc[:, -1].sum()

In [None]:
# Print one line per product: name, quantity, unit price and total price in aligned columns
for item in products:
    print(f"{item['name']:<25} {item['qty']:<5} x{item['unit_price']:<10} {item['total_price']}")

In [None]:
# Number of parsed products
len(products)

### Assemble contents

In [None]:
# Collect everything extracted so far in a single dictionary
extracted_content = dict(
    shop_name=shopname,
    items=products,
    total_sum=total_sum,
)

In [None]:
# Save the assembled dictionary as indented JSON in the working directory
with open('content.json', 'w') as json_file:
    json.dump(obj=extracted_content, fp=json_file, indent=4)

## Process extracted content

In [5]:
# Load previously extracted content. From here on `extracted_content` refers to
# this file, not to the dictionary assembled above.
extracted_content_filepath = 'results/Paragon_2022-08-11_081131_300dpi/extracted_content.json'
# JSON text is UTF-8; opening without an explicit encoding falls back to the
# platform default, which can mis-decode non-ASCII characters.
with open(extracted_content_filepath, encoding='utf-8') as f:
    extracted_content = json.load(f)

In [6]:
# Inspect the loaded content
extracted_content

{'content_filepath': 'C:\\Users\\Michal\\Documents\\GitHub\\OCR-Receipts\\results\\Paragon_2022-08-11_081131_300dpi\\raw_content.txt',
 'shop_name': 'Biedronka',
 'items': [{'name': 'PassataSot tGus/08q',
   'qty': 1.0,
   'unit_price': 3.99,
   'total_price': 3.99,
   'total_discount': None,
   'final_price': 3.99},
  {'name': 'Sos Madera 250nt',
   'qty': 1.0,
   'unit_price': 3.49,
   'total_price': 3.49,
   'total_discount': None,
   'final_price': 3.49},
  {'name': 'PtNabtyszczFinishd00',
   'qty': 1.0,
   'unit_price': 13.99,
   'total_price': 13.99,
   'total_discount': None,
   'final_price': 13.99},
  {'name': 'Sol Kraft 2kg',
   'qty': 1.0,
   'unit_price': 5.95,
   'total_price': 5.95,
   'total_discount': None,
   'final_price': 5.95},
  {'name': 'SprayProntoZ50al',
   'qty': 11.0,
   'unit_price': 8.99,
   'total_price': 8.99,
   'total_discount': None,
   'final_price': 8.99},
  {'name': 'PiwoHahoud , 331',
   'qty': 2.0,
   'unit_price': 4.99,
   'total_price': 9.58,
   

In [7]:
# One row per receipt item
items_df = pd.DataFrame(extracted_content['items'])
items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
0,PassataSot tGus/08q,1.0,3.99,3.99,,3.99
1,Sos Madera 250nt,1.0,3.49,3.49,,3.49
2,PtNabtyszczFinishd00,1.0,13.99,13.99,,13.99
3,Sol Kraft 2kg,1.0,5.95,5.95,,5.95
4,SprayProntoZ50al,11.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.58,,9.58
6,Napoj Cola Zero 11,2.0,4.99,9.98,2.4,7.58
7,BrefBriltSpring2x42g,1.0,10.99,10.99,,10.99
8,Gru Kon Luz,1.04,6.95,7.23,,7.23
9,"Huszyna-Skarb 1,51",6.0,False,9.54,,9.54


### Check incorrectly extracted properties

In [8]:
# Check if there are items with incorrect properties.
# A property that could not be extracted is stored as the boolean False.
# Comparing with `== False` would also flag a legitimate 0 / 0.0 (0.0 == False
# is True), so only real booleans are tested.
is_failed_flag = items_df.apply(
    lambda col: col.map(lambda v: isinstance(v, (bool, np.bool_)) and not v)
)
incorrect_items_df = items_df.loc[is_failed_flag.any(axis=1)]
incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
9,"Huszyna-Skarb 1,51",6.0,False,9.54,,9.54
27,Pier z seren 400g,2.0,5.7,11.4,False,9.58


In [10]:
# Correct incorrect properties.
# For every flagged item, ask for the correct value of each property that holds
# the boolean False and store it in a copy of the flagged rows.
props = items_df.columns[1:]

corrected_incorrect_items_df = incorrect_items_df.copy()

for i in incorrect_items_df.index:
    item = incorrect_items_df.loc[i]
    print(f'\n{item}')

    # Only real booleans count: `item == False` would also select a value of 0 / 0.0
    is_failed_flag = item.map(lambda v: isinstance(v, (bool, np.bool_)) and not v)
    incorrect_properties = item.loc[is_failed_flag]

    for prop_name in incorrect_properties.index:
        print(f'\nIncorrect value for property "{prop_name}"')

        # Get correct value
        value = process_content.string_to_float(input('Enter correct value: '))

        # Set correct value in new df
        corrected_incorrect_items_df.loc[i, prop_name] = value


name              Huszyna-Skarb 1,51
qty                              6.0
unit_price                     False
total_price                     9.54
total_discount                  None
final_price                     9.54
Name: 9, dtype: object

Incorrect value for property "unit_price"
Enter correct value: 1,59

name              Pier z seren 400g
qty                             2.0
unit_price                      5.7
total_price                    11.4
total_discount                False
final_price                    9.58
Name: 27, dtype: object

Incorrect value for property "total_discount"
Enter correct value: 1,82


In [11]:
# Inspect the corrected rows
corrected_incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
9,"Huszyna-Skarb 1,51",6.0,1.59,9.54,,9.54
27,Pier z seren 400g,2.0,5.7,11.4,1.82,9.58


In [15]:
# Set corrected df - update the reference df
# (DataFrame.update modifies items_df in place, matching rows and columns by label)
items_df.update(corrected_incorrect_items_df[props])
items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
0,PassataSot tGus/08q,1.0,3.99,3.99,,3.99
1,Sos Madera 250nt,1.0,3.49,3.49,,3.49
2,PtNabtyszczFinishd00,1.0,13.99,13.99,,13.99
3,Sol Kraft 2kg,1.0,5.95,5.95,,5.95
4,SprayProntoZ50al,11.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.58,,9.58
6,Napoj Cola Zero 11,2.0,4.99,9.98,2.4,7.58
7,BrefBriltSpring2x42g,1.0,10.99,10.99,,10.99
8,Gru Kon Luz,1.04,6.95,7.23,,7.23
9,"Huszyna-Skarb 1,51",6.0,1.59,9.54,,9.54


In [16]:
# Check if incorrect properties still exist.
# Only the boolean False marks a failed property; `== False` would also match
# a legitimate 0 / 0.0.
is_failed_flag = items_df.apply(
    lambda col: col.map(lambda v: isinstance(v, (bool, np.bool_)) and not v)
)
items_df.loc[is_failed_flag.any(axis=1)]

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price


### Check prices

In [18]:
# Check if calculated total equals extracted total.
# The difference is rounded to 2 decimals before the comparison: a sum of floats
# is rarely exactly equal to the expected amount even when all prices are right.
diff = round(abs(items_df['final_price'].sum() - extracted_content['total_sum']), 2)
if diff == 0:
    print('Calculated and extracted total sum are equal')
else:
    print('Calculated and extracted total sum are not equal')
    print(f'The difference is {diff:.2f}')

Calculated and extracted total sum are not equal
The difference is 0.28


In [19]:
# Check if total_price-total_discount=final_price
discount_items_df = items_df.loc[~items_df['total_discount'].isna()]

# Compare rounded to 2 decimals so that float representation noise in the
# subtraction does not flag correct rows. The columns are cast first because
# some of them can still have object dtype at this point.
expected_final_price = (
    discount_items_df['total_price'].astype('float')
    - discount_items_df['total_discount'].astype('float')
).round(2)
query = expected_final_price != discount_items_df['final_price'].astype('float').round(2)
incorrect_items_df = discount_items_df.loc[query]
incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
14,Winogrono jas luz,0.496,13.29,6.59,3.42,3.47
17,PAPR CZER PL LUZ,0.784,12.99,10.18,5.4,4.7
47,LodyMarlettodix120,4.0,2.56,2.56,4.56,1.6


In [31]:
# Correct incorrect prices.
# For each flagged row, keep asking for the three price properties until
# total_price - total_discount (rounded to 2 decimals) equals final_price.
# An empty answer keeps the current value.
props = ['total_price', 'total_discount', 'final_price']

corrected_incorrect_items_df = incorrect_items_df.copy()

for row_label in incorrect_items_df.index:
    row = incorrect_items_df.loc[row_label]

    prices_ok = False
    while not prices_ok:
        print(f'\n{row[["name", *props]]}')

        values = {}
        for prop in props:
            print(f'\nProperty: {prop}, value: {row[prop]}')

            # Set new value or skip
            answer = input('Enter new value or press enter to skip: ')
            if answer == '':
                values[prop] = row[prop]
            else:
                values[prop] = process_content.string_to_float(answer)

        prices_ok = round(values['total_price'] - values['total_discount'], 2) == values['final_price']
        if not prices_ok:
            print('\nPrices were not set correctly. Try again... ')

    # Set correct value in new df
    corrected_incorrect_items_df.loc[row_label, props] = values


name              Winogrono jas luz
total_price                    6.59
total_discount                 3.42
final_price                    3.47
Name: 14, dtype: object

Property: total_price, value: 6.59
Enter new value or press enter to skip: 

Property: total_discount, value: 3.42
Enter new value or press enter to skip: 3,12

Property: final_price, value: 3.47
Enter new value or press enter to skip: 

name              PAPR CZER PL LUZ
total_price                  10.18
total_discount                 5.4
final_price                    4.7
Name: 17, dtype: object

Property: total_price, value: 10.18
Enter new value or press enter to skip: 

Property: total_discount, value: 5.4
Enter new value or press enter to skip: 5,48

Property: final_price, value: 4.7
Enter new value or press enter to skip: 

name              LodyMarlettodix120
total_price                     2.56
total_discount                  4.56
final_price                      1.6
Name: 47, dtype: object

Property: total_p

In [32]:
# Inspect the corrected rows
corrected_incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
14,Winogrono jas luz,0.496,13.29,6.59,3.12,3.47
17,PAPR CZER PL LUZ,0.784,12.99,10.18,5.48,4.7
47,LodyMarlettodix120,4.0,2.56,2.56,1.56,1.0


In [33]:
# Set corrected df - update the reference df
# (only the columns listed in props are copied; items_df is modified in place)
items_df.update(corrected_incorrect_items_df[props])
items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
0,PassataSot tGus/08q,1.0,3.99,3.99,,3.99
1,Sos Madera 250nt,1.0,3.49,3.49,,3.49
2,PtNabtyszczFinishd00,1.0,13.99,13.99,,13.99
3,Sol Kraft 2kg,1.0,5.95,5.95,,5.95
4,SprayProntoZ50al,11.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.58,,9.58
6,Napoj Cola Zero 11,2.0,4.99,9.98,2.4,7.58
7,BrefBriltSpring2x42g,1.0,10.99,10.99,,10.99
8,Gru Kon Luz,1.04,6.95,7.23,,7.23
9,"Huszyna-Skarb 1,51",6.0,1.59,9.54,,9.54


In [34]:
# Inspect column dtypes before converting them
items_df.dtypes

name               object
qty               float64
unit_price         object
total_price       float64
total_discount     object
final_price       float64
dtype: object

In [35]:
# Set correct dtypes: every column except the first one ('name') becomes float
numeric_columns = items_df.columns[1:]
items_df[numeric_columns] = items_df[numeric_columns].astype('float')
items_df.dtypes

name               object
qty               float64
unit_price        float64
total_price       float64
total_discount    float64
final_price       float64
dtype: object

In [38]:
# Check if incorrect properties still exist:
# rows with a discount where total_price - total_discount (2 decimals) differs from final_price
price_after_discount = (items_df['total_price'] - items_df['total_discount']).round(2)
query = price_after_discount != items_df['final_price']

has_discount = items_df['total_discount'].notna()
incorrect_items_df = items_df.loc[query & has_discount]
incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price


In [50]:
# Check if qty*unit_price=total_price (product rounded to 2 decimals)
props = ('qty', 'unit_price', 'total_price')

expected_total_price = (items_df['qty'] * items_df['unit_price']).round(2)
query = expected_total_price != items_df['total_price']

incorrect_items_df = items_df.loc[query]
incorrect_items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
4,SprayProntoZ50al,11.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.58,,9.58
15,Banan luz,9.762,5.69,4.34,,4.34
18,Wafle Mix 400g,4.0,1.89,1.89,,1.89
20,Jog Fruvitads 408g,4.0,2.28,11.4,,11.4
23,TAPAS HISZP 120g,1.0,45.99,5.99,,5.99
24,PizzaProshB43aq,4.0,314.99,14.99,,14.99
25,Pizzalberyjska600q,4.0,13.49,13.49,,13.49
26,Recznik Milla X2,1.0,9.99,3.99,,3.99
34,WarNaPatHor Mix45aq,11.0,5.23,5.23,1.24,3.99


In [61]:
# Correct incorrect prices.
# For each flagged row, keep asking for qty, unit_price and total_price until
#   - qty * unit_price (rounded to 2 decimals) equals total_price, and
#   - for rows with a discount, total_price - total_discount equals final_price.
# An empty answer keeps the current value. For rows without a discount,
# final_price is set to the accepted total_price.

# Defined here as well so the cell does not rely on `props` left over from an earlier cell
props = ('qty', 'unit_price', 'total_price')

corrected_incorrect_items_df = incorrect_items_df.copy()

for i in incorrect_items_df.index:
    item = incorrect_items_df.loc[i]
    is_discount = not pd.isna(item['total_discount'])

    while True:
        values = {}

        print(f'\n{item[["name", *props]]}')

        for prop in props:
            print(f'\nProperty: "{prop}", value: {item[prop]}')

            # Set new value or skip
            value = input('Enter new value or press enter to skip: ')

            value = process_content.string_to_float(value) if value != '' else item[prop]
            values[prop] = value

        is_correct = round(values['qty'] * values['unit_price'], 2) == values['total_price']
        if not is_correct:
            print('Prices were not set correctly. Try again... ')
            continue

        if is_discount:
            # Check if total_price-total_discount=final_price
            is_correct_2 = round(values['total_price'] - item['total_discount'], 2) == item['final_price']
            if not is_correct_2:
                # Ask again, as the message says, instead of accepting the inconsistent values
                print('Prices including discount were not set correctly. Try again... ')
                continue
        else:
            # Set value also for final_price
            values['final_price'] = values['total_price']

        break

    # Set correct value in new df
    corrected_incorrect_items_df.loc[i, list(values.keys())] = values


name           SprayProntoZ50al
qty                        11.0
unit_price                 8.99
total_price                8.99
Name: 4, dtype: object

Property: "qty", value: 11.0
Enter new value or press enter to skip: 1

Property: "unit_price", value: 8.99
Enter new value or press enter to skip: 

Property: "total_price", value: 8.99
Enter new value or press enter to skip: 

name           PiwoHahoud , 331
qty                         2.0
unit_price                 4.99
total_price                9.58
Name: 5, dtype: object

Property: "qty", value: 2.0
Enter new value or press enter to skip: 

Property: "unit_price", value: 4.99
Enter new value or press enter to skip: 

Property: "total_price", value: 9.58
Enter new value or press enter to skip: 
Prices were not set correctly. Try again... 

name           PiwoHahoud , 331
qty                         2.0
unit_price                 4.99
total_price                9.58
Name: 5, dtype: object

Property: "qty", value: 2.0
Enter new valu

In [63]:
# Set corrected df - update the reference df
# (all columns of the corrected rows are copied; items_df is modified in place)
items_df.update(corrected_incorrect_items_df)
items_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,final_price
0,PassataSot tGus/08q,1.0,3.99,3.99,,3.99
1,Sos Madera 250nt,1.0,3.49,3.49,,3.49
2,PtNabtyszczFinishd00,1.0,13.99,13.99,,13.99
3,Sol Kraft 2kg,1.0,5.95,5.95,,5.95
4,SprayProntoZ50al,1.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.98,,9.98
6,Napoj Cola Zero 11,2.0,4.99,9.98,2.4,7.58
7,BrefBriltSpring2x42g,1.0,10.99,10.99,,10.99
8,Gru Kon Luz,1.04,6.95,7.23,,7.23
9,"Huszyna-Skarb 1,51",6.0,1.59,9.54,,9.54


In [67]:
# Final check: the sum of final prices, rounded to 2 decimals, must equal the extracted total
round(items_df['final_price'].sum(), 2) == extracted_content['total_sum']

True