## Import modules

In [161]:
import json
import os
import re

import numpy as np
import pandas as pd

In [2]:
os.chdir('./..')
os.getcwd()

'C:\\Users\\Michal\\Documents\\GitHub\\OCR-Receipts'

In [182]:
from scripts.content_detector import process_image
from scripts.content_detector import process_content

## Load raw content

In [4]:
content_filepath = 'results/Paragon_2022-08-11_081131_300dpi/raw_content.txt'
ref_content_filepath = 'notebooks/Paragon_text.txt'

In [5]:
with open(content_filepath, encoding='utf-8') as f:
    raw_content = f.read()

with open(ref_content_filepath, encoding='utf-8') as f:
    ref_content = f.read()

In [6]:
# Compare with reference
for line_ref, line_raw in zip(ref_content.split('\n'), raw_content.split('\n')):
    print(f'{line_ref:<40} | {line_raw:<40}')

BIEDRONKA “CODZIENNIE NISKIE CENY” 4884  | BIEDRONKA “CODZIENNIE NESKIE CENY” 4884 
02-690 WARSZAWA UL. BOKSERSKA 61         | 82-690 WARSZAWA UL. BOKSERSKA 64        
JERONIMO MARTINS POLSKA S.A.             | JERONIMO MARTINS POLSKA 5.A.            
62-025 KOSTRZYN UL.ŻNIWNA 5              | 62-025 KOSTRZYN UL. ZNIMNA 5            
                                         |                                         
NIP 7791011327 nr:185679                 | NIP 7794611327 nr 185679                
PARAGON FISKALNY                         | PARAGON FISKALNY                        
PassataSottGus700g D 1 x3,99 3,99D       | PassataSot tGus/08q b { «3,99 3,990     
Sos Madero 250ml B 1 x3,49 3,49B         | Sos Madera 250nt 5 1 «3,49 3,498        
PłNabłyszczFinish400 A 1 x13,99 13,99A   | PtNabtyszczFinishd00 A { x13,99 13, 99A 
Sól Kraft 2kg A 1 x5,95 5,95A            | Sol Kraft 2kg A { ¥5,95 5,954           
SprayPronto 250ml A 1 x8,99 8,99A        | SprayProntoZ50al A {1 ¥8,99 8,99A

In [7]:
# Get the main body of the receipt for further processing
raw_content_all, ref_content_all = raw_content, ref_content
raw_content, ref_content = raw_content.split('PARAGON FISKALNY\n')[1], ref_content.split('PARAGON FISKALNY\n')[1]

## Process content

### Get shop name

In [180]:
shopname = process_content.get_shop_name(raw_content_all)
shopname

'Biedronka'

### Get total sum

In [183]:
total_sum = process_content.get_total_sum(raw_content)
total_sum

'274,28'

### Replace wrong characters

Often `1` is incorrectly recognized as `(` and `{`.

In [8]:
content = raw_content.replace('(', '1')
content = content.replace('{', '1')

Often `x` is incorrectly recognized as `«` and `¥`.

In [9]:
content = content.replace('«', 'x')
content = content.replace('¥', 'x')

Often `~` is incorrectly recognized as `-`.

In [10]:
content = content.replace('~', '-')

Often `P` is incorrectly recognized as `?`.

In [11]:
content = content.replace('?', 'P')

In [12]:
for line_ref, line_raw, line in zip(ref_content.split('\n'), raw_content.split('\n'), content.split('\n')):
    print(f'{line_ref:<40}|{line_raw:<40}|{line:<40}')

PassataSottGus700g D 1 x3,99 3,99D      |PassataSot tGus/08q b { «3,99 3,990     |PassataSot tGus/08q b 1 x3,99 3,990     
Sos Madero 250ml B 1 x3,49 3,49B        |Sos Madera 250nt 5 1 «3,49 3,498        |Sos Madera 250nt 5 1 x3,49 3,498        
PłNabłyszczFinish400 A 1 x13,99 13,99A  |PtNabtyszczFinishd00 A { x13,99 13, 99A |PtNabtyszczFinishd00 A 1 x13,99 13, 99A 
Sól Kraft 2kg A 1 x5,95 5,95A           |Sol Kraft 2kg A { ¥5,95 5,954           |Sol Kraft 2kg A 1 x5,95 5,954           
SprayPronto 250ml A 1 x8,99 8,99A       |SprayProntoZ50al A {1 ¥8,99 8,99A       |SprayProntoZ50al A 11 x8,99 8,99A       
PiwoMahou0,33l A 2 x4,99 9,98A          |PiwoHahoud , 331 R 2 x4,99 9,58A        |PiwoHahoud , 331 R 2 x4,99 9,58A        
Napój Cola Zero 1l A 2 x4,99 9,98A      |Napoj Cola Zero (1 A 2 x4,99 9,98A      |Napoj Cola Zero 11 A 2 x4,99 9,98A      
OPUST -2,40                             |OPUST -2,40                             |OPUST -2,40                             
7,58A           

In [13]:
# Write pre-processed content to file
with open('content.txt', 'w') as f:
    f.write(content)

### Get products

In [24]:
# Split content string by new lines
content_split = content.split('\n')

In [73]:
# Remove empty lines
content_split = list(filter(lambda x: x != '', content_split))

In [74]:
# Consider discount in split content
split_content_new = []

i = 0
while i < len(content_split):
    line = content_split[i]
    
    if 'OPUST' in line:
        split_content_new[-1] += [line, content_split[i+1]]
        i += 2
            
    else:
        split_content_new.append([line])
        i += 1

In [75]:
split_content_new

[['PassataSot tGus/08q b 1 x3,99 3,990'],
 ['Sos Madera 250nt 5 1 x3,49 3,498'],
 ['PtNabtyszczFinishd00 A 1 x13,99 13, 99A'],
 ['Sol Kraft 2kg A 1 x5,95 5,954'],
 ['SprayProntoZ50al A 11 x8,99 8,99A'],
 ['PiwoHahoud , 331 R 2 x4,99 9,58A'],
 ['Napoj Cola Zero 11 A 2 x4,99 9,98A', 'OPUST -2,40', '7,584'],
 ['BrefBriltSpring2x42g A 1 x10,99 10,994'],
 ['Gru Kon Luz B 1,040 x6,95 7,230'],
 ['Huszyna-Skarb 1,51 A 6 xt,59 9,548'],
 ['Hoda Nat LGaz 1,51 A 6 x1,89 11,34A'],
 ['Cebula tkg apak 0 1 3,49 3,490'],
 ['Brokut 500g szt 0 1 x4,49 4,490'],
 ['CZ0SNEK szt p 2 x1,79 3,580'],
 ['Winogrono jas luz D 0,496 x13,29 6,590', 'OPUST 3,42', '3,470'],
 ['Banan luz p> 9,762 x5,69 4,340'],
 ['Butkaseznsienlno/g D 6 x0,79 4,740'],
 ['PAPR CZER PL LUZ 0 0,784 x12,99 10,180', 'OPUST -5,4', '4,700'],
 ['Wafle Mix 400g 0 4 1,89 1,890'],
 ['Wafle Hix 100g B 1 1,89 1,890'],
 ['Jog Fruvitads 408g D 4 x2,28 11,400'],
 ['pap, pon, fasbhunz2y D 1 x4,94 4,940'],
 ['Ser Relikate 150g 0 1 x3,76 3, 760'],
 ['TAP

In [246]:
# Define regular expression patterns
product_regex = re.compile(r'(.+)\s+\S{1,2}\s+(\d+(,\d+)?) x?([t\d]+,\s?\d{2}) (\d+[,.]\s?\d{0,2})')
discount_regex = re.compile(r'OPUST [-]?(\d+[,.]?\s?\d{0,2})')
total_product_price_regex = re.compile(r'\d+[,.]\s?\d{0,2}')

In [247]:
product_result = product_regex.match(split_content_new[0][0])
product_result

<re.Match object; span=(0, 34), match='PassataSot tGus/08q b 1 x3,99 3,99'>

In [235]:
product_result.groups()

('PassataSot tGus/08q', '1', None, '3,99', '3,99')

In [236]:
product_result.group(1)

'PassataSot tGus/08q'

In [237]:
discount_result = discount_regex.search(' OPUST 1 82')
discount_result

<re.Match object; span=(1, 11), match='OPUST 1 82'>

In [238]:
discount_result.group(1)

'1 82'

In [239]:
total_product_price_result = total_product_price_regex.match('9,580')
total_product_price_result

<re.Match object; span=(0, 4), match='9,58'>

In [211]:
total_product_price_result.group()

'9,58'

In [251]:
# Get dictionary with products
products = []
for item in split_content_new:
    result = product_regex.match(item[0])
    
    if result:
        qty_price_regex = re.compile(r'\s?,\s?')
        
        product = {
            'name': result.group(1),
            'qty': qty_price_regex.sub('.', result.group(2)),
            'unit_price': qty_price_regex.sub('.', result.group(4)),
            'total_price': qty_price_regex.sub('.', result.group(5))
        }
        
        if len(item) > 1:
            # Get discount
            discount = discount_regex.search(item[1]).group(1)
            discount = qty_price_regex.sub('.', discount)
            
            # Get total price with discount
            final_price = total_product_price_regex.match(item[2]).group()
            final_price = qty_price_regex.sub('.', final_price)
            
        else:
            discount = None
            final_price = product['total_price']
        
        product['total_discount'] = discount
        product['total_price_with_discount'] = final_price
        
        # Convert to numeric
        for key in list(product.keys())[1:]:
            try:
                product[key] = float(product[key])
            except ValueError as e:
                print(f'\nError occurred for item: {item}')
                print(e)
            
                # Set correct value
                while True:
                    try:
                        value = float(input('Enter correct value: '))
                    except ValueError:
                        print('Wrong value! Enter correct number!')
                    else:
                        product[key] = value
                        break
            except TypeError as e:
                if product[key] is None:
                    pass
                else:
                    raise TypeError(e)

        # Add product to list
        products.append(product)


Error occurred for item: ['Huszyna-Skarb 1,51 A 6 xt,59 9,548']
could not convert string to float: 't.59'
Enter correct value: 1

Error occurred for item: ['Pier z seren 400g 0 2 x5,70 11,400', 'OPUST -1 82', '9,580']
could not convert string to float: '1 82'
Enter correct value: 1


In [244]:
products

[{'name': 'PassataSot tGus/08q',
  'qty': 1.0,
  'unit_price': 3.99,
  'total_price': 3.99,
  'total_discount': None,
  'total_price_with_discount': 3.99},
 {'name': 'Sos Madera 250nt',
  'qty': 1.0,
  'unit_price': 3.49,
  'total_price': 3.49,
  'total_discount': None,
  'total_price_with_discount': 3.49},
 {'name': 'PtNabtyszczFinishd00',
  'qty': 1.0,
  'unit_price': 13.99,
  'total_price': 13.99,
  'total_discount': None,
  'total_price_with_discount': 13.99},
 {'name': 'Sol Kraft 2kg',
  'qty': 1.0,
  'unit_price': 5.95,
  'total_price': 5.95,
  'total_discount': None,
  'total_price_with_discount': 5.95},
 {'name': 'SprayProntoZ50al',
  'qty': 11.0,
  'unit_price': 8.99,
  'total_price': 8.99,
  'total_discount': None,
  'total_price_with_discount': 8.99},
 {'name': 'PiwoHahoud , 331',
  'qty': 2.0,
  'unit_price': 4.99,
  'total_price': 9.58,
  'total_discount': None,
  'total_price_with_discount': 9.58},
 {'name': 'Napoj Cola Zero 11',
  'qty': 2.0,
  'unit_price': 4.99,
  'tot

In [249]:
products_df = pd.DataFrame(products)
products_df

Unnamed: 0,name,qty,unit_price,total_price,total_discount,total_price_with_discount
0,PassataSot tGus/08q,1.0,3.99,3.99,,3.99
1,Sos Madera 250nt,1.0,3.49,3.49,,3.49
2,PtNabtyszczFinishd00,1.0,13.99,13.99,,13.99
3,Sol Kraft 2kg,1.0,5.95,5.95,,5.95
4,SprayProntoZ50al,11.0,8.99,8.99,,8.99
5,"PiwoHahoud , 331",2.0,4.99,9.58,,9.58
6,Napoj Cola Zero 11,2.0,4.99,9.98,2.4,7.58
7,BrefBriltSpring2x42g,1.0,10.99,10.99,,10.99
8,Gru Kon Luz,1.04,6.95,7.23,,7.23
9,"Huszyna-Skarb 1,51",6.0,1.0,9.54,,9.54


In [250]:
products_df.iloc[:, -1].sum()

274.56

In [225]:
for item in products:
    print(f"{item['name']:<25} {item['qty']:<5} x{item['unit_price']:<10} {item['total_price']}")

PassataSot tGus/08q       1.0   x3.99       3.99
Sos Madera 250nt          1.0   x3.49       3.49
PtNabtyszczFinishd00      1.0   x13.99      13.99
Sol Kraft 2kg             1.0   x5.95       5.95
SprayProntoZ50al          11.0  x8.99       8.99
PiwoHahoud , 331          2.0   x4.99       9.58
Napoj Cola Zero 11        2.0   x4.99       9.98
BrefBriltSpring2x42g      1.0   x10.99      10.99
Gru Kon Luz               1.04  x6.95       7.23
Huszyna-Skarb 1,51        6.0   x1.59       9.54
Hoda Nat LGaz 1,51        6.0   x1.89       11.34
Cebula tkg apak           1.0   x3.49       3.49
Brokut 500g szt           1.0   x4.49       4.49
CZ0SNEK szt               2.0   x1.79       3.58
Winogrono jas luz         0.496 x13.29      6.59
Banan luz                 9.762 x5.69       4.34
Butkaseznsienlno/g        6.0   x0.79       4.74
PAPR CZER PL LUZ          0.784 x12.99      10.18
Wafle Mix 400g            4.0   x1.89       1.89
Wafle Hix 100g            1.0   x1.89       1.89
Jog Fruvitads 40

In [84]:
len(products)

48

### Assembly contents

In [186]:
extracted_content = {
    'shop_name': shopname,
    'items': products,
    'total_sum': total_sum
}

In [192]:
with open('content.json', 'w') as f:
    json.dump(extracted_content, f, indent=4)

### TODO

Allow to correct wrong values when converting types in runtime.
If option not enabled, fill wrong content with Na