# Преобразование датасета из Толоки в датасет для бенчмарка 

In [1]:
import cv2
import pandas as pd
import os


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
!pwd

/home/kirillk/PycharmProjects/useful_notebooks


In [3]:
os.listdir('/home/kirillk/Documents/yandex_toloka/ocr_schet_factura_1/')

['tasks_for_pool_1186103__26-05-2022.tsv',
 'assignments_from_pool_1186103__27-05-2022.tsv']

## Загрузка исходного датасета и первичные преобразования

In [4]:
df = pd.read_csv('/home/kirillk/Documents/yandex_toloka/ocr_schet_factura_1/assignments_ocr_schet_factura.csv',
                sep='\t')

In [5]:
df.columns = ['filename', 'rectangles', 'doc_type', 'doc_number', 'doc_date']

In [6]:
df['filename'] = df['filename'].apply(lambda x: x.split('/')[-1])

In [7]:
df['scan_is_good'] = True

In [8]:
df.head()

Unnamed: 0,filename,rectangles,doc_type,doc_number,doc_date,scan_is_good
0,190.jpg,"{""shape"":""rectangle""\,""left"":0.059184611970619...",сф,6017637496,04.03.2022,True
1,084.jpg,"{""shape"":""rectangle""\,""left"":0.041617963443564...",сф,6600208435,15.11.2021,True
2,036.jpg,"{""shape"":""rectangle""\,""left"":0.025688048801457...",сф,60172667909,15.12.2021,True
3,031.jpg,"{""shape"":""rectangle""\,""left"":0.079951683602433...",сф,6017257557,10.12.2021,True
4,043.jpg,"{""shape"":""rectangle""\,""left"":0.035432585062619...",сф,6017291593,28.12.2021,True


In [9]:
df.shape

(200, 6)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   filename      200 non-null    object
 1   rectangles    200 non-null    object
 2   doc_type      200 non-null    object
 3   doc_number    200 non-null    int64 
 4   doc_date      200 non-null    object
 5   scan_is_good  200 non-null    bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 8.1+ KB


## Подготовка данных в датасете

### Подготовка данных в столбце 'filename'

In [12]:
sorted(df['filename'].unique())

['001.jpg',
 '002.jpg',
 '003.jpg',
 '004.jpg',
 '005.jpg',
 '006.jpg',
 '007.jpg',
 '008.jpg',
 '009.jpg',
 '010.jpg',
 '011.jpg',
 '012.jpg',
 '013.jpg',
 '014.jpg',
 '015.jpg',
 '016.jpg',
 '017.jpg',
 '018.jpg',
 '019.jpg',
 '020.jpg',
 '021.jpg',
 '022.jpg',
 '023.jpg',
 '024.jpg',
 '025.jpg',
 '026.jpg',
 '027.jpg',
 '028.jpg',
 '029.jpg',
 '030.jpg',
 '031.jpg',
 '032.jpg',
 '033.jpg',
 '034.jpg',
 '035.jpg',
 '036.jpg',
 '037.jpg',
 '038.jpg',
 '039.jpg',
 '040.jpg',
 '041.jpg',
 '042.jpg',
 '043.jpg',
 '044.jpg',
 '045.jpg',
 '046.jpg',
 '047.jpg',
 '048.jpg',
 '049.jpg',
 '050.jpg',
 '051.jpg',
 '052.jpg',
 '053.jpg',
 '054.jpg',
 '055.jpg',
 '056.jpg',
 '057.jpg',
 '058.jpg',
 '059.jpg',
 '060.jpg',
 '061.jpg',
 '062.jpg',
 '063.jpg',
 '064.jpg',
 '065.jpg',
 '066.jpg',
 '067.jpg',
 '068.jpg',
 '069.jpg',
 '070.jpg',
 '071.jpg',
 '072.jpg',
 '073.jpg',
 '074.jpg',
 '075.jpg',
 '076.jpg',
 '077.jpg',
 '078.jpg',
 '079.jpg',
 '080.jpg',
 '081.jpg',
 '082.jpg',
 '083.jpg',
 '08

Данные готовы

### Подготовка данных в столбце 'doc_type'

In [13]:
df['doc_type'].unique()

array(['сф'], dtype=object)

In [14]:
df['doc_type'] = df['doc_type'].str.lower()

In [15]:
df['doc_type'].unique()

array(['сф'], dtype=object)

### Подготовка данных в столбце 'doc_number'

In [19]:
df['doc_number'] = df['doc_number'].astype('str')

In [20]:
df[df['doc_number'].str.len() !=10]

Unnamed: 0,filename,rectangles,doc_type,doc_number,doc_date,scan_is_good
2,036.jpg,"{""shape"":""rectangle""\,""left"":0.025688048801457...",сф,60172667909,15.12.2021,True


In [21]:
df['doc_number'][2]

'60172667909'

Исправим ошибки разметки.

In [22]:
df['doc_number'][2] = '6017267909'
df['doc_number'][2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['doc_number'][2] = '6017267909'


'6017267909'

In [23]:
df[df['doc_number'].str.len() !=10]

Unnamed: 0,filename,rectangles,doc_type,doc_number,doc_date,scan_is_good


In [24]:
df[df['doc_number'].str.isnumeric() == False]

Unnamed: 0,filename,rectangles,doc_type,doc_number,doc_date,scan_is_good


Данные готовы.

### Парсинг столбца rectangles

In [25]:
rec = df['rectangles'][0]

In [26]:
rec = eval(rec.replace('\\', ''))

In [27]:
rec

({'shape': 'rectangle',
  'left': 0.05918461197061993,
  'top': 0.06259846706138808,
  'width': 0.09241069833923372,
  'height': 0.010006867495788066,
  'label': 'doc_type'},
 {'shape': 'rectangle',
  'left': 0.17587269716168627,
  'top': 0.0627837794224212,
  'width': 0.07648682094179513,
  'height': 0.010192179856821193,
  'label': 'doc_number'},
 {'shape': 'rectangle',
  'left': 0.2745485275917155,
  'top': 0.06259846706138808,
  'width': 0.0665670284647023,
  'height': 0.01111874166198673,
  'label': 'doc_date'})

In [28]:
example = pd.DataFrame(rec).set_index('label').drop('shape', axis=1).T.unstack().to_frame().T

In [29]:
example

label,doc_type,doc_type,doc_type,doc_type,doc_number,doc_number,doc_number,doc_number,doc_date,doc_date,doc_date,doc_date
Unnamed: 0_level_1,left,top,width,height,left,top,width,height,left,top,width,height
0,0.059185,0.062598,0.092411,0.010007,0.175873,0.062784,0.076487,0.010192,0.274549,0.062598,0.066567,0.011119


In [30]:
example.columns

MultiIndex([(  'doc_type',   'left'),
            (  'doc_type',    'top'),
            (  'doc_type',  'width'),
            (  'doc_type', 'height'),
            ('doc_number',   'left'),
            ('doc_number',    'top'),
            ('doc_number',  'width'),
            ('doc_number', 'height'),
            (  'doc_date',   'left'),
            (  'doc_date',    'top'),
            (  'doc_date',  'width'),
            (  'doc_date', 'height')],
           names=['label', None])

In [31]:
example.columns = ['_'.join(col) for col in example.columns]
example.columns

Index(['doc_type_left', 'doc_type_top', 'doc_type_width', 'doc_type_height',
       'doc_number_left', 'doc_number_top', 'doc_number_width',
       'doc_number_height', 'doc_date_left', 'doc_date_top', 'doc_date_width',
       'doc_date_height'],
      dtype='object')

In [32]:
'_'.join(example.columns[0].split('_')[:2])

'doc_type'

In [33]:
new_columns_list = []

In [34]:
for col in example.columns:
    new_columns_list.append('_'.join(col.split('_')[:2]) + '_box_' + col.split('_')[-1])
#     print(col)

In [35]:
new_columns_list

['doc_type_box_left',
 'doc_type_box_top',
 'doc_type_box_width',
 'doc_type_box_height',
 'doc_number_box_left',
 'doc_number_box_top',
 'doc_number_box_width',
 'doc_number_box_height',
 'doc_date_box_left',
 'doc_date_box_top',
 'doc_date_box_width',
 'doc_date_box_height']

In [36]:
example.columns = new_columns_list

In [37]:
example.columns = example.columns.str.replace('-', '_')

In [38]:
example.columns

Index(['doc_type_box_left', 'doc_type_box_top', 'doc_type_box_width',
       'doc_type_box_height', 'doc_number_box_left', 'doc_number_box_top',
       'doc_number_box_width', 'doc_number_box_height', 'doc_date_box_left',
       'doc_date_box_top', 'doc_date_box_width', 'doc_date_box_height'],
      dtype='object')

In [39]:
example

Unnamed: 0,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height,doc_date_box_left,doc_date_box_top,doc_date_box_width,doc_date_box_height
0,0.059185,0.062598,0.092411,0.010007,0.175873,0.062784,0.076487,0.010192,0.274549,0.062598,0.066567,0.011119


In [40]:
rectangles_list = []

In [41]:
for rectangles in df['rectangles']:
    rectangles = eval(rectangles.replace('\\', ''))
    rectangles = pd.DataFrame(rectangles).set_index('label').drop('shape', axis=1).T.unstack().to_frame().T
    
    rectangles.columns = ['_'.join(col) for col in rectangles.columns]
    
    new_columns_list = []
    for col in rectangles.columns:
        new_columns_list.append('_'.join(col.split('_')[:2]) + '_box_' + col.split('_')[-1])
    
    rectangles.columns = new_columns_list
    
    rectangles.columns = rectangles.columns.str.replace('-', '_')
    rectangles.columns = rectangles.columns.str.replace('name', 'type')
    rectangles_list.append(rectangles)

In [42]:
boxes_table = pd.concat(rectangles_list, ignore_index=True)

In [43]:
boxes_table.head(10)

Unnamed: 0,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height,doc_date_box_left,doc_date_box_top,doc_date_box_width,doc_date_box_height
0,0.059185,0.062598,0.092411,0.010007,0.175873,0.062784,0.076487,0.010192,0.274549,0.062598,0.066567,0.011119
1,0.041618,0.038958,0.095529,0.01186,0.162381,0.038958,0.076732,0.012045,0.261773,0.039514,0.067463,0.010933
2,0.025688,0.034857,0.092651,0.010556,0.144101,0.034187,0.074688,0.011896,0.241479,0.034522,0.065943,0.010556
3,0.079952,0.055576,0.088377,0.009839,0.192667,0.055058,0.069473,0.010875,0.284116,0.055576,0.062383,0.010012
4,0.035433,0.040401,0.092204,0.008621,0.152521,0.039651,0.073082,0.010308,0.247082,0.039838,0.065224,0.009371
5,0.040493,0.037626,0.097163,0.010563,0.162992,0.03744,0.079663,0.010933,0.265117,0.036328,0.071566,0.012045
6,0.045505,0.047221,0.094511,0.011561,0.165559,0.047724,0.075469,0.012231,0.263785,0.048561,0.070128,0.011394
7,0.036359,0.043557,0.091938,0.009659,0.152149,0.04288,0.073411,0.011354,0.248024,0.043557,0.066464,0.010506
8,0.056875,0.038264,0.097637,0.010377,0.178987,0.038079,0.079412,0.010933,0.28053,0.03845,0.071861,0.010748
9,0.042909,0.070024,0.092769,0.009886,0.160574,0.069689,0.076329,0.009718,0.25804,0.070024,0.065995,0.009215


In [44]:
df = df.join(boxes_table)

In [45]:
df.head()

Unnamed: 0,filename,rectangles,doc_type,doc_number,doc_date,scan_is_good,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height,doc_date_box_left,doc_date_box_top,doc_date_box_width,doc_date_box_height
0,190.jpg,"{""shape"":""rectangle""\,""left"":0.059184611970619...",сф,6017637496,04.03.2022,True,0.059185,0.062598,0.092411,0.010007,0.175873,0.062784,0.076487,0.010192,0.274549,0.062598,0.066567,0.011119
1,084.jpg,"{""shape"":""rectangle""\,""left"":0.041617963443564...",сф,6600208435,15.11.2021,True,0.041618,0.038958,0.095529,0.01186,0.162381,0.038958,0.076732,0.012045,0.261773,0.039514,0.067463,0.010933
2,036.jpg,"{""shape"":""rectangle""\,""left"":0.025688048801457...",сф,6017267909,15.12.2021,True,0.025688,0.034857,0.092651,0.010556,0.144101,0.034187,0.074688,0.011896,0.241479,0.034522,0.065943,0.010556
3,031.jpg,"{""shape"":""rectangle""\,""left"":0.079951683602433...",сф,6017257557,10.12.2021,True,0.079952,0.055576,0.088377,0.009839,0.192667,0.055058,0.069473,0.010875,0.284116,0.055576,0.062383,0.010012
4,043.jpg,"{""shape"":""rectangle""\,""left"":0.035432585062619...",сф,6017291593,28.12.2021,True,0.035433,0.040401,0.092204,0.008621,0.152521,0.039651,0.073082,0.010308,0.247082,0.039838,0.065224,0.009371


### Проверка join

In [46]:
for idx, num in enumerate(df['doc_type_box_left']):
    if str(num) not in df['rectangles'][idx]:
        print(idx, 'NOT ok')

### Готовый датасет

In [48]:
df = df.sort_values(by='filename').reset_index(drop=True)
df.head()

Unnamed: 0,filename,rectangles,doc_type,doc_number,doc_date,scan_is_good,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height,doc_date_box_left,doc_date_box_top,doc_date_box_width,doc_date_box_height
0,001.jpg,"{""shape"":""rectangle""\,""left"":0.046325926841854...",сф,6600241074,23.12.2021,True,0.046326,0.037244,0.098478,0.009955,0.16967,0.036894,0.080014,0.010479,0.271595,0.037069,0.070412,0.010654
1,002.jpg,"{""shape"":""rectangle""\,""left"":0.043181886974597...",сф,6600254174,07.02.2022,True,0.043182,0.037061,0.096808,0.009625,0.166044,0.036879,0.07714,0.009806,0.267449,0.036697,0.070754,0.01017
2,003.jpg,"{""shape"":""rectangle""\,""left"":0.023974525676559...",сф,6600013447,20.01.2022,True,0.023975,0.035199,0.096616,0.011047,0.146917,0.036286,0.07898,0.00978,0.248645,0.037191,0.071823,0.00978
3,004.jpg,"{""shape"":""rectangle""\,""left"":0.061055422378549...",сф,6015828900,13.05.2021,True,0.061055,0.059328,0.094275,0.011184,0.179674,0.059328,0.075181,0.011015,0.276813,0.059328,0.067305,0.010337
4,005.jpg,"{""shape"":""rectangle""\,""left"":0.042334104467394...",сф,6016891910,22.10.2021,True,0.042334,0.043039,0.091055,0.009589,0.15673,0.043039,0.074127,0.010511,0.251377,0.042671,0.065662,0.010326


In [49]:
df = df.drop('rectangles', axis=1)

In [50]:
df.head(10)

Unnamed: 0,filename,doc_type,doc_number,doc_date,scan_is_good,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height,doc_date_box_left,doc_date_box_top,doc_date_box_width,doc_date_box_height
0,001.jpg,сф,6600241074,23.12.2021,True,0.046326,0.037244,0.098478,0.009955,0.16967,0.036894,0.080014,0.010479,0.271595,0.037069,0.070412,0.010654
1,002.jpg,сф,6600254174,07.02.2022,True,0.043182,0.037061,0.096808,0.009625,0.166044,0.036879,0.07714,0.009806,0.267449,0.036697,0.070754,0.01017
2,003.jpg,сф,6600013447,20.01.2022,True,0.023975,0.035199,0.096616,0.011047,0.146917,0.036286,0.07898,0.00978,0.248645,0.037191,0.071823,0.00978
3,004.jpg,сф,6015828900,13.05.2021,True,0.061055,0.059328,0.094275,0.011184,0.179674,0.059328,0.075181,0.011015,0.276813,0.059328,0.067305,0.010337
4,005.jpg,сф,6016891910,22.10.2021,True,0.042334,0.043039,0.091055,0.009589,0.15673,0.043039,0.074127,0.010511,0.251377,0.042671,0.065662,0.010326
5,006.jpg,сф,6016895321,25.10.2021,True,0.01556,0.036633,0.093267,0.008812,0.132381,0.035786,0.073281,0.010167,0.228503,0.035617,0.066619,0.010167
6,007.jpg,сф,6016896123,25.10.2021,True,0.041264,0.042144,0.092451,0.010845,0.156414,0.042144,0.0759,0.011015,0.251939,0.042314,0.065733,0.010167
7,008.jpg,сф,6016899456,27.10.2021,True,0.037961,0.040051,0.091984,0.010095,0.15499,0.04054,0.073086,0.009769,0.249706,0.040703,0.065573,0.009769
8,009.jpg,сф,6017071562,09.11.2021,True,0.022563,0.033631,0.093425,0.01089,0.140986,0.033631,0.076255,0.011247,0.239208,0.03381,0.067165,0.011069
9,010.jpg,сф,6017073406,09.11.2021,True,0.022622,0.034612,0.094483,0.010173,0.14154,0.034228,0.075478,0.010557,0.239825,0.034612,0.065161,0.009789


## Проверка на изображениях

In [51]:
mapping_prefix_value = {
    'doc_number_box_': 'doc_number',
    'doc_date_box_': None,
    'doc_type_box_': None,
}

In [52]:
for index, row in df.iterrows():
    if index > 1:
        image = cv2.imread('/home/kirillk/datasets/OCR/ocr_schet_factura_1/' + row['filename'])
        image = cv2.resize(image, (0, 0), fx=0.5, fy=0.5)
        height, width, _ = image.shape

        for prefix, value in mapping_prefix_value.items():
            x1 = int(row[prefix + 'left'] * width)
            y1 = int(row[prefix + 'top'] * height)
            x2 = int((row[prefix + 'left'] + row[prefix + 'width']) * width)
            y2 = int((row[prefix + 'top'] + row[prefix + 'height']) * height)

            image = cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 4)
            image = cv2.putText(image, 
                                str(row.get(value, default='')), 
                                (x1, y2 - 70),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                #0,
                                1,
                                (0, 0, 255),
                                2)

            cv2.imshow('Document', image)

            key = cv2.waitKey(0) & 0xFF

            if key == ord('q'):
                break
    
    cv2.destroyAllWindows()
#     if index == 5:
#         break

df.info()

In [55]:
df['doc_number'][36] = '6017268120'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['doc_number'][36] = '6017268120'


In [61]:
df.loc[36]

filename                    037.jpg
doc_type                         сф
doc_number               6017268120
doc_date                 15.12.2021
scan_is_good                   True
doc_type_box_left          0.044158
doc_type_box_top           0.040868
doc_type_box_width         0.092209
doc_type_box_height        0.009731
doc_number_box_left         0.16061
doc_number_box_top         0.040057
doc_number_box_width       0.074538
doc_number_box_height      0.010866
doc_date_box_left          0.256217
doc_date_box_top           0.040543
doc_date_box_width         0.064569
doc_date_box_height        0.010055
Name: 36, dtype: object

## Сохранение датасета

In [62]:
df.to_csv('/home/kirillk/datasets/OCR/ocr_schet_factura_1/test-cases-schet-factura.csv')

In [63]:
test = pd.read_csv('/home/kirillk/datasets/OCR/ocr_schet_factura_1/test-cases-schet-factura.csv')

In [64]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             200 non-null    int64  
 1   filename               200 non-null    object 
 2   doc_type               200 non-null    object 
 3   doc_number             200 non-null    int64  
 4   doc_date               200 non-null    object 
 5   scan_is_good           200 non-null    bool   
 6   doc_type_box_left      200 non-null    float64
 7   doc_type_box_top       200 non-null    float64
 8   doc_type_box_width     200 non-null    float64
 9   doc_type_box_height    200 non-null    float64
 10  doc_number_box_left    200 non-null    float64
 11  doc_number_box_top     200 non-null    float64
 12  doc_number_box_width   200 non-null    float64
 13  doc_number_box_height  200 non-null    float64
 14  doc_date_box_left      200 non-null    float64
 15  doc_da