# Преобразование датасета из Толоки в датасет для бенчмарка 

In [2]:
import cv2
import pandas as pd
import os


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
!pwd

/home/kirillk/PycharmProjects/useful_notebooks/ocr


In [4]:
os.listdir('/home/kirillk/Documents/yandex_toloka/ocr_ttn/')

['tasks_ocr_ttn.tsv', 'assignments_ocr_ttn.tsv']

## Загрузка исходного датасета и первичные преобразования

In [5]:
df = pd.read_csv('/home/kirillk/Documents/yandex_toloka/ocr_ttn/assignments_ocr_ttn.tsv',
                sep='\t')

In [6]:
df.head(3)

Unnamed: 0,INPUT:image,OUTPUT:path1,OUTPUT:path3,OUTPUT:path4
0,/toloka-proxy-danil-01/ocr_ttn_1/001.jpg,"{""shape"":""rectangle""\,""left"":0.036891056767540...",2880107901381,0074786723/2053332022
1,/toloka-proxy-danil-01/ocr_ttn_1/002.jpg,"{""shape"":""rectangle""\,""left"":0.034492666106048...",2880108066928,0074801020/2053439854
2,/toloka-proxy-danil-01/ocr_ttn_1/003.jpg,"{""shape"":""rectangle""\,""left"":0.044061530761221...",2880108067130,0074801108/2053440150


In [7]:
df.columns = ['filename', 'rectangles', 'barcode_number', 'doc_number']

In [8]:
df['filename'] = df['filename'].apply(lambda x: x.split('/')[-1])

In [9]:
# df['scan_is_good'] = True

In [10]:
df.head()

Unnamed: 0,filename,rectangles,barcode_number,doc_number
0,001.jpg,"{""shape"":""rectangle""\,""left"":0.036891056767540...",2880107901381,0074786723/2053332022
1,002.jpg,"{""shape"":""rectangle""\,""left"":0.034492666106048...",2880108066928,0074801020/2053439854
2,003.jpg,"{""shape"":""rectangle""\,""left"":0.044061530761221...",2880108067130,0074801108/2053440150
3,004.jpg,"{""shape"":""rectangle""\,""left"":0.045632442846534...",2880108430682,0074837996/2053658941
4,005.jpg,"{""shape"":""rectangle""\,""left"":0.035318245138288...",2880108430712,0074837999/2053659636


In [11]:
df.shape

(44, 4)

## Подготовка данных в датасете

### Подготовка данных в столбце 'filename'

In [12]:
df['filename'].unique()

array(['001.jpg', '002.jpg', '003.jpg', '004.jpg', '005.jpg', '006.jpg',
       '007.jpg', '008.jpg', '009.jpg', '010.jpg', '011.jpg', '012.jpg',
       '013.jpg', '014.jpg', '015.jpg', '016.jpg', '017.jpg', '018.jpg',
       '019.jpg', '020.jpg', '021.jpg', '022.jpg', '023.jpg', '024.jpg',
       '025.jpg', '026.jpg', '027.jpg', '028.jpg', '029.jpg', '030.jpg',
       '031.jpg', '032.jpg', '033.jpg', '034.jpg', '035.jpg', '036.jpg',
       '037.jpg', '038.jpg', '039.jpg', '040.jpg', '041.jpg', '042.jpg',
       '043.jpg', '044.jpg'], dtype=object)

Данные готовы

### Подготовка данных в столбце 'barcode_number'

In [13]:
df['barcode_number'].unique()

array([ 2880107901381,  2880108066928,  2880108067130,  2880108430682,
        2880108430712,   880109787082,  2880110095657,  2880110225252,
        2880110278562,  2880110335722,  2880110326782,  2880110673473,
        2880110661296,  2880110724779,        2880110,  2880111337145,
        2880111746954,  2880112046077,  2880112090308,  2880112239561,
        2880112181051,  2880112587204,  2880112712774,  2880112962339,
        2880113182347,  2880113157475,  2880113340853,  2880113356434,
        2880113374926, 28801133710792,  2880113371086,  2880113469219,
        2880113487794,  2880113560275,   288011387873,  2880113915402,
        2880113901238,  2880113999044,  2880113990782,  2880114037332,
        2880114032191,  2880114059839,  2880114665719,  2880114688664])

Нужно преобразовать некоторые номера штрих-кодов.

In [14]:
df['barcode_number'] = df['barcode_number'].astype('str')

In [15]:
df[df['barcode_number'].str.len() !=13]

Unnamed: 0,filename,rectangles,barcode_number,doc_number
5,006.jpg,"{""shape"":""rectangle""\,""left"":0.043080328511974...",880109787082,0074955703/2054229575
14,015.jpg,"{""shape"":""rectangle""\,""left"":0.040457286871261...",2880110,0075059242/2054752726
29,030.jpg,"{""shape"":""rectangle""\,""left"":0.032497078094806...",28801133710792,0075279413/2056283563
34,035.jpg,"{""shape"":""rectangle""\,""left"":0.041800047271807...",288011387873,0075325667/2056558356


In [16]:
df['barcode_number'][5] = '2880109787082'
df['barcode_number'][5]

'2880109787082'

In [17]:
df['barcode_number'][14] = '2880110916051'
df['barcode_number'][14]

'2880110916051'

In [18]:
df['barcode_number'][29] = '2880113371079'
df['barcode_number'][29]

'2880113371079'

In [19]:
df['barcode_number'][34] = '2880113878783'
df['barcode_number'][34]

'2880113878783'

In [20]:
df[df['barcode_number'].str.len() !=13]

Unnamed: 0,filename,rectangles,barcode_number,doc_number


Все номера тринадцатизначной длинны.

### Подготовка данных в столбце 'doc_number'

In [21]:
df[df['doc_number'].str.len() != 21]

Unnamed: 0,filename,rectangles,barcode_number,doc_number
11,012.jpg,"{""shape"":""rectangle""\,""left"":0.042099142399997...",2880110673473,0075036788/205464798
12,013.jpg,"{""shape"":""rectangle""\,""left"":0.056950961348970...",2880110661296,0075034232/20546700
13,014.jpg,"{""shape"":""rectangle""\,""left"":0.056784085403379...",2880110724779,0075042351/205470639
32,033.jpg,"{""shape"":""rectangle""\,""left"":0.041433587658181...",2880113487794,007591081/2056342193


#### Проверим разметку.

На фото 012.jpg номер документа реально меньше на одну цифру, чем на других фото.

На фото 013.jpg номер документа реально меньше на две цифры, чем на других фото.

На фото 014.jpg номер документа реально меньше на одну цифру, чем на других фото.

In [22]:
df['doc_number'][32] = '0075291081/2056342193'
df['doc_number'][32]

'0075291081/2056342193'

Для фото 033.jpg была исправлена ошибка в номере документа.

In [23]:
df[df['doc_number'].str.len() != 21]

Unnamed: 0,filename,rectangles,barcode_number,doc_number
11,012.jpg,"{""shape"":""rectangle""\,""left"":0.042099142399997...",2880110673473,0075036788/205464798
12,013.jpg,"{""shape"":""rectangle""\,""left"":0.056950961348970...",2880110661296,0075034232/20546700
13,014.jpg,"{""shape"":""rectangle""\,""left"":0.056784085403379...",2880110724779,0075042351/205470639


Данные в столбце doc_number готовы.

### Парсинг столбца rectangles

In [24]:
rec = df['rectangles'][0]

In [25]:
rec = eval(rec.replace('\\', ''))

In [26]:
rec

({'shape': 'rectangle',
  'left': 0.03689105676754057,
  'top': 0.04137304301448413,
  'width': 0.1822116669205581,
  'height': 0.05197405707743504,
  'label': 'barcode'},
 {'shape': 'rectangle',
  'left': 0.16763519134607313,
  'top': 0.0974580424596259,
  'width': 0.08571004377926023,
  'height': 0.01908651813578123,
  'label': 'doc-name'},
 {'shape': 'rectangle',
  'left': 0.3612610287457095,
  'top': 0.17674050240825565,
  'width': 0.10376518617343855,
  'height': 0.016443769470826902,
  'label': 'doc-number'})

In [27]:
example = pd.DataFrame(rec).set_index('label').drop('shape', axis=1).T.unstack().to_frame().T

In [28]:
example

label,barcode,barcode,barcode,barcode,doc-name,doc-name,doc-name,doc-name,doc-number,doc-number,doc-number,doc-number
Unnamed: 0_level_1,left,top,width,height,left,top,width,height,left,top,width,height
0,0.036891,0.041373,0.182212,0.051974,0.167635,0.097458,0.08571,0.019087,0.361261,0.176741,0.103765,0.016444


In [29]:
example.columns

MultiIndex([(   'barcode',   'left'),
            (   'barcode',    'top'),
            (   'barcode',  'width'),
            (   'barcode', 'height'),
            (  'doc-name',   'left'),
            (  'doc-name',    'top'),
            (  'doc-name',  'width'),
            (  'doc-name', 'height'),
            ('doc-number',   'left'),
            ('doc-number',    'top'),
            ('doc-number',  'width'),
            ('doc-number', 'height')],
           names=['label', None])

In [30]:
example.columns = ['_'.join(col) for col in example.columns]
example.columns

Index(['barcode_left', 'barcode_top', 'barcode_width', 'barcode_height',
       'doc-name_left', 'doc-name_top', 'doc-name_width', 'doc-name_height',
       'doc-number_left', 'doc-number_top', 'doc-number_width',
       'doc-number_height'],
      dtype='object')

In [31]:
'_'.join(example.columns[0].split('_')[:2])

'barcode_left'

In [32]:
new_columns_list = []

In [33]:
for col in example.columns:
#     new_columns_list.append('_'.join(col.split('_')[:1]) + '_box_' + col.split('_')[-1])
    new_columns_list.append('_'.join(col.split('_')[:1]) + '_' +col.split('_')[-1])
#     print(col)

In [34]:
new_columns_list

['barcode_left',
 'barcode_top',
 'barcode_width',
 'barcode_height',
 'doc-name_left',
 'doc-name_top',
 'doc-name_width',
 'doc-name_height',
 'doc-number_left',
 'doc-number_top',
 'doc-number_width',
 'doc-number_height']

In [35]:
example.columns = new_columns_list

In [36]:
example.columns = example.columns.str.replace('-', '_')

In [37]:
example.columns

Index(['barcode_left', 'barcode_top', 'barcode_width', 'barcode_height',
       'doc_name_left', 'doc_name_top', 'doc_name_width', 'doc_name_height',
       'doc_number_left', 'doc_number_top', 'doc_number_width',
       'doc_number_height'],
      dtype='object')

In [38]:
example

Unnamed: 0,barcode_left,barcode_top,barcode_width,barcode_height,doc_name_left,doc_name_top,doc_name_width,doc_name_height,doc_number_left,doc_number_top,doc_number_width,doc_number_height
0,0.036891,0.041373,0.182212,0.051974,0.167635,0.097458,0.08571,0.019087,0.361261,0.176741,0.103765,0.016444


In [39]:
rectangles_list = []

In [40]:
for rectangles in df['rectangles']:
    rectangles = eval(rectangles.replace('\\', ''))
    rectangles = pd.DataFrame(rectangles).set_index('label').drop('shape', axis=1).T.unstack().to_frame().T
    
    rectangles.columns = ['_'.join(col) for col in rectangles.columns]
    
    new_columns_list = []
    for col in rectangles.columns:
#         new_columns_list.append('_'.join(col.split('_')[:2]) + '_box_' + col.split('_')[-1])
        new_columns_list.append('_'.join(col.split('_')[:1]) + '_box_' + col.split('_')[-1])
    
    rectangles.columns = new_columns_list
    
    rectangles.columns = rectangles.columns.str.replace('-', '_')
    rectangles.columns = rectangles.columns.str.replace('name', 'type')
    rectangles_list.append(rectangles)

In [41]:
boxes_table = pd.concat(rectangles_list, ignore_index=True)

In [42]:
boxes_table.head(10)

Unnamed: 0,barcode_box_left,barcode_box_top,barcode_box_width,barcode_box_height,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height
0,0.036891,0.041373,0.182212,0.051974,0.167635,0.097458,0.08571,0.019087,0.361261,0.176741,0.103765,0.016444
1,0.034493,0.035611,0.182089,0.049772,0.167335,0.090945,0.084423,0.016688,0.359356,0.170287,0.104494,0.01581
2,0.044062,0.030233,0.181714,0.054957,0.174441,0.088082,0.086541,0.020247,0.366604,0.170679,0.104713,0.020247
3,0.045632,0.027665,0.1833,0.055418,0.176841,0.085854,0.085775,0.018565,0.370715,0.167041,0.103204,0.017457
4,0.035318,0.037956,0.187007,0.055973,0.16928,0.096473,0.08721,0.02205,0.363179,0.173648,0.107289,0.021202
5,0.04308,0.024237,0.186532,0.05476,0.177269,0.082589,0.087239,0.020199,0.372093,0.166081,0.104088,0.016651
6,0.056558,0.063214,0.176631,0.046913,0.184158,0.114677,0.083593,0.016775,0.37164,0.189169,0.101879,0.0145
7,0.03372,0.038058,0.180761,0.050746,0.164449,0.093879,0.08518,0.016746,0.358659,0.171521,0.10383,0.016746
8,0.057411,0.063246,0.174496,0.049791,0.1834,0.117894,0.082848,0.016394,0.369361,0.194321,0.098513,0.013955
9,0.035202,0.042186,0.181278,0.049887,0.165755,0.096309,0.085483,0.017884,0.359174,0.17608,0.104443,0.014119


In [43]:
df = df.join(boxes_table)

In [44]:
df.head()

Unnamed: 0,filename,rectangles,barcode_number,doc_number,barcode_box_left,barcode_box_top,barcode_box_width,barcode_box_height,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height
0,001.jpg,"{""shape"":""rectangle""\,""left"":0.036891056767540...",2880107901381,0074786723/2053332022,0.036891,0.041373,0.182212,0.051974,0.167635,0.097458,0.08571,0.019087,0.361261,0.176741,0.103765,0.016444
1,002.jpg,"{""shape"":""rectangle""\,""left"":0.034492666106048...",2880108066928,0074801020/2053439854,0.034493,0.035611,0.182089,0.049772,0.167335,0.090945,0.084423,0.016688,0.359356,0.170287,0.104494,0.01581
2,003.jpg,"{""shape"":""rectangle""\,""left"":0.044061530761221...",2880108067130,0074801108/2053440150,0.044062,0.030233,0.181714,0.054957,0.174441,0.088082,0.086541,0.020247,0.366604,0.170679,0.104713,0.020247
3,004.jpg,"{""shape"":""rectangle""\,""left"":0.045632442846534...",2880108430682,0074837996/2053658941,0.045632,0.027665,0.1833,0.055418,0.176841,0.085854,0.085775,0.018565,0.370715,0.167041,0.103204,0.017457
4,005.jpg,"{""shape"":""rectangle""\,""left"":0.035318245138288...",2880108430712,0074837999/2053659636,0.035318,0.037956,0.187007,0.055973,0.16928,0.096473,0.08721,0.02205,0.363179,0.173648,0.107289,0.021202


### Проверка join

In [45]:
for idx, num in enumerate(df['doc_type_box_left']):
    if str(num) not in df['rectangles'][idx]:
        print(idx, 'NOT ok')

### Готовый датасет

In [46]:
df = df.drop('rectangles', axis=1)

In [47]:
df.head(10)

Unnamed: 0,filename,barcode_number,doc_number,barcode_box_left,barcode_box_top,barcode_box_width,barcode_box_height,doc_type_box_left,doc_type_box_top,doc_type_box_width,doc_type_box_height,doc_number_box_left,doc_number_box_top,doc_number_box_width,doc_number_box_height
0,001.jpg,2880107901381,0074786723/2053332022,0.036891,0.041373,0.182212,0.051974,0.167635,0.097458,0.08571,0.019087,0.361261,0.176741,0.103765,0.016444
1,002.jpg,2880108066928,0074801020/2053439854,0.034493,0.035611,0.182089,0.049772,0.167335,0.090945,0.084423,0.016688,0.359356,0.170287,0.104494,0.01581
2,003.jpg,2880108067130,0074801108/2053440150,0.044062,0.030233,0.181714,0.054957,0.174441,0.088082,0.086541,0.020247,0.366604,0.170679,0.104713,0.020247
3,004.jpg,2880108430682,0074837996/2053658941,0.045632,0.027665,0.1833,0.055418,0.176841,0.085854,0.085775,0.018565,0.370715,0.167041,0.103204,0.017457
4,005.jpg,2880108430712,0074837999/2053659636,0.035318,0.037956,0.187007,0.055973,0.16928,0.096473,0.08721,0.02205,0.363179,0.173648,0.107289,0.021202
5,006.jpg,2880109787082,0074955703/2054229575,0.04308,0.024237,0.186532,0.05476,0.177269,0.082589,0.087239,0.020199,0.372093,0.166081,0.104088,0.016651
6,007.jpg,2880110095657,0074983497/2054390170,0.056558,0.063214,0.176631,0.046913,0.184158,0.114677,0.083593,0.016775,0.37164,0.189169,0.101879,0.0145
7,008.jpg,2880110225252,0074995162/2054442486,0.03372,0.038058,0.180761,0.050746,0.164449,0.093879,0.08518,0.016746,0.358659,0.171521,0.10383,0.016746
8,009.jpg,2880110278562,0075001744/2054475227,0.057411,0.063246,0.174496,0.049791,0.1834,0.117894,0.082848,0.016394,0.369361,0.194321,0.098513,0.013955
9,010.jpg,2880110335722,0075005620/2054494573,0.035202,0.042186,0.181278,0.049887,0.165755,0.096309,0.085483,0.017884,0.359174,0.17608,0.104443,0.014119


## Проверка на изображениях

In [52]:
mapping_prefix_value = {
    'barcode_box_': 'barcode_number',
    'doc_type_box_': 'doctype',
    'doc_number_box_': 'doc_number'
}

In [54]:
for index, row in df.iterrows():
    if index > 1:
        image = cv2.imread('/home/kirillk/datasets/OCR/ocr_ttn/jpg/first_pages/' + row['filename'])
        image = cv2.resize(image, (0, 0), fx=0.6, fy=0.6)
        height, width, _ = image.shape

        for prefix, value in mapping_prefix_value.items():
            x1 = int(row[prefix + 'left'] * width)
            y1 = int(row[prefix + 'top'] * height)
            x2 = int((row[prefix + 'left'] + row[prefix + 'width']) * width)
            y2 = int((row[prefix + 'top'] + row[prefix + 'height']) * height)

            image = cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 4)
            image = cv2.putText(image, 
                                str(row.get(value, default='')), 
                                (x1, y2 + 70),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                #0,
                                1,
                                (0, 0, 255),
                                2)

            cv2.imshow('Document', image)

            key = cv2.waitKey(0) & 0xFF

            if key == ord('q'):
                break
    
    cv2.destroyAllWindows()
    if index == 5:
        break

In [55]:
for index, row in df.iterrows():
    image = cv2.imread('/home/kirillk/datasets/OCR/ocr_ttn/jpg/first_pages/' + row['filename'])
    image = cv2.resize(image, (0, 0), fx=0.6, fy=0.6)
    height, width, _ = image.shape

    for prefix, value in mapping_prefix_value.items():
        x1 = int(row[prefix + 'left'] * width)
        y1 = int(row[prefix + 'top'] * height)
        x2 = int((row[prefix + 'left'] + row[prefix + 'width']) * width)
        y2 = int((row[prefix + 'top'] + row[prefix + 'height']) * height)

        image = cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 4)
        image = cv2.putText(image, 
                            str(row.get(value, default='')), 
                            (x1, y2 + 70),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            #0,
                            1,
                            (0, 0, 255),
                            2)

        cv2.imshow('Document', image)

        key = cv2.waitKey(0) & 0xFF

        if key == ord('q'):
            break
    
    cv2.destroyAllWindows()

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   filename               44 non-null     object 
 1   barcode_number         44 non-null     object 
 2   doc_number             44 non-null     object 
 3   barcode_box_left       44 non-null     float64
 4   barcode_box_top        44 non-null     float64
 5   barcode_box_width      44 non-null     float64
 6   barcode_box_height     44 non-null     float64
 7   doc_type_box_left      44 non-null     float64
 8   doc_type_box_top       44 non-null     float64
 9   doc_type_box_width     44 non-null     float64
 10  doc_type_box_height    44 non-null     float64
 11  doc_number_box_left    44 non-null     float64
 12  doc_number_box_top     44 non-null     float64
 13  doc_number_box_width   44 non-null     float64
 14  doc_number_box_height  44 non-null     float64
dtypes: float

## Сохранение датасета

In [57]:
df.to_csv('/home/kirillk/datasets/OCR/ocr_ttn/test-cases-ttn.csv')

In [58]:
test = pd.read_csv('/home/kirillk/datasets/OCR/ocr_ttn/test-cases-ttn.csv')

In [59]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             44 non-null     int64  
 1   filename               44 non-null     object 
 2   barcode_number         44 non-null     int64  
 3   doc_number             44 non-null     object 
 4   barcode_box_left       44 non-null     float64
 5   barcode_box_top        44 non-null     float64
 6   barcode_box_width      44 non-null     float64
 7   barcode_box_height     44 non-null     float64
 8   doc_type_box_left      44 non-null     float64
 9   doc_type_box_top       44 non-null     float64
 10  doc_type_box_width     44 non-null     float64
 11  doc_type_box_height    44 non-null     float64
 12  doc_number_box_left    44 non-null     float64
 13  doc_number_box_top     44 non-null     float64
 14  doc_number_box_width   44 non-null     float64
 15  doc_numb