In [1]:
import os
import glob
import json
import re
from PIL import Image

import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
def show_files(directory):
    """Print and return the non-hidden entries of `directory`.

    An entry counts as hidden when its first character is a dot. The
    surviving names are printed on one line, prefixed by the directory,
    and returned as a list in the order `os.listdir` produced them.
    """
    visible = []
    for entry in os.listdir(directory):
        first_char = entry[0]
        if '.' in first_char:
            continue  # dot-prefixed entry: hidden, leave it out
        visible.append(entry)
    print(directory, "contains:", visible)
    return visible

In [3]:
# Mount point of the USB drive that holds the datasets (macOS)
usb_dir = "/Volumes/DATASETS"

# List the drive root and the COCO-text folder, each followed by a blank gap
for folder in (usb_dir, usb_dir + '/COCO-text'):
    show_files(folder)
    print('\n')

# List the textOCR folder itself, then each of the entries it contains
arr = show_files(usb_dir + '/textOCR')
for item in arr:
    show_files(usb_dir + '/textOCR/' + item)

/Volumes/DATASETS contains: ['textOCR', 'COCO-text']


/Volumes/DATASETS/COCO-text contains: ['cocotext.v2.json', 'train2014']


/Volumes/DATASETS/textOCR contains: ['test', 'train + validation']
/Volumes/DATASETS/textOCR/test contains: ['test_images']
/Volumes/DATASETS/textOCR/train + validation contains: ['TextOCR_0.1_train.json', 'train', 'TextOCR_0.1_val.json']


In [4]:
# Read the COCO-Text annotation file. The context manager closes the file handle
# as soon as parsing is done (a bare open() inside json.load() never closes it
# explicitly), and the encoding is stated instead of relying on the locale default.
with open(usb_dir + '/COCO-text/cocotext.v2.json', encoding='utf-8') as fh:
    data = json.load(fh)  # Get JSON data
annotations = data['anns'].values()  # Keep only the annotation records (values of 'anns')

coco = pd.DataFrame(annotations)  # One row per annotation record
coco.head()

Unnamed: 0,mask,class,bbox,image_id,id,language,area,utf8_string,legibility
0,"[468.9, 286.7, 468.9, 295.2, 493.0, 295.8, 493...",machine printed,"[468.9, 286.7, 24.1, 9.1]",217925,45346,english,206.06,New,legible
1,"[344.5, 261.5, 348.1, 261.5, 348.2, 263.4, 344...",machine printed,"[344.5, 261.5, 3.7, 1.9]",483569,153036,english,6.93,,illegible
2,"[362.4, 280.9, 359.2, 286.3, 367.1, 291.0, 369...",machine printed,"[359.2, 280.9, 10.7, 10.1]",417153,125303,english,57.09,,illegible
3,"[570.9, 9.6, 570.9, 14.3, 557.1, 14.3, 556.8, ...",machine printed,"[556.8, 9.4, 14.1, 4.9]",15451,21639,english,66.94,,illegible
4,"[489.4, 131.8, 502.7, 135.6, 502.7, 156.4, 493...",machine printed,"[489.4, 131.8, 13.3, 24.6]",379024,112792,english,249.08,W,legible


In [5]:
# Number of COCO-Text annotation rows
coco.shape[0]

201126

In [6]:
# Read the TextOCR training annotations. The context manager closes the file
# handle as soon as parsing is done (a bare open() inside json.load() never
# closes it explicitly); the encoding is stated rather than left to the locale.
with open(usb_dir + '/textOCR/train + validation/TextOCR_0.1_train.json', encoding='utf-8') as fh:
    data = json.load(fh)
annotations = data['anns'].values()  # annotation records stored under 'anns'

ocr_train = pd.DataFrame(annotations)  # one row per annotation record
ocr_train.head()

Unnamed: 0,id,image_id,bbox,utf8_string,points,area
0,a4ea732cd3d5948a_1,a4ea732cd3d5948a,"[525.83, 3.4, 197.64, 33.94]",Performance,"[525.83, 3.4, 723.47, 7.29, 722.76, 36.99, 525...",6707.9
1,a4ea732cd3d5948a_2,a4ea732cd3d5948a,"[534.67, 64.68, 91.22, 38.19]",Sport,"[535.73, 64.68, 623.41, 67.51, 625.89, 102.87,...",3483.69
2,a4ea732cd3d5948a_3,a4ea732cd3d5948a,"[626.95, 63.62, 96.52, 31.82]",Watch,"[626.95, 63.62, 721.7, 63.62, 723.47, 95.44, 6...",3071.27
3,a4ea732cd3d5948a_4,a4ea732cd3d5948a,"[577.4, 141.87, 147.13, 43.1]",...period.,"[580.02, 143.61, 724.53, 141.87, 723.66, 184.9...",6341.3
4,a4ea732cd3d5948a_5,a4ea732cd3d5948a,"[391.03, 163.9, 60.82, 38.65]",.,"[395.2, 163.9, 451.85, 191.94, 445.59, 202.55,...",2350.69


In [7]:
# Read the TextOCR validation annotations. The context manager closes the file
# handle as soon as parsing is done (a bare open() inside json.load() never
# closes it explicitly); the encoding is stated rather than left to the locale.
with open(usb_dir + '/textOCR/train + validation/TextOCR_0.1_val.json', encoding='utf-8') as fh:
    data = json.load(fh)
annotations = data['anns'].values()  # annotation records stored under 'anns'

ocr_val = pd.DataFrame(annotations)  # one row per annotation record
ocr_val.head()

Unnamed: 0,id,image_id,bbox,utf8_string,points,area
0,a7ad2bcb93d48576_1,a7ad2bcb93d48576,"[76.73, 63.84, 141.41, 30.66]",RICHARD,"[77.3, 63.84, 217.0, 64.4, 218.14, 94.5, 76.73...",4335.63
1,a7ad2bcb93d48576_2,a7ad2bcb93d48576,"[41.53, 93.93, 206.14, 60.2]",moRGAn,"[42.09, 93.93, 245.39, 93.93, 247.67, 152.99, ...",12409.63
2,a7ad2bcb93d48576_3,a7ad2bcb93d48576,"[39.82, 183.09, 207.28, 75.53]",ALTERED,"[42.66, 186.5, 241.99, 183.09, 247.1, 258.62, ...",15655.86
3,a7ad2bcb93d48576_4,a7ad2bcb93d48576,"[41.53, 251.24, 208.98, 88.59]",CARBOn,"[42.09, 251.24, 245.96, 251.24, 250.51, 335.28...",18513.54
4,a7ad2bcb93d48576_5,a7ad2bcb93d48576,"[46.46, 158.99, 16.73, 9.74]",'An,"[46.46, 158.99, 61.98, 159.3, 63.19, 168.73, 4...",162.95


In [8]:
# Merge the TextOCR validation and training annotations into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep their
# own 0-based labels and the merged frame ends up with duplicate index values.
ocr = pd.concat([ocr_val, ocr_train], ignore_index=True)
len(ocr)

1202339

In [9]:
# For merging dataframes: bring both annotation tables to the same column layout.
# Every step builds a new frame instead of using inplace=True on a filtered
# slice, which is the pattern that triggers pandas' SettingWithCopyWarning.

# Source identifier for easier image location (can be dropped if unnecessary)
# coco['src'] = 'COCO-Text'
# ocr['src'] = 'TextOCR'

# Shared column order for both datasets
cols = ['id', 'image_id', 'mask', 'bbox', 'utf8_string', 'language', 'area']

# COCO-Text: keep only the rows marked legible. Selecting `cols` in the same step
# also removes 'legibility' (redundant after the filter) and 'class' (being
# machine printed or not is irrelevant).
coco = coco.loc[coco['legibility'] == 'legible', cols]

# TextOCR: drop rows where the text is unidentifiable (stored as '.'), rename
# 'points' to 'mask' to match COCO-Text, and add a language tag
# (assuming all data is English).
ocr = (
    ocr.loc[ocr['utf8_string'] != '.']
    .rename(columns={'points': 'mask'})
    .assign(language='english')
    .loc[:, cols]
)

In [10]:
# Preview the first five cleaned COCO-Text rows
coco.head(n=5)

Unnamed: 0,id,image_id,mask,bbox,utf8_string,language,area
0,45346,217925,"[468.9, 286.7, 468.9, 295.2, 493.0, 295.8, 493...","[468.9, 286.7, 24.1, 9.1]",New,english,206.06
4,112792,379024,"[489.4, 131.8, 502.7, 135.6, 502.7, 156.4, 493...","[489.4, 131.8, 13.3, 24.6]",W,english,249.08
6,8231,122908,"[577.8, 148.3, 601.7, 149.1, 611.0, 196.2, 575...","[575.7, 148.3, 35.3, 47.9]",NAVY,english,1366.38
13,175298,544815,"[398.7, 16.5, 385.0, 18.2, 387.1, 48.7, 401.7,...","[385.0, 16.5, 16.7, 32.2]",6,english,426.05
15,125471,417556,"[385.2, 53.8, 383.9, 66.2, 393.8, 67.1, 393.8,...","[383.9, 53.8, 17.9, 13.3]",$189,english,162.18


In [11]:
# Preview the first five cleaned TextOCR rows
ocr.head(n=5)

Unnamed: 0,id,image_id,mask,bbox,utf8_string,language,area
0,a7ad2bcb93d48576_1,a7ad2bcb93d48576,"[77.3, 63.84, 217.0, 64.4, 218.14, 94.5, 76.73...","[76.73, 63.84, 141.41, 30.66]",RICHARD,english,4335.63
1,a7ad2bcb93d48576_2,a7ad2bcb93d48576,"[42.09, 93.93, 245.39, 93.93, 247.67, 152.99, ...","[41.53, 93.93, 206.14, 60.2]",moRGAn,english,12409.63
2,a7ad2bcb93d48576_3,a7ad2bcb93d48576,"[42.66, 186.5, 241.99, 183.09, 247.1, 258.62, ...","[39.82, 183.09, 207.28, 75.53]",ALTERED,english,15655.86
3,a7ad2bcb93d48576_4,a7ad2bcb93d48576,"[42.09, 251.24, 245.96, 251.24, 250.51, 335.28...","[41.53, 251.24, 208.98, 88.59]",CARBOn,english,18513.54
4,a7ad2bcb93d48576_5,a7ad2bcb93d48576,"[46.46, 158.99, 61.98, 159.3, 63.19, 168.73, 4...","[46.46, 158.99, 16.73, 9.74]",'An,english,162.95


In [12]:
# Finally merge the two dataframes. ignore_index=True gives every row a unique
# label; the COCO-Text and TextOCR frames otherwise bring overlapping index values.
df = pd.concat([coco, ocr], ignore_index=True)

# Shuffle the rows; the fixed seed makes the order reproducible across re-runs
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df.head(20)

Unnamed: 0,id,image_id,mask,bbox,utf8_string,language,area
880291,013547912a1c0427_25,013547912a1c0427,"[669.47, 630.92, 710.57, 631.13, 711.42, 643.2...","[669.47, 630.92, 41.95, 12.28]",Austin,english,515.15
723071,6efbee668e2de1ac_30,6efbee668e2de1ac,"[252.29, 610.76, 292.0, 606.37, 297.46, 626.07...","[252.29, 606.37, 45.17, 25.92]",533,english,1170.81
487585,d6997ab8b72ff813_54,d6997ab8b72ff813,"[371.04, 670.17, 449.54, 670.17, 449.54, 697.3...","[371.04, 670.17, 78.5, 27.17]",CONCHA,english,2132.85
377773,a3f187813c0f4280_14,a3f187813c0f4280,"[765.13, 99.94, 764.78, 54.93, 792.61, 55.62, ...","[764.78, 54.93, 28.17, 46.04]",COUTER,english,1296.95
532166,42dab293ff2cd065_69,42dab293ff2cd065,"[0.06, 43.33, 121.81, 58.08, 121.81, 92.58, 0....","[0.06, 43.33, 121.75, 49.25]",Drowning,english,5996.19
877432,e70996b2a6edbb4e_9,e70996b2a6edbb4e,"[545.44, 514.17, 559.87, 513.65, 560.13, 534.8...","[545.18, 513.65, 14.95, 21.51]",S,english,321.57
24340,08be3964ac87c44e_11,08be3964ac87c44e,"[192.99, 223.53, 200.56, 329.86, 186.45, 329.8...","[177.5, 223.53, 23.06, 106.33]",FROSTBITE,english,2451.97
72896,17339,144091,"[56.3, 227.2, 65.9, 236.2, 73.6, 226.6, 85.8, ...","[56.3, 200.3, 119.1, 35.9]",FINCHINGFIELD,english,1547.08
627992,6c6f0148f5759ecd_8,6c6f0148f5759ecd,"[612.52, 552.04, 637.95, 466.9, 675.55, 481.28...","[612.52, 466.9, 63.03, 101.72]",AND,english,6411.41
177333,123551,412575,"[80.8, 228.1, 96, 226.5, 94.4, 232.9, 81.6, 23...","[80.8, 226.5, 15.2, 6.4]",DB,english,78.08


In [13]:
def _image_path(image_id, root):
    """Build the file path of the image `image_id` under the dataset root `root`."""
    key = str(image_id)

    # COCO-text image IDs are integers, TextOCR image IDs are strings.
    # A key without letters is taken to be a COCO-text ID, unless it is longer
    # than the 12-digit number field of the COCO file name: such a key cannot be
    # a COCO ID, so an all-digit TextOCR ID is still routed to TextOCR.
    is_coco = re.search('[a-zA-Z]', key) is None and len(key) <= 12

    if is_coco:
        # Zero-pad the ID itself to 12 digits. Padding the '{}.jpg' template
        # instead only gives a 12-digit name when the ID has exactly six digits.
        return root + '/COCO-text/train2014/COCO_train2014_' + key.zfill(12) + '.jpg'
    return root + '/textOCR/train + validation/train/' + key + '.jpg'


def show_image(image_id):
    """Open the image belonging to `image_id` in the default image viewer.

    Parameters
    ----------
    image_id : int or str
        A COCO-text image ID (integer, e.g. 217925) or a TextOCR image ID
        (string, e.g. 'a7ad2bcb93d48576').

    The file is looked up under the module-level `usb_dir`. Errors raised by
    `Image.open` (for example when the file does not exist) are not caught.
    """
    # The context manager closes the underlying file once the viewer was launched
    with Image.open(_image_path(image_id, usb_dir)) as image:
        image.show()

In [14]:
# Example: COCO-text image, looked up by its integer ID
show_image(217925)

In [15]:
# Example: TextOCR image, looked up by its string ID
show_image('a7ad2bcb93d48576')