In [None]:
from PIL import Image, ImageDraw, ImageFont
from os import listdir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

First, in order to visualise the dataset, we need a font that can display the full range of Japanese characters. We're using [Noto Sans](https://en.wikipedia.org/wiki/Noto_fonts), an open-source font by Google that can display almost all of the characters used in this competition.

In [None]:
# Point size used to load the annotation font; the drawing functions further
# down also use it to offset the rendered text from each box or point.
fontsize = 50

# From https://www.google.com/get/noto/
# Download the Noto Sans CJK JP archive (-q silences wget's log output,
# --show-progress keeps only the progress bar).
!wget -q --show-progress https://noto-website-2.storage.googleapis.com/pkgs/NotoSansCJKjp-hinted.zip
# -p streams the named archive member to stdout, which is redirected into a
# local .otf file in the working directory.
!unzip -p NotoSansCJKjp-hinted.zip NotoSansCJKjp-Regular.otf > NotoSansCJKjp-Regular.otf
# The archive is not needed once the font file has been extracted.
!rm NotoSansCJKjp-hinted.zip

# Load the extracted font at the chosen size for use with ImageDraw text calls.
# NOTE(review): Pillow's documented `encoding` values are names such as "unic";
# confirm that 'utf-8' is intended here.
font = ImageFont.truetype('./NotoSansCJKjp-Regular.otf', fontsize, encoding='utf-8')

# Visualising the training data
You'll notice that some of the characters "off to the side" of 
columns in the text aren't annotated in the training set. 
These characters are annotations and not part of the main text of the books, 
so they shouldn't be transcribed by your model.

In [None]:
# Training annotations; later cells unpack each row as (image id, label string).
df_train = pd.read_csv('../input/train.csv')

# Two-column translation table turned into a lookup: the first column becomes
# the key (codepoint) and the second the value (character).
unicode_map = dict(pd.read_csv('../input/unicode_translation.csv').values)

In [None]:
def visualize_training_data(image_fn, labels):
    """Draw the annotated bounding boxes and their characters over a page image.

    Args:
        image_fn: Path of the image file to open.
        labels: Annotations in the string format given in train.csv:
            whitespace-separated groups of five tokens, ``codepoint x y w h``.
            Anything that is not a string (for example the NaN pandas yields
            for an empty CSV cell) or a blank string means "no annotations".

    Returns:
        The composited image as an RGB numpy array. With no annotations this
        is simply the page itself.

    Raises:
        ValueError: If the number of tokens is not a multiple of five, or a
            coordinate token is not an integer literal.
        KeyError: If a codepoint is missing from ``unicode_map``.
    """
    # Only real strings can be split; a missing annotation arrives as a float
    # NaN and must not crash the visualisation.
    tokens = labels.split() if isinstance(labels, str) else []
    if len(tokens) % 5 != 0:
        raise ValueError(
            'expected groups of 5 label tokens (codepoint x y w h), got {} tokens'.format(len(tokens)))
    annotations = [tokens[i:i + 5] for i in range(0, len(tokens), 5)]

    # Read image. convert() returns a fully loaded copy, so the underlying
    # file handle can be released as soon as the conversion is done.
    with Image.open(image_fn) as page:
        imsource = page.convert('RGBA')

    # Separate canvases for boxes and chars so a box doesn't cut off a character
    bbox_canvas = Image.new('RGBA', imsource.size)
    char_canvas = Image.new('RGBA', imsource.size)
    bbox_draw = ImageDraw.Draw(bbox_canvas)
    char_draw = ImageDraw.Draw(char_canvas)

    for codepoint, x, y, w, h in annotations:
        x, y, w, h = int(x), int(y), int(w), int(h)
        char = unicode_map[codepoint]  # Convert codepoint to actual unicode character

        # Red outline with a fully transparent fill around the character...
        bbox_draw.rectangle((x, y, x + w, y + h), fill=(255, 255, 255, 0), outline=(255, 0, 0, 255))
        # ...and the character itself in blue, just to the right of the box.
        char_draw.text((x + w + fontsize / 4, y + h / 2 - fontsize), char, fill=(0, 0, 255, 255), font=font)

    # Layer order: page, then boxes, then characters on top.
    imsource = Image.alpha_composite(Image.alpha_composite(imsource, bbox_canvas), char_canvas)
    imsource = imsource.convert("RGB")  # Remove alpha for saving in jpg format.
    return np.asarray(imsource)

In [None]:
# Seed NumPy's global RNG so the same ten pages are drawn on every run.
np.random.seed(1337)

for i in range(10):
    # Pick one training row at random; each row unpacks into the image id and
    # its label string.
    row = np.random.randint(len(df_train))
    img, labels = df_train.values[row]
    image_path = '../input/train_images/{}.jpg'.format(img)
    viz = visualize_training_data(image_path, labels)

    # One large figure per page, titled with the image id.
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.set_title(img)
    ax.imshow(viz, interpolation='lanczos')
    plt.show()

# Visualising predictions
For the test set, 
you're only required to predict a single point within each bounding box instead of the entire bounding box 
(ideally, the centre of the bounding box). It may also be useful to visualise the box centres on the image:

In [None]:
def visualize_predictions(image_fn, labels):
    """Draw predicted character positions and their characters over a page image.

    Args:
        image_fn: Path of the image file to open.
        labels: Predictions in the string format given in a submission csv:
            whitespace-separated groups of three tokens, ``codepoint x y``.
            Coordinates may be integer or decimal literals; they are rounded
            to the nearest pixel. Anything that is not a string (for example
            the NaN pandas yields for an empty CSV cell) or a blank string
            means "no predictions".

    Returns:
        The composited image as an RGB numpy array. With no predictions this
        is simply the page itself.

    Raises:
        ValueError: If the number of tokens is not a multiple of three, or a
            coordinate token is not a number.
        KeyError: If a codepoint is missing from ``unicode_map``.
    """
    marker_half_size = 10  # half the side length of the square drawn at each predicted point

    # Only real strings can be split; a missing prediction arrives as a float
    # NaN and must not crash the visualisation.
    tokens = labels.split() if isinstance(labels, str) else []
    if len(tokens) % 3 != 0:
        raise ValueError(
            'expected groups of 3 label tokens (codepoint x y), got {} tokens'.format(len(tokens)))
    predictions = [tokens[i:i + 3] for i in range(0, len(tokens), 3)]

    # Read image. convert() returns a fully loaded copy, so the underlying
    # file handle can be released as soon as the conversion is done.
    with Image.open(image_fn) as page:
        imsource = page.convert('RGBA')

    # Separate canvases for markers and chars so a marker doesn't cut off a character
    bbox_canvas = Image.new('RGBA', imsource.size)
    char_canvas = Image.new('RGBA', imsource.size)
    bbox_draw = ImageDraw.Draw(bbox_canvas)
    char_draw = ImageDraw.Draw(char_canvas)

    for codepoint, x, y in predictions:
        # Go through float first: int() rejects decimal strings such as
        # '1082.49', which is how model outputs are commonly written.
        x, y = int(round(float(x))), int(round(float(y)))
        char = unicode_map[codepoint]  # Convert codepoint to actual unicode character

        # Solid red square centred on the predicted point, with the predicted
        # character drawn in red to its right.
        bbox_draw.rectangle(
            (x - marker_half_size, y - marker_half_size, x + marker_half_size, y + marker_half_size),
            fill=(255, 0, 0, 255))
        char_draw.text((x + 25, y - fontsize * (3 / 4)), char, fill=(255, 0, 0, 255), font=font)

    # Layer order: page, then markers, then characters on top.
    imsource = Image.alpha_composite(Image.alpha_composite(imsource, bbox_canvas), char_canvas)
    imsource = imsource.convert("RGB")  # Remove alpha for saving in jpg format.
    return np.asarray(imsource)

In [None]:
# Page to annotate, addressed relative to the competition input directory (the
# same '../input/train_images/' location the training-data cell reads from)
# instead of a machine-specific absolute path.
image_fn = '../input/train_images/hnsd006-026.jpg'
pred_string = 'U+3066 1082.4912944592927 1551.2399131373356 U+304D 516.5585166529605 1867.290472733347 U+304D 516.6431949013158 1866.7659076891448 U+305D 334.1691830283717 1854.9404425370067 U+3082 902.0873380962171 788.5131193462171 U+3066 709.6436831825658 720.9947445518092 U+5FC3 1471.442935341283 1403.5103567023027 U+3082 902.5978008069491 788.463712993421 U+5FC3 905.5811189350329 680.8372738486842 U+6D25 715.0991339432566 1467.908357319079 U+3061 716.0733835320724 1326.5323679070723 U+306E 338.72169896175984 1580.6814093338817 U+706B 568.5356702302631 1289.1267475328948 U+3066 1082.7045641447369 1551.2111944901317 U+7D05 1461.4539859169408 1895.2937718441613 U+5FC3 905.5069612201892 680.9964792351974 U+7D05 1461.4731316817433 1895.5135947779606 U+7ACB 515.836824115954 787.9830771998355 U+3082 511.7076351768092 340.48397666529604 U+4ECB 1086.6103001644738 1673.3147229646381 U+9CE5 332.87880345394734 800.802580180921 U+6CBB 568.228759765625 2349.2572985197367 U+3082 1276.5168842516448 2019.1836708470394 U+30CF 704.661287006579 1152.8757195723683 U+3092 333.3825041118421 1318.0553877981085 U+7D66 710.8341899671053 2246.4308568050988 U+30A6 1274.0473375822369 1926.1269659745067 U+885B 715.800138774671 1054.3504574424342 U+305D 334.796142578125 1856.2845491108142 U+306E 909.4413998252467 328.87303402549344 U+3082 1087.7720883018092 425.33938759251646 U+3066 514.1645572060033 1184.2181396484375 U+6CBB 706.4490388569079 843.2766241776316 U+4ECB 1087.2773180509869 1673.216616981908 U+6301 513.8984439247533 1583.1255139802631 U+5C0F 699.2000539679276 2406.0721628289475 U+3042 1276.0750539679277 2130.614142166941 U+8A00 710.7784873560855 2044.0163702713817 U+3082 1276.2367007606908 2019.9739476254113 U+7406 1093.5524709601152 698.4517629523026 U+7533 518.9016884251645 1978.0189915707238 U+3082 1088.2112201891448 425.2938682154605 U+30CF 704.2988024259869 1153.6429636101973 U+3068 717.8103155838816 1930.2616159539473 U+7CF8 908.9564273231908 460.125732421875 U+3066 
709.3583839818051 721.5186189350329 U+4EBA 1094.5570775082238 913.0756578947369 U+7533 518.7522486636514 1978.4248753597863 U+3064 516.0226279810855 1779.8088635896381 U+3061 715.8499065198397 1326.4530222039473 U+306E 910.2247378700658 328.9093017578125 U+6885 138.2958341899671 357.03182822779604 U+7406 1093.0990439967106 698.1690738075658 U+3042 519.0131900185033 903.4623637952303 U+4EBA 1095.26123046875 913.0814401726974 U+65AF 904.6397640830592 911.5132221422697 U+75C5 1277.2453870271381 643.1733141447369 U+306A 1097.6574385793585 1783.0656995271381 U+7FA9 1092.3650159333881 564.152189555921 U+3089 335.6881553248355 651.444091796875 U+65AF 1460.2524928042762 2018.9179430509869 U+308A 519.2234079461349 645.7797723067434 U+5916 1467.5666246916119 1183.7703022203948 U+591A 893.3047003495066 1741.414730674342 U+3068 1476.2702379728619 1057.6042737458881 U+30A6 1273.9613743832238 1924.2927631578948 U+885B 717.5569233141447 1054.219890393709 U+308B 1474.9799547697369 613.3573833264803 U+9B5A 893.1229280170642 1738.819644325658 U+305E 896.1956787109375 2500.6577019942433 U+898B 142.00025056537828 2116.154656661184 U+5916 1466.940853721217 1185.9384315892269 U+5473 1085.2808580900494 2406.681036698191 U+7533 331.5384071751645 1752.133146587171 U+305E 896.3808722245066 2500.6850071957238 U+7CF8 908.6851742393092 460.75343081825656 U+6F2C 715.4025107935855 1052.1867290296052 U+5F80 710.5441766036184 1766.2526983963817 U+65AF 905.0663677014803 911.1696905838816 U+9152 903.0434377569901 1633.477204975329 U+65AF 1460.7020327919408 2017.4794407894738 U+6625 126.56360023900082 1178.9476575349506 U+5473 1085.4190224095394 2407.128970497533 U+3093 717.340087890625 1852.9627749794408 U+6368 1270.4909796463817 1654.0346808182567 U+6D25 715.7232264468545 1466.463044819079 U+306A 1097.6250256990131 1782.704435649671 U+8A00 710.9029348273026 2042.4533562911183 U+308A 721.4456979851974 625.8510870682566 U+304B 331.67576840049344 2041.5238229851973 U+307E 907.835115131579 
1976.791927939967 U+3068 718.1916246916119 1929.5443564967106 U+5C0F 1466.2135716488488 2212.499935752467 U+3068 1093.5676655016448 802.44140625 U+3092 907.899748149671 1855.0790244654606 U+7740 896.6115851151316 2395.327212685033 U+3066 514.5893618935033 1185.076840049342 U+8272 894.4642880088404 1739.358239424856 U+8272 895.127595600329 1740.3805863229852 U+8A9E 1091.2972219366777 1263.7293765419408 U+3082 1093.7237548828125 2073.2401958264804 U+8FBA 516.0651919716283 1480.189787212171 U+3064 517.2871157997533 1778.4813489412006 U+8A00 708.8605378803454 2042.946134868421 U+307E 516.1130563836349 2068.3125706722863 U+304A 517.5923879523026 1694.324951171875 U+3057 904.2428428248355 1260.7447895250823 U+6CBB 706.8184862638775 841.9423956620066 U+306A 339.80953818873354 541.6703716077303 U+6625 700.1854183799342 2500.984015213816 U+7740 896.1893182051809 2395.2226177014804 U+6DF1 1456.5781121504933 2497.2016987047696 U+3057 904.541015625 1260.8499627364308 U+8A00 709.2402006450452 2043.6323948910363 U+305D 327.5954075863487 351.83882863898026 U+3065 1284.2921849300988 822.9420230263158 U+76EE 1096.9633082339637 1020.4916863692434 U+306F 136.451416015625 2510.3159693667762 U+304B 340.1655980160362 1946.9453510485198 U+3082 1479.197933799342 719.690583881579 U+9152 903.1192819695724 1633.7614360608552 U+3064 904.124177631579 1110.4356143349096 U+305A 1099.3813283819902 1983.5384971217106 U+8DE1 511.10701711554276 475.2226498252467 U+305B 515.7811215049342 2152.6224557976975 U+3082 1470.8983090049342 1498.8071160567433 U+3088 518.0453009354441 562.8365928248355 U+7533 332.1898129111842 1753.2416574578535 U+307E 906.4118234734786 1979.320068359375 U+7FA9 1094.5552785773027 563.7062474300986 U+5F8C 335.1133326480263 938.516267475329 U+4EBA 137.6297478926809 2414.0288342927633 U+306F 130.99700927734375 1285.2898206208881 U+725B 905.1322214226974 2096.1049290707238 U+3082 1093.8566509046052 2073.477879574424 U+3066 1462.7731805098683 2116.339978669819 U+306B 
1472.2031763980262 1288.0656352796052 U+306A 1098.6502878289473 1781.1280581825658 U+4ECB 1085.881990131579 1671.5829307154606 U+307E 906.6069592927631 1978.1027703536183 U+9854 133.69924444901315 485.4395816200658 U+305F 1470.7798365542762 541.9966205797697 U+305F 337.99952456825656 1497.6207211143092 U+3092 144.60826673005758 2003.4012643914473 U+308C 903.1840434827303 1015.3769402754934 U+306A 1098.6508018092106 1781.9300922594573 U+308C 903.177490234375 1014.8756167763158 U+30CF 1447.7938039679277 2405.5403217516446 U+3055 1465.670166015625 1724.1139622738488 U+6C34 709.9304841694079 1576.512386924342 U+308A 1092.6137823807567 1449.527523643092 U+307E 516.366545024671 2068.5861687911183 U+8EAB 1267.1989360608552 347.6134611430921 U+3086 710.1867354543585 2245.894646895559 U+5A66 1091.8107524671052 1263.4482293379933 U+3068 1093.8572933799342 801.8844443873355 U+4ECB 1086.2880987870067 1671.6547032406456 U+3066 132.0623136821546 955.5069772820724 U+3068 1273.6213121916119 1756.6956851356908 U+3092 908.6309171977796 1853.9365105879933 U+3068 1475.189208984375 1058.873869243421 U+307E 138.15092387952302 731.6530890213816 U+30A4 515.4012900904605 2523.1772974917762 U+898B 142.65027497944078 2114.90478515625 U+3042 718.8565224095395 527.2592644942434 U+3082 1093.9617919921875 2072.474429481908 U+3064 904.5846396998355 1110.3124678762335 U+3068 718.367028487356 1929.3131617495887 U+3061 1286.4645867598683 928.3352179276316 U+3092 1097.6076788651317 1124.7738486842106 U+885B 714.2560376619037 1053.8371196546052 U+304A 339.7388337787829 1419.6309942948192 U+308C 903.9035676655017 1013.9545962685033 U+305A 1099.9431409333881 1982.6234837582238 U+3068 718.2435045744244 1929.7556023848683 U+3070 911.1824115953947 1353.7711053145558 U+5F80 709.4050678453947 1765.5051141036183 U+308A 912.5972707648026 2276.8598375822367 U+5F80 710.9590871710526 1764.6411614668996 U+7DCF 1093.9174290707238 2510.1553505345396 U+3070 910.8781995271381 1353.5954525596217 U+3093 
716.9429096422697 1855.6077495374177 U+3066 1462.1475380345394 2116.460089432566 U+308B 579.443359375 1345.1422761615954 U+308A 1092.1414987664473 1450.4476768092106 U+6851 325.11561343544406 1086.5265856291119 U+3092 1098.7043842516448 1123.9520906147204 U+304B 331.8837376644737 2042.1421412417762 U+3068 1086.4086592824835 339.37888697574016 U+3092 1097.9195685135692 1124.0010793585527 U+3093 715.3810360557154 1852.762515419408 U+308A 912.5635729337994 2277.5212659333883 U+3048 1274.5212273848683 1392.9736649362665 U+3055 1282.8543251439144 458.41616981907896 U+308C 904.3056769120066 1014.2016440943668 U+308B 1275.527986225329 1831.6828998766448 U+6625 126.87867817125823 1179.502595600329 U+304A 517.5892398231908 1695.2866403680098 U+3068 1086.170718544408 339.9405067845395 U+30CF 704.4187324925473 1153.3937634919819 U+7DCF 1093.9943976151317 2509.8869885896383 U+5C0F 699.1992709511205 2406.1623021175988 U+308C 1466.8422337582238 335.6342195209704 U+305A 1099.7588147615131 1982.5263414884869 U+76EE 1097.3900082236842 1019.7116570723684 U+5C0F 137.25290398848685 1056.4408151726973 U+6625 1459.4621839021381 2314.1363846628287 U+305A 1099.9928042763158 1983.4787469161183 U+306E 1480.9937808388158 806.6030401932566 U+306F 899.588623046875 1516.068693462171 U+3092 1097.4099249588817 1124.733244243421 U+308A 720.3950500488281 626.7439350328947 U+5C0F 137.03426963404604 1055.0395443564967 U+30CF 704.0902709960938 1153.9228258634869 U+3084 1478.735929790296 971.801115337171 U+304B 339.46921900699016 1946.5726511101973 U+3093 345.21060341282896 432.99759714226974 U+306B 141.13597669099508 609.9349172491776 U+76EE 1095.9783935546875 1019.522705078125 U+307E 906.6044697008634 1979.309724506579 U+6625 702.9624657881888 2501.2248792146383 U+306F 899.232755962171 1516.0322008634869 U+307F 337.65419407894734 1658.7850791529606 U+3092 145.37628173828125 2004.0010793585527 U+91CC 515.9712299547697 1080.9034969932154 U+3092 136.08037045127466 1529.0874280427631 U+3092 
907.7810829564145 1855.513257478413 U+5185 336.0538844058388 1213.0079731188323 U+306E 1468.1562885485198 1590.755679481908 U+3084 1478.6477821751644 973.5096419484992 U+3092 908.5457570929276 1855.3419253700658 U+3061 133.79698100842927 860.8409359580592 U+96E8 325.5032509251645 2232.66845703125 U+3082 905.9367290296053 574.3771201685855 U+3042 718.1997279116982 526.7445132606908 U+3057 1475.1308079769738 444.9873111122533 U+3075 705.6791606702303 1667.4754574424342 U+308A 1461.4456979851973 1809.3007298519738 U+56DB 140.73836477179276 1802.2342722039473 U+3055 1465.7520816200658 1723.0807334498356 U+5F80 708.5525271767065 1764.7904084858142 U+3044 1267.3277523643092 2453.1280838815787 U+3082 1093.824398643092 2074.4833213404604 U+3086 340.56190892269734 2377.47802734375 U+304C 528.9605712890625 982.1078973067434 U+3068 1273.552824321546 1757.7347926089637 U+305F 902.444490131579 2194.8129754317433 U+308B 1275.692202919408 1831.0555548416942 U+305F 706.1995656866776 1236.6441264905427 U+76EE 1095.7761101973683 1019.2014272589432 U+3078 1268.7333598889802 2517.4469315378287 U+306B 1482.4187268708881 889.4474069695724 U+3051 142.18446430407073 1706.7092413651317 U+3042 1276.671078330592 2129.314093338816 U+305F 706.4756052117599 1235.7685932360198 U+304E 1272.6707056949012 2293.4757915296054 U+307E 513.7980250308389 1084.5474403782894 U+6C34 710.6319467644943 1575.8243600945723 U+3046 520.8293392783717 2244.5624743009867 U+3089 1095.1204320004113 1880.1529733758223 U+308B 1285.7620399876644 753.3020662006579 U+3051 141.7184769479852 1706.148247969778 U+3064 904.3210320723684 1107.4259868421052 U+308A 1460.9303685238488 1810.3577945106908 U+30F2 522.7592709189967 2462.2928659539475 U+8FBA 136.32436651932565 1892.7779990748356 U+725B 905.5714175575658 2094.742367393092 U+4EBA 711.1490671258224 2185.2338610197367 U+4E09 1089.2656506990131 2330.692202919408 U+3093 715.4842979029605 1850.1756848787006 U+3064 1275.0446520353619 1479.8360402960527 U+3072 1095.7631643194902 
2169.9404425370067 U+3066 131.89095748098273 953.1322358783923 U+3064 905.1888074372944 1107.8329307154606 U+305F 902.9952039216694 2194.3838019120067 U+3070 1287.6667865953948 1283.4055047286183 U+305F 712.8906892475329 2129.4318590666117 U+3089 1094.8872777035363 1880.5741802014802 U+3068 1281.9822933799342 1009.7676568282278 U+6851 327.2810765316612 1084.6400853207238 U+3078 710.7505316483347 2185.6015496504933 U+3068 1283.428312602796 1012.5192742598684 U+8FBA 136.6832853618421 1893.7132182874177 U+308B 146.6060437654194 2213.437564247533 U+3082 905.4520295795642 574.453703227796 U+56DB 140.3250443307977 1803.8236919202302 U+3092 335.3596255653783 2521.5279348273025 U+4E09 1089.4585217927631 2330.4913651315787 U+304A 1292.8885048314144 1196.0054096422698 U+3078 340.8290501644737 2317.8311317845396 U+3062 1284.9663342927631 936.9234667326275 U+3089 1093.6265162417762 1880.8183850740131 U+3075 705.3466796875 1667.512785259046 U+304C 529.0720407586349 983.0389966462787 U+304B 1466.6996684827302 1655.9707159745067 U+3075 706.8586570338199 1667.0728864167866 U+305F 712.4049136513158 2128.680516293174 U+304B 1090.3290437397204 1372.0255319695723 U+3068 1271.2021355879933 2364.6295487253287 U+3057 901.7612176192434 1439.3221242804277 U+3089 716.8220600328947 358.75452945106906 U+307F 338.17967465049344 1658.055399844521 U+304A 1292.9927143297698 1197.4222604851973 U+3057 901.9377216539884 1439.0163702713817 U+3031 1284.440982216283 1090.6708727384869 U+5185 335.5763003700658 1211.1588969983552 U+304B 1090.4720908717106 1371.4845356188323 U+305F 706.184724506579 1233.292172080592 U+305F 706.7488499691611 1233.3010382401317 U+306B 707.727629009046 1667.2307385896381 U+6C34 133.5628790604441 1406.1084305612665 U+3072 1092.52197265625 2170.2349532277963 U+65AF 903.6528577302631 930.668178357576 U+6368 1270.533383018092 1663.6147589432567 U+3031 1284.5628597861842 1090.2335397820723 U+305B 517.3847078022204 2143.8057668585525 U+65AF 903.4485023900082 930.8463247198807 
U+305F 713.0338969983553 2129.6747789884867 U+304F 705.7869037828947 2321.1808696546054 U+3078 1283.6342259457238 531.0171026932566 U+7D19 335.54019325657896 948.700280440481 U+3089 1093.312345805921 1879.5036878083881 U+30FD 907.1244731702303 1171.1378238075658 U+753A 519.6291311163651 924.3524250231291 U+767D 716.2455428274054 357.9248046875 U+305F 712.7877646998355 2129.561703330592 U+304B 1467.3957262541119 1654.7738446687397 U+725B 1093.4792608963817 1238.8223427220394 U+6597 1257.6942845394738 2130.0275621916117 U+304B 137.52727307771383 1623.1197959498356 U+725B 1093.0674663342927 1238.605314555921 U+304F 907.0872738486842 1170.8194130345394 U+3059 1264.0010793585527 2025.5779065583881 U+3055 1262.4492444490131 916.0238486842105 U+304F 706.6492582622327 2319.8507529810854 U+306B 133.5525833932977 2315.1559930098683 U+725B 1249.9497584292762 1649.4282612047698'
# Annotate the chosen page with the predicted points and characters.
viz = visualize_predictions(image_fn, pred_string)

plt.figure(figsize=(15, 15))
plt.title(image_fn)  # identify the page, as the training-data plots do
plt.imshow(viz, interpolation='lanczos')
plt.show()  # render explicitly so the cell does not echo the AxesImage repr