# Load Data

Based on https://www.kaggle.com/code/durvalslompojunior/bmga-train-dataset-csv-meta/edit

In [None]:
from glob import glob
import PIL.Image as Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Collect the path of every annotation JSON file found in the training annotations folder.
annotations = glob("/kaggle/input/benetech-making-graphs-accessible/train/annotations/*.json")
# Bare tuple expression: the notebook shows the label and the number of files as the cell output.
"Total Training Samples", len(annotations)

In [None]:
import json
from tqdm import tqdm
def _load_annotation(path):
    """Parse one annotation JSON file and return its content."""
    # The `with` block closes the handle as soon as parsing finishes, so loading
    # many files does not leave descriptors open until garbage collection.
    with open(path, 'r', encoding='utf-8') as annotation_file:
        return json.load(annotation_file)


# Parsed annotations, in the same order as the paths in `annotations`.
annotations_data = [_load_annotation(path) for path in tqdm(annotations, total=len(annotations))]

In [None]:
# Use the first parsed annotation as a sample to inspect the structure of the data.
test = annotations_data[0]
# Top-level keys of the annotation.
print(test.keys())
# Keys of the 'plot-bb' entry.
print(test['plot-bb'].keys())
# Keys of the 'axes' entry.
print(test['axes'].keys())

In [None]:
import pandas as pd
# Start the training table with one row per annotation file, holding its full path.
train = pd.DataFrame(annotations, columns=['path'])

In [None]:
# Sample identifier: the annotation file name stripped of its directory and ".json".
train['name'] = train['path'].apply(lambda json_path: json_path.rsplit('/', 1)[-1].replace(".json", ""))
# Relative locations of the annotation and image files of each sample.
train['annotation'] = "annotations/" + train['name'] + ".json"
train['image'] = "images/" + train['name'] + ".jpg"
# Top-level annotation fields, one value per sample (same order as `annotations_data`).
train['source'] = [record['source'] for record in annotations_data]
train['chart-type'] = [record['chart-type'] for record in annotations_data]
# One column per 'plot-bb' entry; the key set is taken from the sample annotation `test`.
for key in test['plot-bb']:
    train[key] = [record['plot-bb'][key] for record in annotations_data]
# One column per 'axes' entry, again using the keys of the sample annotation.
for key in test['axes']:
    train[key] = [record['axes'][key] for record in annotations_data]
train['text'] = [record['text'] for record in annotations_data]
train['data-series'] = [record['data-series'] for record in annotations_data]
# Last expression of the cell: display the assembled DataFrame.
train

In [None]:
# Number of samples (non-null 'path' values) for each (source, chart-type) combination.
train.groupby(by=['source', 'chart-type'])['path'].count()

In [None]:
# Number of samples (non-null 'path' values) for each chart type.
train.groupby(by=['chart-type'])['path'].count()

In [None]:
# Persist the table to train.csv in the current working directory, without the index column.
# NOTE(review): columns holding nested objects (e.g. 'x-axis', 'text') are presumably written
# as their string representation — confirm this is acceptable if the CSV must be read back.
train.to_csv("train.csv", index=False)

In [None]:
# Root folder of the training split; the relative paths in train['image'] are appended to it.
root_dir = "/kaggle/input/benetech-making-graphs-accessible/train"

## Load a graph

In [None]:
# Pick one training sample, load its image as an array and show it titled with the sample name.
index = 10
graph = np.array(Image.open(f"{root_dir}/{train['image'][index]}"))
fig, ax = plt.subplots()
ax.set_title(label=train['name'][index])
ax.imshow(graph, cmap='gray')
plt.show()

In [None]:
# Plot-area bounding box of the selected sample as (x0, y0, width, height).
rect = tuple(train[column][index] for column in ('x0', 'y0', 'width', 'height'))
fig, ax = plt.subplots()
ax.set_title(label=f"{train['name'][index]}-chart")
ax.imshow(graph, cmap="gray")
# Outline the plot area in red on top of the image.
patch = patches.Rectangle(rect[:2], rect[2], rect[3], linewidth=2, edgecolor='r', facecolor='none')
ax.add_patch(patch)
plt.show()

In [None]:
# Mark every x-axis tick point of the selected sample with a small red square.
fig, ax = plt.subplots()
ax.set_title(label=f"{train['name'][index]}-x-axis-ticks")
width_step = 4/2  # half the side length of each marker square
ax.imshow(graph, cmap = "gray")
for xaxis in  train['x-axis'][index]['ticks']:
    # The square is anchored half a side before the tick point and spans a full side
    # (2 * width_step), so the tick point sits at its centre instead of on its corner.
    rect = (xaxis['tick_pt']['x']-width_step, xaxis['tick_pt']['y']-width_step, 2*width_step, 2*width_step)
    patch = patches.Rectangle((rect[0], rect[1]), rect[2], rect[3], linewidth=2, edgecolor='r', facecolor='none')
    ax.add_patch(patch)
plt.show()

In [None]:
# Mark every y-axis tick point of the selected sample with a small red square.
fig, ax = plt.subplots()
width_step = 4/2  # half the side length of each marker square
ax.imshow(graph, cmap = "gray")
ax.set_title(label=f"{train['name'][index]}-y-axis-ticks")
for yaxis in  train['y-axis'][index]['ticks']:
    # The square is anchored half a side before the tick point and spans a full side
    # (2 * width_step), so the tick point sits at its centre instead of on its corner.
    rect = (yaxis['tick_pt']['x']-width_step, yaxis['tick_pt']['y']-width_step, 2*width_step, 2*width_step)
    patch = patches.Rectangle((rect[0], rect[1]), rect[2], rect[3], linewidth=2, edgecolor='r', facecolor='none')
    ax.add_patch(patch)
plt.show()

In [None]:
# Outline every annotated text region of the selected sample with a red rectangle.
fig, ax = plt.subplots()
ax.imshow(graph, cmap = "gray")
ax.set_title(label=f"{train['name'][index]}-axis-data-and-chart-meta")
for text in  train['text'][index]:
    polygon = text['polygon']
    # Gather all corner coordinates so the box does not depend on which corner is listed
    # first; deriving the size from x1-x0 and y2-y1 alone collapses when the corners do
    # not start at the top-left (e.g. rotated text).
    # Assumes every polygon key starting with 'x' / 'y' is a corner coordinate — TODO confirm.
    xs = [value for key, value in polygon.items() if key.startswith('x')]
    ys = [value for key, value in polygon.items() if key.startswith('y')]
    # Axis-aligned bounding box as (left, top, width, height).
    rect = (min(xs), min(ys), max(xs) - min(xs), max(ys) - min(ys))
    patch = patches.Rectangle((rect[0], rect[1]), rect[2], rect[3], linewidth=2, edgecolor='r', facecolor='none')
    ax.add_patch(patch)
plt.show()

In [None]:
# Cell output: the chart type and the data series of the selected sample, as a tuple.
train['chart-type'][index], train['data-series'][index]

# Resize images

In [None]:
# Cell output: summary statistics of the training table.
train.describe()

In [None]:
from PIL import Image
import PIL
import os
import glob

# Open one training image and report its dimensions in pixels.
index = 1
image = Image.open(f"{root_dir}/{train['image'][index]}")
width, height = image.size
print(f'index: {index}')
print(f'width: {width}')
print(f'height: {height}')

In [None]:
# Cell output: display the opened image.
image

In [None]:
# Target width in pixels; the height is scaled by the same factor to keep the aspect ratio.
base_width = 360

width_percent = base_width / float(image.size[0])
hsize = int(float(image.size[1]) * float(width_percent))
# Resample with the LANCZOS filter, then convert the result to mode 'L' (grayscale).
image = image.resize((base_width, hsize), PIL.Image.LANCZOS).convert('L')

In [None]:
# Cell output: display the image held in `image` at this point of the notebook.
image 

In [None]:
# Write the current `image` to the working directory under this file name.
image.save('resized_compressed_image.jpg')

In [None]:
# Create the output folder for the resized images. `exist_ok=True` makes the cell safe to
# re-run: without it a second execution raises FileExistsError.
os.makedirs('/kaggle/working/train/images', exist_ok=True)

In [None]:
def resize_and_grayscale_image(root_dir, file_path, save=True, base_width=360,
                               output_dir='/kaggle/working/train'):
    """Resize an image to a fixed width, convert it to grayscale and optionally save it.

    The height is scaled by the same factor as the width, so the aspect ratio
    is preserved (up to integer truncation of the height).

    Args:
        root_dir: Directory that ``file_path`` is relative to.
        file_path: Path of the image relative to ``root_dir``. The same
            relative path is used below ``output_dir`` when saving.
        save: When true, write the processed image to disk.
        base_width: Width of the resulting image in pixels.
        output_dir: Directory the processed image is saved under. Defaults to
            the location previously hard-coded in this function.

    Returns:
        The resized image converted to mode ``'L'``.
    """
    # Local import keeps the function self-contained regardless of which cells ran before.
    import os

    # The context manager closes the source file once the resized copy has been built;
    # `resize` returns a new image that no longer depends on the open file.
    with Image.open(f"{root_dir}/{file_path}") as source:
        width_percent = base_width / float(source.size[0])
        # Never let the height truncate to 0 for extremely wide images.
        hsize = max(1, int(float(source.size[1]) * width_percent))
        image = source.resize((base_width, hsize), PIL.Image.LANCZOS).convert('L')

    if save:
        target_path = f"{output_dir}/{file_path}"
        target_dir = os.path.dirname(target_path)
        if target_dir:
            # Create missing sub-folders (e.g. "images/") instead of failing on save.
            os.makedirs(target_dir, exist_ok=True)
        image.save(target_path)
    return image

In [None]:
# Process every training image; the function is called with its default `save=True`,
# so each resized grayscale copy is written to disk. The returned image is discarded.
for file_path in train['image']:
    resize_and_grayscale_image(root_dir, file_path)

# Predicting Chart Types

In [None]:
# Next steps...

# Predicting X and Y and OCR

In [None]:
# Next steps...