#### LIBRARIES

In [1]:
import random
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path

#### UTILITIES

In [2]:
def load_json(path):
    """
    Load a JSON file given a path.

    The file is always decoded as UTF-8 instead of the platform's default
    locale encoding, so non-ASCII content is read the same way everywhere.

    Parameters
    ----------
    path : str or os.PathLike
        The path to the JSON file.

    Returns
    -------
    data
        The content of the JSON file, as parsed by ``json.load``.
    """

    # A bare open() falls back to the locale's preferred encoding (for
    # example cp1252 on Windows), which garbles or rejects non-ASCII JSON.
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)

#### DATA

In [3]:
# Dataset root: an "AI4Code" folder inside the current working directory.
data_dir = Path.cwd().joinpath("AI4Code")
# The train files live in the "train" sub-folder of the dataset root.
train_dir = data_dir.joinpath("train")

In [4]:
# Both metadata tables are CSV files stored directly under the dataset root.
ancestors_csv = data_dir.joinpath("train_ancestors.csv")
orders_csv = data_dir.joinpath("train_orders.csv")

train_ancestors = pd.read_csv(ancestors_csv)
train_orders = pd.read_csv(orders_csv)

#### A LOOK AT SAMPLE TRAIN DATA

In [5]:
# Count the entries in the train folder. The ":," format spec inserts the
# thousands separators directly, so no nested str.format call is needed.
print(f"Train data files: {len(os.listdir(train_dir)):,}")

Train data files: 139,256


In [6]:
# Get a random file, reproducibly. os.listdir returns names in arbitrary
# order, so sort them first, then draw with a dedicated seeded generator so
# that a fresh kernel always picks the same sample (and the module-level
# random state is left untouched).
SAMPLE_SEED = 42
sample_file = random.Random(SAMPLE_SEED).choice(sorted(os.listdir(train_dir)))
sample_file

'6ea6ec463c5dfe.json'

In [7]:
# Parse the sampled file and list its top-level keys.
sample_path = train_dir.joinpath(sample_file)
sample = load_json(sample_path)
sample.keys()

dict_keys(['cell_type', 'source'])

In [8]:
# Check that 'cell_type' and 'source' are indexed by the same keys.
cell_type_keys = sample['cell_type'].keys()
source_keys = sample['source'].keys()
cell_type_keys == source_keys

True

In [16]:
# Show the cell type and the source stored under the first key.
first_key = list(sample['cell_type'])[0]
(sample['cell_type'][first_key], sample['source'][first_key])

('code',
 "import tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nimport tensorflow_addons as tfa\nfrom kaggle_datasets import KaggleDatasets\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport re\nimport os\nimport math\nimport random\nimport cv2\n\ntry:\n    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n    print('Device:', tpu.master())\n    tf.config.experimental_connect_to_cluster(tpu)\n    tf.tpu.experimental.initialize_tpu_system(tpu)\n    strategy = tf.distribute.experimental.TPUStrategy(tpu)\nexcept:\n    strategy = tf.distribute.get_strategy()")

In [17]:
# Labels: look up the cell order recorded for the sampled file.
sample_id = sample_file.split('.')[0]  # file name up to its first dot
sample_orders = train_orders.loc[train_orders['id'] == sample_id]
sample_orders['cell_order'].squeeze()

'934a8d45 bb934795 cf720344 4a5f0b63 cfae70e2 eadd3d40 8c2929ec 9e54e28a e0b01374 dc0ca82c 065e4075 92db3ceb a1f6ce69 594628fa 6199ece2 ccea337e 8ac58ade 73fd6c47 c0cc4c7d 416f75e9 ce3d282f 0eb60356 c69a36d7 f58f9581 9ccb3c04 a8f46cfa 1a823039 22948d33 b7b56332'

In [21]:
# Next question: what do the ancestors look like for the same file?
notebook_id = sample_file.split('.')[0]  # file name up to its first dot
sample_ancestors = train_ancestors.loc[train_ancestors['id'] == notebook_id]
sample_ancestors

Unnamed: 0,id,ancestor_id,parent_id
59984,6ea6ec463c5dfe,bceebd93,


In [26]:
# How often does each ancestor_id occur? Show the five most frequent.
ancestor_counts = train_ancestors['ancestor_id'].value_counts()
ancestor_counts.head(5)

4569bfc1    65
51dd4f97    49
021526f8    44
09489e94    43
c65a23e9    41
Name: ancestor_id, dtype: int64

In [28]:
# Inspect an ancestor_id that is referenced by more than one notebook.
# In the rows displayed, every notebook has the same parent_id.
shares_ancestor = train_ancestors['ancestor_id'] == '4569bfc1'
train_ancestors.loc[shares_ancestor]

Unnamed: 0,id,ancestor_id,parent_id
2613,04fdcde7f55004,4569bfc1,9d45e544c18f00
4449,0841913c81f268,4569bfc1,9d45e544c18f00
4918,0920005584e83d,4569bfc1,9d45e544c18f00
9248,1116c472def990,4569bfc1,9d45e544c18f00
10760,13c05cd8dde798,4569bfc1,9d45e544c18f00
...,...,...,...
134663,f793a077c860b1,4569bfc1,9d45e544c18f00
136204,fa5c26e0d7006c,4569bfc1,9d45e544c18f00
136260,fa7292da3a2d0f,4569bfc1,9d45e544c18f00
137076,fbf36be84a7e6e,4569bfc1,9d45e544c18f00
