In [None]:
import os

from dotenv import load_dotenv, find_dotenv

# Load the variables defined in the located .env file into the process environment.
load_dotenv(find_dotenv())

# Data directories used by the loading cells below.
PATH_DATA_INTERIM = os.getenv("PATH_DATA_INTERIM")
PATH_DATA_RAW = os.getenv("PATH_DATA_RAW")

# os.getenv returns None for an unset variable; fail here with a clear message
# instead of letting a later Path(None) raise an unrelated-looking TypeError.
_missing_paths = [
    name
    for name, value in (
        ("PATH_DATA_INTERIM", PATH_DATA_INTERIM),
        ("PATH_DATA_RAW", PATH_DATA_RAW),
    )
    if not value
]
if _missing_paths:
    raise RuntimeError(
        "Missing environment variable(s): {}. Define them in the .env file.".format(
            ", ".join(_missing_paths)
        )
    )

In [None]:
import pickle
from pathlib import Path

import pandas as pd
from IPython.display import display


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only use this on pickle files produced by this project's own pipeline.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Load the labeling data for decoding purposes.
grapheme_train = pd.read_csv(Path(PATH_DATA_RAW) / "train.csv")
# A bare expression in the middle of a cell is discarded, so show the preview explicitly.
display(grapheme_train.head())

# Load the training data, ~5GB
data_train = _load_pickle(Path(PATH_DATA_INTERIM) / "train_data.p")

# Load the validation data, about 1.3GB
data_val = _load_pickle(Path(PATH_DATA_INTERIM) / "val_data.p")

In [None]:
# Read the class map; it is the lookup table used to decode integer labels
# back into grapheme components.
import pandas as pd
_class_map_path = Path(PATH_DATA_RAW).joinpath("class_map.csv")
grapheme_classes = pd.read_csv(_class_map_path)
grapheme_classes

In [None]:
# Take one training sample (index 2) and pull out its label entry,
# which sits at position 1 of the sample.
_sample = data_train[2]
list_labels = _sample[1]
list_labels

In [None]:
def get_components(input_list: list, class_map=None):
    """Decode a label triple into its three grapheme component values.

    Parameters
    ----------
    input_list : sequence of length 3
        Labels ordered as ``[grapheme_root, vowel_diacritic, consonant_diacritic]``.
    class_map : pandas.DataFrame, optional
        Lookup table with ``component_type``, ``label`` and ``component``
        columns. Defaults to the notebook-level ``grapheme_classes`` frame.

    Returns
    -------
    list
        The ``component`` value of the first matching row for the root, vowel
        and consonant labels, in that order.

    Raises
    ------
    AssertionError
        If ``input_list`` does not hold exactly three labels.
    IndexError
        If a label has no row for its component type in ``class_map``.
    """
    assert len(input_list) == 3, \
        "expected [grapheme_root, vowel_diacritic, consonant_diacritic]"
    if class_map is None:
        # Resolved at call time so the notebook-level table is picked up.
        class_map = grapheme_classes

    component_types = ("grapheme_root", "vowel_diacritic", "consonant_diacritic")
    components = []
    for component_type, label in zip(component_types, input_list):
        matches = class_map[
            class_map.component_type.eq(component_type) & class_map.label.eq(label)
        ]
        if matches.empty:
            # Same exception type .iloc[0] would raise, but naming the culprit.
            raise IndexError(
                f"no {component_type} entry with label {label!r} in the class map"
            )
        components.append(matches.iloc[0].component)
    return components
# Decode the labels held in list_labels; as the last expression of the cell,
# the returned list is shown as the cell output.
get_components(list_labels)

In [None]:
# Collect the label entry (position 1) of every training sample into a table.
# Later cells filter classes_train by the grapheme_root / vowel_diacritic /
# consonant_diacritic columns, so it has to be a DataFrame rather than a list.
# Column order follows the label order used by get_components.
classes_train = pd.DataFrame(
    [sample[1] for sample in data_train],
    columns=["grapheme_root", "vowel_diacritic", "consonant_diacritic"],
)


In [None]:
# The original bare `.filter()` call always raised (a list has no such method,
# and DataFrame.filter() needs an argument). Preview the first entries instead;
# slicing works whether classes_train is a list or a DataFrame.
classes_train[:5]

In [None]:
# Split the class map by component type; the sizes of these tables set the
# slider ranges of show_count below.
total_root = grapheme_classes[grapheme_classes.component_type.eq("grapheme_root")]
total_vowel = grapheme_classes[grapheme_classes.component_type.eq("vowel_diacritic")]
total_consonant = grapheme_classes[grapheme_classes.component_type.eq("consonant_diacritic")]

# Expressions in the middle of a cell are evaluated and discarded, so the
# summary has to be emitted explicitly.
from IPython.display import display
print(len(total_root))
display(total_vowel, total_consonant)
import ipywidgets as widgets
from ipywidgets import interact, interact_manual


# E.g. 44.3.0 -- presumably root 44, vowel 3, consonant 0 (root.vowel.consonant); TODO confirm

# Label table for the training set, built once so that moving a slider only
# filters it. classes_train holds one
# [grapheme_root, vowel_diacritic, consonant_diacritic] entry per sample; the
# attribute-style column filters below need a DataFrame, not a plain list.
_labels_train = pd.DataFrame(
    classes_train,
    columns=["grapheme_root", "vowel_diacritic", "consonant_diacritic"],
)


@interact
def show_count(index_root=(0, len(total_root)-1, 1),
               index_vowel=(0, len(total_vowel)-1, 1),
               index_consonant=(0, len(total_consonant)-1, 1)):
    """Show how many training samples carry the selected label combination.

    Each parameter default is a ``(min, max, step)`` tuple handed to
    ``interact``; the upper bounds come from the sizes of the per-component
    class tables.

    Prints the number of matching samples, then displays the matching rows.
    """
    from IPython.display import display

    # Look up the entries that have the same classifications.
    matches = _labels_train[
        _labels_train.grapheme_root.eq(index_root)
        & _labels_train.vowel_diacritic.eq(index_vowel)
        & _labels_train.consonant_diacritic.eq(index_consonant)
    ]
    print(len(matches))
    display(matches)
    

In [None]:
# Side-by-side image viewer: training on the left and validation on the right,
# together with the grapheme the selected training sample decodes to.

# Static preview of the label -> grapheme lookup that show_image performs.
# index_train was previously undefined at cell level, so this cell stopped with
# a NameError before show_image was ever defined; preview the first sample.
index_train = 0

# Get the classification from the label
a, b, c = tuple(data_train[index_train][1])
# Look up the entries that have the same classifications.
d = grapheme_train[grapheme_train.grapheme_root.eq(a) & grapheme_train.vowel_diacritic.eq(b) & grapheme_train.consonant_diacritic.eq(c)]
# Show the first one and its grapheme (since they all should have the same grapheme)
from IPython.display import display, Markdown
display(Markdown('<h1>{}</h1>'.format(f"Character{d.iloc[0].grapheme}")))



@interact
def show_image(index_train=(0,len(data_train)-1,1), index_val=(0,len(data_val)-1,1)):
    """Show one training and one validation image side by side.

    Each parameter default is a ``(min, max, step)`` tuple handed to
    ``interact``; the ranges cover every index of ``data_train`` and
    ``data_val`` respectively.

    Below the figure, the grapheme matching the labels of the selected
    *training* sample is rendered as a heading. The validation sample's
    labels are not decoded.
    """
    from IPython.display import display, Markdown

    # NOTE(review): `Image` and `plt` are not imported in the visible cells --
    # presumably PIL.Image and matplotlib.pyplot; confirm they are imported.

    # Load the training image at the selected index
    img_train = Image.fromarray(data_train[index_train][0], "L")

    # Load the validation image at the selected index. This previously read
    # from data_train, so the right panel never showed validation data.
    img_val = Image.fromarray(data_val[index_val][0], "L")

    # Compose the two images into a 1x2 layout
    f, axarr = plt.subplots(1,2)
    axarr[0].imshow(img_train, cmap='gray')
    axarr[0].set_title("Training Set")
    axarr[1].imshow(img_val, cmap='gray')
    axarr[1].set_title("Validation Set")
    f.set_size_inches(18.5, 10.5)
    f.suptitle("Raw Grapheme Images", fontsize=40)

    # Get the classification from the training sample's label
    a, b, c = tuple(data_train[index_train][1])
    # Look up the entries that have the same classifications.
    d = grapheme_train[grapheme_train.grapheme_root.eq(a) & grapheme_train.vowel_diacritic.eq(b) & grapheme_train.consonant_diacritic.eq(c)]
    # Show the first one and its grapheme (since they all should have the same grapheme)
    display(Markdown('<h1>{}</h1>'.format(f"Character{d.iloc[0].grapheme}")))
    