In [None]:
import pickle
import os
from pathlib import Path
from PIL import Image

# Notebook widget for interactive exploration
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import cv2 as cv
# Import `display` from the public IPython.display module: pulling it from
# IPython.core.display has been deprecated since IPython 7.14 and is rejected
# by newer IPython releases. HTML is available from the same public module,
# so both names stay bound exactly as before.
from IPython.display import display, HTML

# Inject a CSS rule that forces elements with class `container` to 100% width.
# NOTE(review): presumably this widens the notebook page so the side-by-side
# figures below are not squeezed — confirm it still matches the front-end in use.
display(HTML("<style>.container { width:100% !important; }</style>"))

import numpy as np
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from src.data.load_datasets import load_label_csv, load_data_train, load_data_val
from src.data.load_datasets import load_grapheme_classes
from src.data.grapheme_composition import encode_grapheme
from src.data.grapheme_composition import get_components
from src.data.data_labels import get_labels
from src.data.data_labels import filter_label_df_index

In [None]:
# Find the project's .env file and load its entries into the process environment.
load_dotenv(find_dotenv())

# Data directory locations are read from the environment; each one is None
# when the corresponding variable is not set.
PATH_DATA_INTERIM = os.environ.get("PATH_DATA_INTERIM")
PATH_DATA_RAW = os.environ.get("PATH_DATA_RAW")

In [None]:
# Load the label CSV; kept for decoding labels.
grapheme_train = load_label_csv()
# Load the training data. Records are indexed further down as
# data_train[i][0] (image array) and data_train[i][1].
data_train = load_data_train()
# Load the validation data; indexed the same way as the training data below.
data_val = load_data_val()

In [None]:
# Load the grapheme class table used for decoding. It is later filtered on its
# `component_type` column to separate roots, vowels and consonants.
grapheme_classes = load_grapheme_classes()
# Echo the table as the cell output for inspection.
grapheme_classes

In [None]:
# Take element [1] of the training record at position 2 as a sample.
# NOTE(review): element [1] is presumably the record's label data (element [0]
# is what the image viewer further down passes to Image.fromarray) — confirm.
list_labels = data_train[2][1]
# Echo the sample as the cell output.
list_labels

In [None]:
# Run get_components on the sample pulled from the training data; whatever it
# returns is shown as the cell output.
get_components(list_labels)

In [None]:
# Build the label tables for both data sets with get_labels. They are the
# inputs to filter_label_df_index in the interactive explorer below.
df_label_train = get_labels(data_train)
df_label_val = get_labels(data_val)

In [None]:
# Echo the training label table (result of get_labels on the training data).
df_label_train

In [None]:
# Echo the validation label table (result of get_labels on the validation data).
df_label_val

In [None]:
# Split the class table by component type. The row count of each subset
# defines the upper bound of the matching slider in show_count below.
total_root = grapheme_classes[grapheme_classes.component_type.eq("grapheme_root")]
total_vowel = grapheme_classes[grapheme_classes.component_type.eq("vowel_diacritic")]
total_consonant = grapheme_classes[grapheme_classes.component_type.eq("consonant_diacritic")]

# A bare expression is echoed only when it is the last statement of a cell.
# This cell ends with a function definition, so the summaries must be shown
# explicitly or they are silently discarded.
display(len(total_root), total_vowel, total_consonant)

@interact
def show_count(index_root=(0, len(total_root)-1, 1),
               index_vowel=(0, len(total_vowel)-1, 1),
               index_consonant=(0, len(total_consonant)-1, 1)):
    """Report how often a root/vowel/consonant combination occurs and browse its images.

    Each parameter is rendered by ``interact`` as an integer slider covering the
    rows of the matching component subset (``total_root``, ``total_vowel``,
    ``total_consonant``).

    The selected indices are passed to ``filter_label_df_index`` for both label
    tables, the match counts and their share of each table are printed, and,
    when both data sets have at least one match, a nested slider pair is shown
    for paging through one training and one validation image side by side.
    """
    selection = (index_root, index_vowel, index_consonant)

    hits_train = filter_label_df_index(df_label_train, *selection)
    hits_val = filter_label_df_index(df_label_val, *selection)

    # The return value is not used here.
    # NOTE(review): presumably called for a print/display side effect — confirm.
    encode_grapheme(*selection)

    n_train = len(hits_train)
    n_val = len(hits_val)
    print(f"There are a total of {n_train} cases in the training AND {n_val} cases in validation data set.")
    print(f"Proportion of {n_train/len(df_label_train)*100}% in the training AND {n_val/len(df_label_val)*100}% in validation data set.")

    # Nothing to browse unless both data sets contain the combination.
    if not (n_train and n_val):
        return

    @interact
    def show_image(train_index=(0, n_train - 1, 1), val_index=(0, n_val - 1, 1)):
        """Plot the selected training and validation matches next to each other."""
        # Each entry of the filtered subsets is used as a position in the raw data.
        record_train = hits_train[train_index]
        record_val = hits_val[val_index]

        # Element [0] of a record is the pixel array; render it as 8-bit grayscale.
        panels = (
            ("Training Set", Image.fromarray(data_train[record_train][0], "L")),
            ("Validation Set", Image.fromarray(data_val[record_val][0], "L")),
        )

        # One row, two columns: training on the left, validation on the right.
        fig, axes = plt.subplots(1, 2)
        for ax, (title, picture) in zip(axes, panels):
            ax.imshow(picture, cmap='gray')
            ax.set_title(title)
        fig.set_size_inches(18.5, 10.5)