In [1]:
import pickle
import os
from pathlib import Path
from PIL import Image

# Notebook widget for interactive exploration
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import cv2 as cv
# NOTE(review): IPython.core.display is a deprecated import location for these
# names; IPython.display is the public module — consider switching.
from IPython.core.display import display, HTML
# Inject CSS so the notebook's .container element is rendered at 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import numpy as np
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from src.data.load_datasets import load_label_csv, load_data_train, load_data_val
from src.data.load_datasets import load_grapheme_classes
from src.data.load_datasets import get_image_data
from src.data.grapheme_composition import encode_grapheme
from src.data.grapheme_composition import get_components
from src.data.data_labels import get_labels
from src.data.data_labels import filter_label_df_index

In [2]:
# Locate the project's .env file and load its variables into the process environment.
load_dotenv(find_dotenv())

# Data directories are configured through the environment.
# Each value is None when the corresponding variable is not set.
PATH_DATA_INTERIM = os.environ.get("PATH_DATA_INTERIM")
PATH_DATA_RAW = os.environ.get("PATH_DATA_RAW")

In [3]:
# Label CSV, kept around for decoding purposes.
grapheme_train = load_label_csv()

# Training split first, then validation split (same order as the loaders are called).
data_train, data_val = load_data_train(), load_data_val()

In [4]:
# Collect element 0 of every training sample (presumably the sample's
# class/identifier — TODO confirm against load_data_train).
# Reading it directly avoids transposing every column of the dataset with
# zip(*data_train), and yields an empty list instead of raising IndexError
# when the dataset is empty.
classes_train = [sample[0] for sample in data_train]

In [5]:
# Load the class lookup table used for decoding numeric labels.
# As rendered below, it has one row per (component_type, label) pair together
# with the corresponding `component` glyph.
grapheme_classes = load_grapheme_classes()
# Last expression of the cell: renders the table.
grapheme_classes

Unnamed: 0,component_type,label,component
0,grapheme_root,0,ং
1,grapheme_root,1,ঃ
2,grapheme_root,2,অ
3,grapheme_root,3,আ
4,grapheme_root,4,ই
...,...,...,...
181,consonant_diacritic,2,র্
182,consonant_diacritic,3,র্য
183,consonant_diacritic,4,্য
184,consonant_diacritic,5,্র


In [6]:
# Worked example: take element 1 (the labels) of the training sample at position 2.
# NOTE: despite the name, the value is a NumPy array of three integer labels
# (see the output below), not a Python list.
list_labels = data_train[2][1]
list_labels

array([133,   9,   0], dtype=int64)

In [7]:
# Decode the example label triple into its component glyphs and display the result.
get_components(list_labels)

['শ', 'ো', '0']

In [8]:
# Build the label tables for both splits: training first, then validation.
df_label_train, df_label_val = (get_labels(split) for split in (data_train, data_val))

In [9]:
# Display the training label table
# (columns: grapheme_root, vowel_diacritic, consonant_diacritic).
df_label_train

Unnamed: 0,grapheme_root,vowel_diacritic,consonant_diacritic
0,139,1,0
1,92,7,0
2,133,9,0
3,43,1,4
4,32,7,0
...,...,...,...
160667,95,0,0
160668,64,0,0
160669,113,2,2
160670,160,0,0


In [10]:
# Display the validation label table
# (columns: grapheme_root, vowel_diacritic, consonant_diacritic).
df_label_val

Unnamed: 0,grapheme_root,vowel_diacritic,consonant_diacritic
0,80,7,2
1,96,7,0
2,96,1,0
3,138,9,0
4,89,1,5
...,...,...,...
40163,96,7,0
40164,124,1,4
40165,72,1,1
40166,147,3,0


In [12]:
# Split the class lookup table by component type. The row count of each slice
# is used as the upper bound of the matching slider in show_count.
total_root = grapheme_classes.loc[grapheme_classes["component_type"] == "grapheme_root"]
total_vowel = grapheme_classes.loc[grapheme_classes["component_type"] == "vowel_diacritic"]
total_consonant = grapheme_classes.loc[grapheme_classes["component_type"] == "consonant_diacritic"]

def _binarized_mean_image(images, indices, normalize, threshold):
    """Average the selected images and binarize the result against a threshold.

    Args:
        images: Indexable collection of image arrays (the result of
            ``get_image_data``).
        indices: Iterable of positions into ``images`` to include in the mean.
        normalize: When true, min-max rescale the mean image to [0, 255]
            before thresholding.
        threshold: Pixels strictly below this value become 1.0, all others 0.0.

    Returns:
        Float array of 0.0/1.0 values with the shape of a single image.
    """
    mean_image = np.mean([images[i] for i in indices], axis=0)
    if normalize:
        # dst=None lets OpenCV allocate an output matching the input, so the
        # image size does not have to be hard-coded here.
        mean_image = cv.normalize(mean_image, None, 0, 255, cv.NORM_MINMAX)
    return 1.0 * (mean_image < threshold)


@interact
def show_count(index_root=(0, len(total_root)-1, 1),
               index_vowel=(0, len(total_vowel)-1, 1),
               index_consonant=(0, len(total_consonant)-1, 1)):
    """Report how often a label combination occurs and preview its mean image.

    Rendered as three integer sliders by ``interact``. For the selected
    (root, vowel diacritic, consonant diacritic) labels, prints the number and
    percentage of matching samples in the training and validation sets. When
    both sets contain at least one match, a nested widget shows the binarized
    mean image of the matching samples for each set side by side.

    Args:
        index_root: Selected grapheme root label.
        index_vowel: Selected vowel diacritic label.
        index_consonant: Selected consonant diacritic label.
    """
    # The filter results are used both for counting and, further down, as
    # positions into the image collections.
    # NOTE(review): this assumes filter_label_df_index yields sample positions
    # when iterated — confirm against its implementation.
    subset_train = filter_label_df_index(df_label_train, index_root, index_vowel, index_consonant)
    subset_val = filter_label_df_index(df_label_val, index_root, index_vowel, index_consonant)

    # NOTE(review): the return value is discarded. If encode_grapheme returns
    # the composed grapheme instead of printing it, it should be displayed here.
    encode_grapheme(index_root, index_vowel, index_consonant)

    print(f"There are a total of {len(subset_train)} cases in the training AND {len(subset_val)} cases in validation data set.")
    print(f"Proportion of {len(subset_train)/len(df_label_train)*100}% in the training AND {len(subset_val)/len(df_label_val)*100}% in validation data set.")

    # Nothing to average unless both splits contain the combination.
    if len(subset_train) == 0 or len(subset_val) == 0:
        return

    # Extract the images once per selection rather than on every
    # threshold/normalize change of the nested widget.
    images_train = get_image_data(data_train)
    images_val = get_image_data(data_val)

    @interact
    def show_average_image(normalize=True, threshold=(0,255,0.5)):
        """Plot the binarized mean image of the selection for both splits.

        Args:
            normalize: Min-max rescale each mean image to [0, 255] before
                thresholding.
            threshold: Cut-off value; pixels below it are drawn as 1.0.
        """
        image_train_mean_binarized = _binarized_mean_image(
            images_train, subset_train, normalize, threshold)
        # Average only the matching validation samples. Averaging the whole
        # validation set would make this panel independent of the selection.
        image_val_mean_binarized = _binarized_mean_image(
            images_val, subset_val, normalize, threshold)

        # Side-by-side layout: training on the left, validation on the right.
        f, axarr = plt.subplots(1,2)
        axarr[0].imshow(image_train_mean_binarized, cmap='gray')
        axarr[0].set_title("Training Set")
        axarr[1].imshow(image_val_mean_binarized, cmap='gray')
        axarr[1].set_title("Validation Set")
        f.set_size_inches(18.5, 10.5)

interactive(children=(IntSlider(value=83, description='index_root', max=167), IntSlider(value=5, description='…