
# Bengali.AI Competition - Ensemble

### Team MuchLearningSuchWow

This notebook contains the code that we used to test our ensemble of networks, which consists of a CNN, a ResNet, and two variations of ResNeXt.

## Imports

In [None]:
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm
import joblib
import os
from keras.models import load_model
from keras.utils import generic_utils

## Deserialize Keras Object

In [None]:
def deserialize_keras_object(identifier, module_objects=None,
                             custom_objects=None,
                             printable_module_name='object'):
    """Turn a serialized Keras identifier back into an object.

    The code object of this function is transplanted onto
    ``generic_utils.deserialize_keras_object`` elsewhere in this notebook, so
    at run time the names `_GLOBAL_CUSTOM_OBJECTS`, `has_arg` and
    `CustomObjectScope` are expected to be resolved from that module's
    globals, not from the notebook. For the same reason the body must stay
    self-contained: no closures and no references to notebook-level names.

    Args:
        identifier: ``None``, a config dict with ``'class_name'`` and
            ``'config'`` keys, or the string name of a registered object.
        module_objects: optional mapping of built-in names to objects.
        custom_objects: optional mapping of user-supplied names to objects;
            takes precedence over the global registry and `module_objects`.
        printable_module_name: label used in error messages.

    Returns:
        ``None`` for a ``None`` identifier, an instance built from the config
        for a dict identifier, or the looked-up object for a string identifier.

    Raises:
        ValueError: if a dict identifier lacks the required keys, a string
            identifier cannot be resolved, or the identifier has another type.
    """
    if identifier is None:
        return None

    if isinstance(identifier, dict):
        config = identifier
        if 'class_name' not in config or 'config' not in config:
            raise ValueError('Improper config format: ' + str(config))
        class_name = config['class_name']

        # Lookup order: caller's custom objects, global registry, built-ins.
        if custom_objects and class_name in custom_objects:
            cls = custom_objects[class_name]
        elif class_name in _GLOBAL_CUSTOM_OBJECTS:
            cls = _GLOBAL_CUSTOM_OBJECTS[class_name]
        else:
            module_objects = module_objects or {}
            cls = module_objects.get(class_name)
            if cls is None:
                # Deliberate hack: an unresolvable class name is treated as
                # the Recall metric instead of raising.
                # NOTE(review): presumably the saved models reference a custom
                # recall metric; confirm before reusing this elsewhere.
                # Imported here because this code runs in another module's
                # namespace, where notebook-level imports are not visible.
                from keras.metrics import Recall
                cls = Recall

        custom_objects = custom_objects or {}
        if hasattr(cls, 'from_config'):
            if has_arg(cls.from_config, 'custom_objects'):
                return cls.from_config(
                    config['config'],
                    custom_objects=dict(list(_GLOBAL_CUSTOM_OBJECTS.items()) +
                                        list(custom_objects.items())))
            with CustomObjectScope(custom_objects):
                return cls.from_config(config['config'])
        # No from_config available: build the object from its config kwargs.
        with CustomObjectScope(custom_objects):
            return cls(**config['config'])

    # `str` is what six.string_types resolves to on Python 3 (this notebook
    # uses f-strings), and it avoids depending on `six` being importable.
    if isinstance(identifier, str):
        function_name = identifier
        if custom_objects and function_name in custom_objects:
            return custom_objects[function_name]
        if function_name in _GLOBAL_CUSTOM_OBJECTS:
            return _GLOBAL_CUSTOM_OBJECTS[function_name]
        # module_objects defaults to None; guard so an unknown name raises the
        # intended ValueError rather than an AttributeError.
        fn = (module_objects or {}).get(function_name)
        if fn is None:
            raise ValueError('Unknown ' + printable_module_name +
                             ':' + function_name)
        return fn

    # str() keeps this a ValueError for non-string identifiers; plain
    # concatenation would raise TypeError instead.
    raise ValueError('Could not interpret serialized ' +
                     printable_module_name + ': ' + str(identifier))

In [None]:
# Monkeypatch: replace the bytecode of Keras' deserializer with the version
# defined above. Assigning __code__ (rather than rebinding the attribute)
# keeps the original function object, so code that already holds a reference
# to it picks up the new behaviour; the code keeps executing with the
# original function's globals.
generic_utils.deserialize_keras_object.__code__ = deserialize_keras_object.__code__

## Define Resize Function

In [None]:
def _crop_to_content(image):
    """Crop `image` to the union of the bounding boxes of all its contours.

    Returns the full image unchanged when no contour is found, so blank
    inputs do not crash the min()/max() reductions below.
    """
    # With THRESH_OTSU set, OpenCV picks the threshold itself and uses it
    # instead of the 30 passed here.
    _, thresh = cv2.threshold(image, 30, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # [-2:] copes with both the 2-tuple and 3-tuple return forms of findContours.
    contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2:]
    if len(contours) == 0:
        return image

    boxes = [cv2.boundingRect(cnt) for cnt in contours]
    xmin = min(x for x, _, _, _ in boxes)
    ymin = min(y for _, y, _, _ in boxes)
    xmax = max(x + w for x, _, w, _ in boxes)
    ymax = max(y + h for _, y, _, h in boxes)
    return image[ymin:ymax, xmin:xmax]


def resize(df, size=64, need_progress_bar=True):
    """Crop every image in `df` to its content and rescale it to `size` x `size`.

    Args:
        df: DataFrame whose rows are flattened images; each row is reshaped
            to 137 x 236 before processing.
        size: edge length in pixels of the square output image.
        need_progress_bar: wrap the row loop in a tqdm progress bar.

    Returns:
        DataFrame with one row per input row (same index labels) holding the
        flattened `size * size` resized image.
    """
    positions = range(df.shape[0])
    if need_progress_bar:
        positions = tqdm(positions)

    resized = {}
    for i in positions:
        image = df.iloc[i].values.reshape(137, 236)
        roi = _crop_to_content(image)
        resized_roi = cv2.resize(roi, (size, size), interpolation=cv2.INTER_AREA)
        resized[df.index[i]] = resized_roi.reshape(-1)

    # Dict of label -> pixel vector gives one column per image; transpose so
    # each image becomes a row.
    return pd.DataFrame(resized).T

## Load Linear Support Vector Machines

In [None]:
# One fitted classifier per target component, stored as joblib dumps.
# NOTE: joblib.load unpickles its input, so only load files you trust.
clf_paths = [
    '../input/ensemble/clf_0',
    '../input/ensemble/clf_1',
    '../input/ensemble/clf_2',
]
clfs = [joblib.load(clf_path) for clf_path in clf_paths]
clf_a, clf_b, clf_c = clfs

## Testing

In [None]:
# Target components and their class counts; both lists share the same order,
# which is also the order in which predictions are indexed downstream.
components = ['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']
n_classes = [168, 11, 7]

# Saved Keras networks that make up the ensemble.
model_files = [
    '../input/models/resnext.h5',
    '../input/models/cnn.h5',
    '../input/models/resnet-56.h5',
    '../input/models/resnet-38.h5',
]

# Accumulators for the submission rows, filled by the prediction loop.
image_ids, image_preds = [], []

In [None]:
# Start from empty accumulators so re-running this cell cannot duplicate rows.
image_ids = []
image_preds = []

# Load every network once up front; reloading inside the shard loop would
# repeat each load for all four parquet files.
models = [load_model(model_file) for model_file in model_files]

for i in range(4):
    test_df = pd.read_parquet(f'../input/bengaliai-cv19/test_image_data_{i}.parquet')
    x_test = test_df.drop(['image_id'], axis=1)
    # Crop/rescale to 64x64 and scale pixel values by 1/255.
    x_test = resize(x_test) / 255

    # CNN takes images in shape `(batch_size, h, w, channels)`, so reshape the images
    x_test = x_test.values.reshape(-1, 64, 64, 1)

    # One score matrix per component; each model's class scores sit side by side.
    predictions = tuple(
        np.empty((len(x_test), n * len(models)), dtype=np.float32)
        for n in n_classes
    )

    # Make model predictions
    for model_index, model in tqdm(enumerate(models), total=len(models)):
        preds = model.predict(x_test, batch_size=96)
        for j, n in enumerate(n_classes):
            predictions[j][:, model_index * n:(model_index + 1) * n] = preds[j]

    # Combine predicted scores using linear support vector machines
    ensemble_predictions = [
        clf.predict(scores) for clf, scores in zip(clfs, predictions)
    ]

    # Emit one submission row per (image, component) pair.
    for k, image_id in enumerate(test_df['image_id']):
        for j, comp in enumerate(components):
            image_ids.append(image_id + '_' + comp)
            image_preds.append(ensemble_predictions[j][k])

## Write Submission

In [None]:
# Assemble the submission table: one row per image/component pair.
df_sample = pd.DataFrame(
    {
        'row_id': image_ids,
        'target': image_preds
    },
    columns=['row_id', 'target']
)

# `np.int` was only an alias for the builtin `int` and was removed in
# NumPy 1.24, so use the builtin directly.
df_sample['target'] = df_sample['target'].astype(int)

df_sample.to_csv('submission.csv', index=False)