In [1]:
# Names of all models available to the ensemble search, grouped by name suffix.
# The order is significant: a model's position in this list is the index that
# `model_dict` records and that is used to pick mask columns.
MODELS_LIST = (
    # plain names
    [
        'model1', 'model2', 'model3', 'model4', 'model5', 'model6',
        'model7', 'model8', 'model9', 'model10', 'model11', 'model12',
        'model13', 'model14', 'model15', 'model16', 'model17', 'model19',
        'model20', 'model21', 'model22', 'model23', 'model24', 'model25',
        'model26', 'model27', 'model28', 'model29', 'model30', 'model31',
        'model32', 'model34', 'model35', 'model36',
    ]
    # "_new"
    + [
        'model2_new', 'model3_new', 'model4_new', 'model5_new',
        'model6_new', 'model7_new', 'model9_new', 'model10_new',
        'model15_new', 'model17_new', 'model18_new', 'model19_new',
        'model20_new',
    ]
    # "_tone"
    + [
        'model3_tone', 'model4_tone', 'model5_tone', 'model7_tone',
        'model6_tone', 'model9_tone', 'model10_tone', 'model20_tone',
        'model19_new_tone', 'model30_tone', 'model33_tone', 'model34_tone',
        'model35_tone',
    ]
    # "_synth"
    + [
        'model1_synth', 'model2_synth', 'model3_synth', 'model4_synth',
        'model5_synth', 'model6_synth', 'model7_synth', 'model8_synth',
        'model9_synth', 'model10_synth', 'model26_synth', 'model28_synth',
        'model30_synth', 'model34_synth', 'model35_synth', 'model36_synth',
    ]
    # "_synth_new"
    + [
        'model4_synth_new', 'model5_synth_new', 'model10_synth_new',
        'model15_synth_new', 'model19_synth_new', 'model20_synth_new',
    ]
    # "_synth_tone"
    + ['model5_synth_tone', 'model19_synth_tone', 'model15_synth_tone']
    # "_synth_new_tone"
    + ['model4_synth_new_tone', 'model15_synth_new_tone', 'model19_synth_new_tone']
)

In [2]:
import numpy as np
import pandas as pd
import pickle

from ensemble import *

  from .autonotebook import tqdm as notebook_tqdm


### 1. Test on Validation Set

In [3]:
# Fetch the frame for every candidate model (later cells index it by model
# name and then by field such as 'pred', 'cer' or 'confidence').
val_model_frame = get_model_frame(MODELS_LIST)
# Lookup table: model name -> its position in MODELS_LIST.
model_dict = dict(zip(MODELS_LIST, range(len(MODELS_LIST))))

#### 1.1. Character-based Ensemble

In [6]:
def compute_vote_cer_character(verbose=False, initial_baseline=None, max_len=-1):
    """Greedily grow an ensemble scored by ``compute_vote_char_cer``.

    Each round evaluates every remaining model from ``MODELS_LIST`` appended
    to the current ensemble (``mode='soft'``), then permanently adds the one
    with the lowest CER. The lowest-CER ensemble seen in any round is kept
    as the winner, even if later rounds make the score worse.

    Args:
        verbose: When true, print every selected model with its CER and a
            final summary.
        initial_baseline: Model names that seed the ensemble. The list is
            copied; the caller's list is never modified.
        max_len: Stop as soon as the ensemble (seed included) holds this many
            models. The default ``-1`` never matches, so the search runs
            until the candidates are exhausted.

    Returns:
        ``(best_cer, winning_candidates)``. ``winning_candidates`` is ``None``
        if no evaluated ensemble scored below the initial bound of 1.0.
    """
    # Work on a private copy: appending to the caller's list would leak the
    # search state out of this function.
    baseline = list(initial_baseline) if initial_baseline is not None else []
    candidates = [model for model in MODELS_LIST if model not in baseline]

    # Best score over all rounds; 1.0 is the "nothing found yet" bound.
    best_cer_test = 1.0
    winning_candidates = None

    print('Start hill climbing...')
    while candidates:
        # Best single addition for this round.
        best_candidate = None
        best_cer = 1.0
        for candidate in candidates:
            cer, _ = compute_vote_char_cer(val_model_frame, baseline + [candidate], mode='soft')
            if cer < best_cer:
                best_cer = cer
                best_candidate = candidate

        if best_candidate is None:
            # No candidate scored below 1.0, so there is nothing sensible to
            # add (and `candidates.remove(None)` would raise ValueError).
            break

        if best_cer < best_cer_test:
            best_cer_test = best_cer
            winning_candidates = baseline + [best_candidate]

        # The round winner joins the ensemble even when it did not improve the
        # overall best, so later rounds can still recover.
        baseline.append(best_candidate)
        candidates.remove(best_candidate)
        if verbose:
            print('-' * 50)
            print(f'Add {best_candidate} to baseline')
            print(f'CER Valid: {best_cer}')

        if len(baseline) == max_len:
            break

    if verbose:
        print(f'Best CER: {best_cer_test}')
        print(f'Winning candidates: {winning_candidates}')

    return best_cer_test, winning_candidates

In [None]:
# Run the character-level greedy search seeded with 'model3_tone', then show
# how many models the best-scoring ensemble contains.
best_cer, winning_candidates = compute_vote_cer_character(
    initial_baseline=['model3_tone'],
    verbose=True,
)
print(len(winning_candidates))

In [5]:
# Pinned character-level ensemble (starts with the 'model3_tone' seed;
# presumably the result of the greedy search above).
# Recorded scores from earlier runs: 0.0330, 0.0333
char_winning_candidates = [
    'model3_tone', 'model2_new', 'model35', 'model2', 'model5_synth',
    'model15_synth_new', 'model30', 'model35_tone', 'model20', 'model13',
    'model17', 'model31', 'model9', 'model29', 'model2_synth',
    'model15_new', 'model26_synth', 'model18_new', 'model30_tone', 'model33_tone',
    'model34', 'model19', 'model30_synth', 'model14', 'model5_synth_tone',
    'model19_synth_new_tone', 'model8_synth', 'model35_synth', 'model27', 'model34_tone',
    'model5_new', 'model15_synth_tone', 'model1', 'model5_tone', 'model20_tone',
    'model3_synth', 'model15', 'model6_new', 'model1_synth', 'model19_synth_new',
]
# Score the pinned ensemble and keep its predictions for the word-level stage.
cer_char, char_based_pred = compute_vote_char_cer(
    val_model_frame,
    char_winning_candidates,
    mode='soft',
)
cer_char

0.033041634

#### 1.2. Word-based Ensemble

In [6]:
# Weights of the two bonuses that make up the validation mask (same values as
# alpha / beta in the hyperparameter-tuning section).
DICT_WEIGHT = 1.25        # prediction's diacritic-free form is a known word
CHAR_AGREE_WEIGHT = 0.25  # prediction equals the character-level ensemble output

# Prepare dictionary
# Vocabulary = lower-cased, diacritic-stripped labels of the rows selected by
# train_inds.pkl; vocab_dict maps every such word to 1 for fast lookup.
labels = pd.read_csv(LABEL_FILE, header=None, na_filter=False, encoding='utf-8', sep='\t')
# NOTE: pickle.load can execute arbitrary code - only load a trusted train_inds.pkl.
with open('train_inds.pkl', 'rb') as inds_file:  # context manager closes the handle
    train_inds = pickle.load(inds_file)
labels = labels.iloc[train_inds]
labels.columns = ['id', 'label']
vocab = labels['label'].str.lower().unique()
vocab = pd.Series([delete_diacritic(label) for label in vocab]).unique()
vocab_dict = dict.fromkeys(vocab, 1)

# Prepare validation mask
# One row per sample, one column per model, in MODELS_LIST order.
val_model_frame = get_model_frame(MODELS_LIST)
preds = np.array([val_model_frame[model]['pred'] for model in MODELS_LIST]).T
# Explicit float dtype: np.zeros_like(preds) would inherit the dtype of the
# string predictions, which only works if that happens to be `object`.
val_mask = np.zeros(preds.shape, dtype=float)
for i in range(preds.shape[0]):
    for j in range(preds.shape[1]):
        val_mask[i, j] = vocab_dict.get(delete_diacritic(preds[i, j].lower()), 0) * DICT_WEIGHT
        if preds[i, j] == char_based_pred[i]:
            val_mask[i, j] += CHAR_AGREE_WEIGHT


print(f"Dictionary length: {len(vocab_dict)}")

Dictionary length: 10130


In [7]:
def compute_vote_cer_mask(verbose=False, initial_baseline=None, max_len=-1):
    """Greedily grow an ensemble scored by the mask-weighted ``compute_vote_cer``.

    Each round evaluates every remaining model from ``MODELS_LIST`` appended
    to the current ensemble, passing the matching columns of ``val_mask``
    (looked up through ``model_dict``) as ``mask``. The model with the lowest
    CER is permanently added. The lowest-CER ensemble seen in any round is
    kept as the winner.

    Args:
        verbose: When true, print every selected model with its CER and a
            final summary.
        initial_baseline: Model names that seed the ensemble. The list is
            copied; the caller's list is never modified.
        max_len: Stop after this many models have been *added* (the seed is
            not counted). The default ``-1`` never matches, so the search
            runs until the candidates are exhausted.

    Returns:
        ``(best_cer, winning_candidates)``. ``winning_candidates`` is ``None``
        if no evaluated ensemble scored below the initial bound of 1.0.
    """
    # Work on a private copy so the caller's list is not extended in place.
    baseline = list(initial_baseline) if initial_baseline is not None else []
    candidates = [model for model in MODELS_LIST if model not in baseline]

    # Best score over all rounds; 1.0 is the "nothing found yet" bound.
    best_cer_test = 1.0
    winning_candidates = None
    candidate_len = 0  # number of models added so far

    print('Start hill climbing...')
    while candidates:
        # Best single addition for this round.
        best_candidate = None
        best_cer = 1.0
        for candidate in candidates:
            trial = baseline + [candidate]
            # Mask columns must follow the same order as the trial ensemble.
            mask = val_mask[:, [model_dict[model] for model in trial]]
            cer, _ = compute_vote_cer(val_model_frame, trial, mask=mask)
            if cer < best_cer:
                best_cer = cer
                best_candidate = candidate

        if best_candidate is None:
            # No candidate scored below 1.0, so there is nothing sensible to
            # add (and `candidates.remove(None)` would raise ValueError).
            break

        if best_cer < best_cer_test:
            best_cer_test = best_cer
            winning_candidates = baseline + [best_candidate]

        # The round winner joins the ensemble even without an overall gain.
        baseline.append(best_candidate)
        candidates.remove(best_candidate)
        if verbose:
            print('-' * 50)
            print(f'Add {best_candidate} to baseline')
            print(f'CER Valid: {best_cer}')

        candidate_len += 1
        if candidate_len == max_len:
            break

    if verbose:
        print(f'Best CER: {best_cer_test}')
        print(f'Winning candidates: {winning_candidates}')

    return best_cer_test, winning_candidates

In [8]:
# Run the mask-weighted greedy search seeded with 'model19_new', limited to
# 25 added models, then show the size of the best-scoring ensemble.
best_cer, winning_candidates = compute_vote_cer_mask(
    initial_baseline=['model19_new'],
    max_len=25,
    verbose=True,
)
print(len(winning_candidates))

Start hill climbing...
--------------------------------------------------
Add model3_tone to baseline
CER Valid: 0.0353438908682026
--------------------------------------------------
Add model35_tone to baseline
CER Valid: 0.03296706341056511
--------------------------------------------------
Add model30 to baseline
CER Valid: 0.03199918867587441
--------------------------------------------------
Add model15_synth_new to baseline
CER Valid: 0.03163651431299219
--------------------------------------------------
Add model10_synth to baseline
CER Valid: 0.03138054189055857
--------------------------------------------------
Add model32 to baseline
CER Valid: 0.031237323993083203
--------------------------------------------------
Add model8_synth to baseline
CER Valid: 0.03108796970109747
--------------------------------------------------
Add model5_synth_new to baseline
CER Valid: 0.03103409764667352
--------------------------------------------------
Add model5 to baseline
CER Valid: 0.030

In [None]:
# Try every model as the seed of the mask-weighted search (at most 20 added
# models per seed) and remember the seed that reaches the lowest CER.
best_model, best_cer_model = None, 1.0

for model in MODELS_LIST:
    best_cer, winning_candidates = compute_vote_cer_mask(
        initial_baseline=[model],
        max_len=20,
        verbose=False,
    )
    print(f'{model}: {best_cer}')
    print('-' * 50)
    if best_cer < best_cer_model:
        # Strict "<" keeps the earliest seed when scores tie.
        best_model, best_cer_model = model, best_cer

print(f'Best model: {best_model}')
print(f'Best CER: {best_cer_model}')

In [7]:
# Pinned word-level ensemble; its first entries match the selection order
# printed by the greedy search seeded with 'model19_new'.
word_winning_candidates = [
    'model19_new', 'model3_tone', 'model35_tone', 'model30', 'model15_synth_new', 'model10_synth', 'model32', 'model8_synth', 
    'model5_synth_new', 'model5', 'model35_synth', 'model5_synth', 'model33_tone', 'model18_new', 'model31', 'model3_new', 
    'model10', 'model10_synth_new', 'model4_synth_new', 'model4_synth', 'model17_new', 'model7', 'model35', 'model2'
]
# Select the mask columns in the same order as the ensemble members.
mask = val_mask[:, [model_dict[model] for model in word_winning_candidates]]
# Keep the score in a real name: assigning to `_` at notebook top level
# shadows IPython's "last output" variable and hides what the value means.
word_cer, word_based_pred = compute_vote_cer(val_model_frame, word_winning_candidates, mask=mask)
print(f"Word CER: {word_cer}")

Word CER: 0.030246815914097457


In [14]:
# Same pinned word-level ensemble, scored with an explicitly weighted mask.
word_winning_candidates = [
    'model19_new', 'model3_tone', 'model35_tone', 'model30', 'model15_synth_new', 'model10_synth', 'model32', 'model8_synth', 
    'model5_synth_new', 'model5', 'model35_synth', 'model5_synth', 'model33_tone', 'model18_new', 'model31', 'model3_new', 
    'model10', 'model10_synth_new', 'model4_synth_new', 'model4_synth', 'model17_new', 'model7', 'model35', 'model2'
]
alpha = 1.25
beta = 0.25
# NOTE(review): `word_mask` is not defined in any cell of this notebook, and
# the `char_mask` built in section 1.3 only has one column per word-ensemble
# member. The model_dict-based column selection below presumably expects
# masks with one column per MODELS_LIST entry - confirm where these came from
# before relying on this cell.
mask = (alpha * word_mask + beta * char_mask)[:, [model_dict[model] for model in word_winning_candidates]]
# Keep the score in a real name: assigning to `_` at notebook top level
# shadows IPython's "last output" variable and hides what the value means.
word_cer, word_based_pred = compute_vote_cer(val_model_frame, word_winning_candidates, mask=mask)
print(f"Word CER: {word_cer}")

Word CER: 0.03262558852243183


In [8]:

# Per-sample report for the word-level ensemble. For every sample, the member
# with the highest (confidence + mask) score is located and its confidence
# and CER are written next to the word-level and character-level predictions.
real = val_model_frame['model3_tone']['real']

# One row per sample, one column per ensemble member.
word_conf = np.array(
    [val_model_frame[name]['confidence'] for name in word_winning_candidates]
).T
cer_word = np.array(
    [val_model_frame[name]['cer'] for name in word_winning_candidates]
).T

score = word_conf + mask
idx = score.argmax(axis=1)  # column of the top-scoring member per sample
win_conf = word_conf[np.arange(len(word_conf)), idx]
win_cer = cer_word[np.arange(len(cer_word)), idx]

val_report = pd.DataFrame({
    'real': real,
    'pred': word_based_pred,
    'pred_char': char_based_pred,
    'conf': win_conf,
    'cer': win_cer,
})
val_report.to_csv('ensemble/ensemble_val.csv', index=False)

#### 1.3. Hyperparameter Tuning

In [10]:
# Build the two unweighted indicator masks for the word-level ensemble
# (rows = samples, columns = word_winning_candidates in order):
#   semantic_mask - 1 where the diacritic-free, lower-cased prediction is in vocab_dict
#   char_mask     - 1 where the prediction equals the character-level ensemble output
preds = np.array([val_model_frame[model]['pred'] for model in word_winning_candidates]).T
# Explicit float dtype: np.zeros_like(preds) would inherit the dtype of the
# string predictions, which only works if that happens to be `object`.
semantic_mask = np.zeros(preds.shape, dtype=float)
char_mask = np.zeros(preds.shape, dtype=float)
for i in range(preds.shape[0]):
    for j in range(preds.shape[1]):
        semantic_mask[i, j] = vocab_dict.get(delete_diacritic(preds[i, j].lower()), 0)
        if preds[i, j] == char_based_pred[i]:
            char_mask[i, j] = 1

In [12]:
# Score the word-level ensemble with the reference weights: alpha scales the
# dictionary mask, beta scales the character-agreement mask.
alpha, beta = 1.25, 0.25
mask = alpha * semantic_mask + beta * char_mask
cer_word, _ = compute_vote_cer(
    val_model_frame,
    word_winning_candidates,
    mask=mask,
)
print(f"Word CER: {cer_word}")

Word CER: 0.030246815914097457


In [None]:
import optuna


def objective(trial):
    """Optuna objective: word-level CER for one (alpha, beta) weighting.

    ``alpha`` scales ``semantic_mask`` and ``beta`` scales ``char_mask``; both
    are sampled from [0.0, 2.0]. The masks are used as built in this section,
    i.e. with one column per entry of ``word_winning_candidates``, so no
    column re-indexing is needed. (The previous version read an undefined
    ``word_mask`` and indexed the masks with ``MODELS_LIST`` positions.)

    Args:
        trial: Optuna trial used to sample the two weights.

    Returns:
        The CER reported by ``compute_vote_cer`` for the weighted mask.
    """
    alpha = trial.suggest_float('alpha', 0.0, 2.0)
    beta = trial.suggest_float('beta', 0.0, 2.0)
    # Same formula as the fixed alpha/beta evaluation cell above.
    mask = alpha * semantic_mask + beta * char_mask
    cer_word, _ = compute_vote_cer(val_model_frame, word_winning_candidates, mask=mask)
    return cer_word

# Minimise the objective over 1000 trials, using all available cores.
study = optuna.create_study(direction='minimize')  # 'minimize' is the default direction
study.optimize(objective, n_jobs=-1, n_trials=1000)

# Noted result from an earlier run: 1.25 0.25 (presumably alpha, beta)
print(study.best_params)
print(study.best_value)

### 2. Prediction on Test Set

In [10]:
# Character-level prediction first; the word-level prediction takes it as input.
char_models_full = add_full_to_lst(char_winning_candidates)
char_based_pred_full = make_final_char_prediction(char_models_full)

word_models_full = add_full_to_lst(word_winning_candidates)
pred = make_final_prediction(word_models_full, char_based_pred_full, alpha=1.25)

### 3. Utilities

In [2]:
import os

# Rewrite every shell script in scripts/ in place: "!python" becomes "python3"
# and all carriage-return bytes are removed. Files are handled as raw bytes.
for name in os.listdir('scripts/'):
    # Match the ".sh" extension itself; a bare 'sh' suffix would also pick up
    # unrelated names that merely end in those two letters.
    if not name.endswith('.sh'):
        continue
    script_path = os.path.join('scripts', name)
    with open(script_path, 'rb') as f:
        command = f.read()
    command = command.replace(b'!python', b'python3')
    command = command.replace(b'\r', b'')
    with open(script_path, 'wb') as f:
        f.write(command)

In [10]:
# Load the frame for all models and compute each model's mean CER.
val_model_frame = get_model_frame(MODELS_LIST)
cer_val = np.array([np.mean(val_model_frame[name]['cer']) for name in MODELS_LIST])

# List the models from lowest to highest mean CER.
sorted_models = np.argsort(cer_val)
for model_idx in sorted_models:
    model_name = MODELS_LIST[model_idx]
    print(f'{model_name:<30}: {cer_val[model_idx]}')

model3_tone                   : 0.04175651146230674
model19_synth_new             : 0.0418946353970754
model5_new                    : 0.04213130448352207
model5_tone                   : 0.042230304224473054
model5_synth                  : 0.04314237113853898
model35                       : 0.04334463688761297
model3_new                    : 0.04344734224481414
model5_synth_tone             : 0.043514215066426935
model35_tone                  : 0.04372031508220567
model19_synth_tone            : 0.04372323414818807
model34_tone                  : 0.04383825131287478
model19_new                   : 0.04389485601448651
model33_tone                  : 0.04398200347041241
model5                        : 0.04411927272727273
model19                       : 0.04419002690775828
model19_synth_new_tone        : 0.04421850069303705
model34                       : 0.04431851710860778
model7_tone                   : 0.0443748704306405
model15_synth_new             : 0.04440697860311378
model15     

In [19]:
# Collect every model used by either ensemble exactly once. dict.fromkeys
# keeps first-seen order (word-level picks, then character-level picks);
# list(set(...)) would order the strings differently from run to run, so the
# generated script would not be reproducible.
unique_models = list(dict.fromkeys(word_winning_candidates + char_winning_candidates))
final_lst = add_full_to_lst(unique_models)

# Chain the per-model scripts with "&&" so a failure stops the sequence.
scripts = " &&\n".join(f"bash scripts/{name}.sh" for name in final_lst)

with open('scripts/train_all.sh', 'w') as f:
    f.write(scripts)

In [26]:
# Spot check: look up one word in the dictionary after the same normalisation
# (lower-case, diacritics removed) that was used to build it.
word = "trang"
normalized_word = delete_diacritic(word.lower())
vocab_dict[normalized_word]

1

In [16]:
import pandas as pd
from torchmetrics.text import CharErrorRate
import numpy as np

# Mean per-image CER of ensemble/prediction.txt against ensemble/second.txt.
# Both files are tab-separated without a header; column 0 becomes the index
# and column 1 holds the text.
cer = CharErrorRate()
pred = pd.read_csv('ensemble/prediction.txt', header=None, na_filter=False, sep='\t', index_col=0)
gt = pd.read_csv('ensemble/second.txt', header=None, na_filter=False, sep='\t', index_col=0)

# Only images listed in the reference file are scored.
cer_lst = [cer(pred.loc[img, 1], gt.loc[img, 1]) for img in gt.index]

np.mean(cer_lst)

0.26564625

In [22]:
# Same comparison for the predictions of 'model3_tone_full' alone.
strongest = pd.read_csv(
    'ensemble/private_test/model3_tone_full.csv',
    na_filter=False,
    index_col=0,
)
cer_lst = [cer(strongest.loc[img, 'pred'], gt.loc[img, 1]) for img in gt.index]
np.mean(cer_lst)

0.29523808

In [20]:
# Preview the first five rows of the single-model predictions.
strongest.head(5)

Unnamed: 0_level_0,confidence,pred
img_name,Unnamed: 1_level_1,Unnamed: 2_level_1
private_test_1.jpg,0.644619,chết
private_test_2.jpg,0.662872,lao
private_test_3.jpg,0.522751,trong
private_test_4.jpg,0.558205,dung
private_test_5.jpg,0.18084,bh


In [4]:
import pandas as pd
import numpy as np
# Load the two prediction files that will be merged by confidence.
pred_hog, pred_no_hog = (
    pd.read_csv(path, na_filter=False)
    for path in ('ensemble/prediction2.csv', 'ensemble/prediction.csv')
)

In [5]:
# Row by row, keep the prediction with the strictly higher confidence; ties
# fall back to pred_no_hog.
hog_more_confident = pred_hog['confidence'] > pred_no_hog['confidence']
final_pred = np.where(hog_more_confident, pred_hog['pred'], pred_no_hog['pred'])

In [7]:
# Write the merged predictions as headerless, tab-separated "img_name<TAB>pred" rows.
df = pred_hog[['img_name']].assign(pred=final_pred)
df.to_csv(
    'ensemble/final_prediction.txt',
    sep='\t',
    header=False,
    index=False,
)

In [8]:
# Export the predictions of 'model3_tone_full' alone in the same format.
# NOTE(review): this writes to the same path as the merged prediction in the
# previous cell and therefore replaces that file - confirm this is intended.
strongest = pd.read_csv('ensemble/model3_tone_full.csv', na_filter=False)
df = strongest.loc[:, ['img_name', 'pred']]
df.to_csv(
    'ensemble/final_prediction.txt',
    sep='\t',
    header=False,
    index=False,
)