# Results analysis

In this notebook we will detail the experiments done on the chosen datasets (Basque and Japanese, both taken from Universal Dependencies) and the obtained results.

## Imports

In [3]:
from dataset_loader import Dataset
from pathlib import Path
from hmm import HiddenMarkovModel, optimize_unk_threshold
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from evaluation_metrics import micro_f1, evaluate_dataset

import plotly.io as pio

# Set the plotly theme
pio.templates.default = "plotly_white"

## Load the datasets

In [4]:
# Basque
basque_dataset = Dataset(
    dataset_name='UD_Basque-BDT',
    train_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-train.conllu'),
    dev_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-dev.conllu'),
    test_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-test.conllu'),
)
print('Basque dataset loaded')

basque_lemmatized_dataset = Dataset(
    dataset_name='UD_Basque-BDT',
    train_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-train.conllu'),
    dev_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-dev.conllu'),
    test_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-test.conllu'),
    lemmatized=True
)
print('Lemmatized Basque dataset loaded')

#-------------------------
# Japanese

japanese_dataset = Dataset(
    dataset_name='UD_Japanese-GSD',
    train_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-train.conllu'),
    dev_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-dev.conllu'),
    test_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-test.conllu'),
)
print('Japanese dataset loaded')

japanese_lemmatized_dataset = Dataset(
    dataset_name='UD_Japanese-GSD',
    train_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-train.conllu'),
    dev_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-dev.conllu'),
    test_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-test.conllu'),
    lemmatized=True
)
print('Lemmatized Japanese dataset loaded')

Basque dataset loaded
Lemmatized Basque dataset loaded
Japanese dataset loaded
Lemmatized Japanese dataset loaded


## Hidden Markov Model Training

### Optimize UNK threshold

Don't run this cell, it takes a lot of time to run. The results are already saved in the `unk_thresholds` dictionary.

In [11]:
optimization_params = {
    'min_threshold': 0.00001,
    'max_threshold': 0.0001,
    'num': 250,
}
# Basque
basque_optimization_results = optimize_unk_threshold(basque_dataset, metric_funct=micro_f1, **optimization_params)
basque_lemmatized_optimization_results = optimize_unk_threshold(basque_lemmatized_dataset, metric_funct=micro_f1, **optimization_params)

# Japanese
japanese_optimization_results = optimize_unk_threshold(japanese_dataset, metric_funct=micro_f1, **optimization_params)
japanese_lemmatized_optimization_results = optimize_unk_threshold(japanese_lemmatized_dataset, metric_funct=micro_f1, **optimization_params)

# Print results
print('Basque: ', basque_optimization_results['best_threshold'], basque_optimization_results['best_metric'])
print('Basque lemmatized: ', basque_lemmatized_optimization_results['best_threshold'], basque_lemmatized_optimization_results['best_metric'])
print('Japanese: ', japanese_optimization_results['best_threshold'], japanese_optimization_results['best_metric'])
print('Japanese lemmatized: ', japanese_lemmatized_optimization_results['best_threshold'], japanese_lemmatized_optimization_results['best_metric'])

""" OUTPUT:
Basque:  1.3950131878249618e-05 0.82880265
Basque lemmatized:  1.3950131878249618e-05 0.9181573
Japanese:  1e-05 0.91446245
Japanese lemmatized:  1e-05 0.9357858
"""

100%|██████████| 250/250 [02:38<00:00,  1.57it/s]
100%|██████████| 250/250 [02:35<00:00,  1.61it/s]
100%|██████████| 250/250 [02:09<00:00,  1.93it/s]
100%|██████████| 250/250 [02:08<00:00,  1.94it/s]

Basque:  1.3821725034787874e-05 0.82880265
Basque lemmatized:  1.3821725034787874e-05 0.91790825
Japanese:  1e-05 0.9141369
Japanese lemmatized:  1e-05 0.9351347





' OUTPUT:\nBasque:  1.3950131878249618e-05 0.82880265\nBasque lemmatized:  1.3950131878249618e-05 0.9181573\nJapanese:  1e-05 0.91446245\nJapanese lemmatized:  1e-05 0.9357858\n'

In [7]:
# Optimization best thresholds
unk_thresholds = {
    'basque': 1.3950131878249618e-05,
    'basque_lemmatized': 1.3950131878249618e-05,
    'japanese': 1e-05,
    'japanese_lemmatized': 1e-05
}

#### Analysis of the optimization results

In [10]:
# Generate a dataframe with the results
optimization_data = []
for value, score in zip(basque_optimization_results['search_space'], basque_optimization_results['results']):
    optimization_data.append({'language': 'basque', 'lemmatized': False, 'threshold': value, 'metric': score})
for value, score in zip(basque_lemmatized_optimization_results['search_space'], basque_lemmatized_optimization_results['results']):
    optimization_data.append({'language': 'basque', 'lemmatized': True, 'threshold': value, 'metric': score})
for value, score in zip(japanese_optimization_results['search_space'], japanese_optimization_results['results']):
    optimization_data.append({'language': 'japanese', 'lemmatized': False, 'threshold': value, 'metric': score})
for value, score in zip(japanese_lemmatized_optimization_results['search_space'], japanese_lemmatized_optimization_results['results']):
    optimization_data.append({'language': 'japanese', 'lemmatized': True, 'threshold': value, 'metric': score})

optimization_df = pd.DataFrame(optimization_data)

# Plot the results
fig = px.line(optimization_df, x='threshold', y='metric', color='language', line_dash='lemmatized', log_x=True)
fig.show()

As we expected, as the threshold increases, the performance of the model decreases as more words are replaced by the UNK token and get the same emission probability.

Regarding the languages and their lemmatized version, we can see that the best thresholds for the lemmatized and non-lemmatized datasets are the same.
On the other hand, there is a clear difference between the Basque and Japanese datasets. The models trained on the Basque datasets benefit from the lower threshold, and their performance decreases as the threshold increases. On the other hand, the ones trained on the Japanese datasets exhibit extremely low performance for low thresholds until they reach a threshold of 1e-5, where they suddenly increase their performance to their maximum. From this point the score decreases gradually again as the threshold increases.


## Train the final models

In [8]:
# Basque models
basque_hmm = HiddenMarkovModel(basque_dataset, unknown_token_threshold=unk_thresholds['basque'])
basque_lemmatized_hmm = HiddenMarkovModel(basque_lemmatized_dataset, unknown_token_threshold=unk_thresholds['basque_lemmatized'])

# Japanese models
japanese_hmm = HiddenMarkovModel(japanese_dataset, unknown_token_threshold=unk_thresholds['japanese'])
japanese_lemmatized_hmm = HiddenMarkovModel(japanese_lemmatized_dataset, unknown_token_threshold=unk_thresholds['japanese_lemmatized'])

## Basque performance evaluation

### Basque predictions

In [9]:
basque_predictions = basque_hmm.batch_predict(basque_dataset.test)
basque_lemmatized_predictions = basque_lemmatized_hmm.batch_predict(basque_lemmatized_dataset.test)

In [32]:
eval_metrics_basque = evaluate_dataset(basque_dataset.test.data, basque_predictions[0], basque_predictions[1])
eval_metrics_basque_lemmatized = evaluate_dataset(basque_dataset.test.data, basque_lemmatized_predictions[0], basque_lemmatized_predictions[1])

In [33]:
accuracy = ['Accuracy', eval_metrics_basque['accuracy'], eval_metrics_basque_lemmatized['accuracy']]
perplexity = ['Perplexity', eval_metrics_basque['perplexity'], eval_metrics_basque_lemmatized['perplexity']]
macro_precision = ['Macro Precision', eval_metrics_basque['macro']['precision'], eval_metrics_basque_lemmatized['macro']['precision']]
macro_recall = ['Macro Recall', eval_metrics_basque['macro']['precision'], eval_metrics_basque_lemmatized['macro']['precision']]
macro_f1 = ['Macro f1', eval_metrics_basque['macro']['f1'], eval_metrics_basque_lemmatized['macro']['f1']]
micro_precision = ['Micro Precision', eval_metrics_basque['micro']['precision'], eval_metrics_basque_lemmatized['micro']['precision']]
micro_recall = ['Micro Recall', eval_metrics_basque['micro']['precision'], eval_metrics_basque_lemmatized['micro']['precision']]
micro_f1 = ['Micro f1', eval_metrics_basque['micro']['f1'], eval_metrics_basque_lemmatized['micro']['f1']]

metrics = [perplexity, accuracy, macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1]

pd.DataFrame(metrics , columns=['Metrics', 'Basque', 'Basque Lemmatized'])

Unnamed: 0,Metrics,Basque,Basque Lemmatized
0,Perplexity,inf,inf
1,Accuracy,0.827972,0.913514
2,Macro Precision,0.752461,0.843356
3,Macro Recall,0.752461,0.843356
4,Macro f1,0.687523,0.791261
5,Micro Precision,0.827972,0.913514
6,Micro Recall,0.827972,0.913514
7,Micro f1,0.827972,0.913514
