# Results analysis

In this notebook we will detail the experiments done on the chosen datasets (Basque and Japanese, both taken from Universal Dependencies) and the obtained results.

## Imports

In [1]:
import math
from collections import defaultdict
from copy import deepcopy
from itertools import pairwise
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
from wordcloud import WordCloud

from dataset_loader import Dataset
from evaluation_metrics import micro_f1, evaluate_dataset
from hmm import HiddenMarkovModel, optimize_unk_threshold

# Matplotlib style used for the matplotlib figures drawn in this notebook
plt.style.use('seaborn-v0_8-bright')

import plotly.io as pio

# Set the plotly theme
pio.templates.default = "plotly_white"

## Load the datasets

In [2]:
# Basque: the plain and lemmatized variants are built from the same CoNLL-U files
_basque_files = dict(
    dataset_name='UD_Basque-BDT',
    train_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-train.conllu'),
    dev_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-dev.conllu'),
    test_path=Path('../data/UD_Basque-BDT/eu_bdt-ud-test.conllu'),
)
basque_dataset = Dataset(**_basque_files)
print('Basque dataset loaded')

basque_lemmatized_dataset = Dataset(**_basque_files, lemmatized=True)
print('Lemmatized Basque dataset loaded')

#-------------------------
# Japanese: same layout as above

_japanese_files = dict(
    dataset_name='UD_Japanese-GSD',
    train_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-train.conllu'),
    dev_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-dev.conllu'),
    test_path=Path('../data/UD_Japanese-GSD/ja_gsd-ud-test.conllu'),
)
japanese_dataset = Dataset(**_japanese_files)
print('Japanese dataset loaded')

japanese_lemmatized_dataset = Dataset(**_japanese_files, lemmatized=True)
print('Lemmatized Japanese dataset loaded')

Basque dataset loaded
Lemmatized Basque dataset loaded
Japanese dataset loaded
Lemmatized Japanese dataset loaded


## Hidden Markov Model Training

### Optimize UNK threshold

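Don't run this cell unless you need to regenerate the optimization results: it takes roughly ten minutes. The best thresholds are already saved in the `unk_thresholds` dictionary. Note that the "Analysis of the optimization results" cell below needs the variables created by this cell and fails with a `NameError` if it has not been run.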

In [11]:
# Search settings forwarded as keyword arguments to every optimize_unk_threshold call below
optimization_params = {
    'min_threshold': 0.00001,
    'max_threshold': 0.0001,
    'num': 250,
}
# Basque
basque_optimization_results = optimize_unk_threshold(basque_dataset, metric_funct=micro_f1, **optimization_params)
basque_lemmatized_optimization_results = optimize_unk_threshold(basque_lemmatized_dataset, metric_funct=micro_f1, **optimization_params)

# Japanese
japanese_optimization_results = optimize_unk_threshold(japanese_dataset, metric_funct=micro_f1, **optimization_params)
japanese_lemmatized_optimization_results = optimize_unk_threshold(japanese_lemmatized_dataset, metric_funct=micro_f1, **optimization_params)

# Print results
print('Basque: ', basque_optimization_results['best_threshold'], basque_optimization_results['best_metric'])
print('Basque lemmatized: ', basque_lemmatized_optimization_results['best_threshold'], basque_lemmatized_optimization_results['best_metric'])
print('Japanese: ', japanese_optimization_results['best_threshold'], japanese_optimization_results['best_metric'])
print('Japanese lemmatized: ', japanese_lemmatized_optimization_results['best_threshold'], japanese_lemmatized_optimization_results['best_metric'])

# The string below is a hand-copied record of an earlier run; being the last expression
# of the cell it is also echoed as the cell's output.
# NOTE(review): its numbers differ slightly from what the print calls above last produced
# (e.g. Basque 1.3950131878249618e-05 here vs 1.3821725034787874e-05 printed) — confirm which run is authoritative.
""" OUTPUT:
Basque:  1.3950131878249618e-05 0.82880265
Basque lemmatized:  1.3950131878249618e-05 0.9181573
Japanese:  1e-05 0.91446245
Japanese lemmatized:  1e-05 0.9357858
"""

100%|██████████| 250/250 [02:38<00:00,  1.57it/s]
100%|██████████| 250/250 [02:35<00:00,  1.61it/s]
100%|██████████| 250/250 [02:09<00:00,  1.93it/s]
100%|██████████| 250/250 [02:08<00:00,  1.94it/s]

Basque:  1.3821725034787874e-05 0.82880265
Basque lemmatized:  1.3821725034787874e-05 0.91790825
Japanese:  1e-05 0.9141369
Japanese lemmatized:  1e-05 0.9351347





' OUTPUT:\nBasque:  1.3950131878249618e-05 0.82880265\nBasque lemmatized:  1.3950131878249618e-05 0.9181573\nJapanese:  1e-05 0.91446245\nJapanese lemmatized:  1e-05 0.9357858\n'

#### Analysis of the optimization results

In [3]:
# One (language, lemmatized, results) triple per optimisation run, in plotting order
_optimization_runs = (
    ('basque', False, basque_optimization_results),
    ('basque', True, basque_lemmatized_optimization_results),
    ('japanese', False, japanese_optimization_results),
    ('japanese', True, japanese_lemmatized_optimization_results),
)

# Long-format records: one row per evaluated threshold of each run
optimization_data = [
    {'language': _language, 'lemmatized': _lemmatized, 'threshold': _threshold, 'metric': _score}
    for _language, _lemmatized, _run in _optimization_runs
    for _threshold, _score in zip(_run['search_space'], _run['results'])
]

optimization_df = pd.DataFrame(optimization_data)

# Metric against threshold (log x axis): colour = language, dash = lemmatized or not
fig = px.line(optimization_df, x='threshold', y='metric', color='language', line_dash='lemmatized', log_x=True)
fig.show()

NameError: name 'basque_optimization_results' is not defined

As we expected, as the threshold increases, the performance of the model decreases as more words are replaced by the UNK token and get the same emission probability.

Regarding the languages and their lemmatized versions, we can see that the best threshold is the same for the lemmatized and non-lemmatized dataset of each language.
There is, however, a clear difference between the Basque and Japanese datasets. The models trained on the Basque datasets benefit from lower thresholds, and their performance decreases as the threshold increases. In contrast, the models trained on the Japanese datasets exhibit extremely low performance for low thresholds until the threshold reaches 1e-5, where their performance suddenly jumps to its maximum. From that point on, the score gradually decreases again as the threshold increases.


In [4]:
# Optimization best thresholds
unk_thresholds = {
    'basque': 1.3950131878249618e-05,
    'basque_lemmatized': 1.3950131878249618e-05,
    'japanese': 1e-05,
    'japanese_lemmatized': 1e-05
}

## Train the final models

In [5]:
# Basque models
basque_hmm = HiddenMarkovModel(basque_dataset, unknown_token_threshold=unk_thresholds['basque'])
basque_lemmatized_hmm = HiddenMarkovModel(basque_lemmatized_dataset, unknown_token_threshold=unk_thresholds['basque_lemmatized'])

# Japanese models
japanese_hmm = HiddenMarkovModel(japanese_dataset, unknown_token_threshold=unk_thresholds['japanese'])
japanese_lemmatized_hmm = HiddenMarkovModel(japanese_lemmatized_dataset, unknown_token_threshold=unk_thresholds['japanese_lemmatized'])

## Performance Evaluation

### Predictions

In [6]:
# Basque
basque_predictions = basque_hmm.batch_predict(basque_dataset.test)
basque_lemmatized_predictions = basque_lemmatized_hmm.batch_predict(basque_lemmatized_dataset.test)

# Japanese
japanese_predictions = japanese_hmm.batch_predict(japanese_dataset.test)
japanese_lemmatized_predictions = japanese_lemmatized_hmm.batch_predict(japanese_lemmatized_dataset.test)

### Evaluation

In [14]:
# Score every model against the gold test data of its own dataset variant.
# predictions[0] is used elsewhere in this notebook as the predicted (word, tag) sentences;
# the meaning of predictions[1] is defined by batch_predict — TODO confirm.
eval_metrics_basque = evaluate_dataset(basque_dataset.test.data, basque_predictions[0], basque_predictions[1])
eval_metrics_basque_lemmatized = evaluate_dataset(basque_lemmatized_dataset.test.data, basque_lemmatized_predictions[0], basque_lemmatized_predictions[1])

eval_metrics_japanese = evaluate_dataset(japanese_dataset.test.data, japanese_predictions[0], japanese_predictions[1])
eval_metrics_japanese_lemmatized = evaluate_dataset(japanese_lemmatized_dataset.test.data, japanese_lemmatized_predictions[0], japanese_lemmatized_predictions[1])

#### Convert the evaluation metrics to a dataframe

In [15]:
# Accumulators filled by postprocess_metric_dict: global metrics and per-tag metrics
df_rows = []
df_per_tag_rows = []


def postprocess_metric_dict(metric_keys: list[str], metric_dict, language, lemmatized, postprocess_rows, postprocess_per_tag_rows):
    """Flatten a nested metric dictionary into long-format row dictionaries.

    The function walks ``metric_dict`` recursively. Every leaf value becomes one row
    ``{'Metric', 'Language', 'Lemmatized', 'Value'}`` whose ``Metric`` is the
    ``'-'``-joined path of keys leading to it.

    Args:
        metric_keys: keys already traversed (pass ``[]`` at the top level).
        metric_dict: (possibly nested) mapping of metric names to values.
        language: value stored in the ``Language`` field of every row.
        lemmatized: value stored in the ``Lemmatized`` field of every row.
        postprocess_rows: list that receives the rows of regular metrics (mutated).
        postprocess_per_tag_rows: list that receives every row found below a
            ``'per_tag'`` key (mutated).

    Returns:
        None; results are appended to the two lists.
    """
    for _name, _payload in metric_dict.items():
        # Per-sentence records are handled separately and never flattened here.
        if _name == 'per_sentence':
            continue

        _path = metric_keys + [_name]

        if _name == 'per_tag':
            # Everything under 'per_tag' goes to the per-tag list, which takes the
            # role of the regular target for the rest of that subtree.
            postprocess_metric_dict(_path, _payload, language, lemmatized, postprocess_per_tag_rows, None)
            continue

        if isinstance(_payload, dict):
            postprocess_metric_dict(_path, _payload, language, lemmatized, postprocess_rows, postprocess_per_tag_rows)
            continue

        postprocess_rows.append(dict(
            Metric='-'.join(_path),
            Language=language,
            Lemmatized=lemmatized,
            Value=_payload,
        ))


# Flatten the metrics of the four models into the shared row lists
postprocess_metric_dict([], eval_metrics_basque, 'Basque', False, df_rows, df_per_tag_rows)
postprocess_metric_dict([], eval_metrics_basque_lemmatized, 'Basque', True, df_rows, df_per_tag_rows)
postprocess_metric_dict([], eval_metrics_japanese, 'Japanese', False, df_rows, df_per_tag_rows)
postprocess_metric_dict([], eval_metrics_japanese_lemmatized, 'Japanese', True, df_rows, df_per_tag_rows)

# Only the global metrics become a dataframe here; df_per_tag_rows is filled but not used in this cell
eval_metrics_df = pd.DataFrame(df_rows)

## Evaluation Result Analysis

### Basic Metrics

In [16]:
# Perplexity is left out of the chart — presumably because it does not fit the
# 0.55–1.0 y-range used for the other metrics (TODO confirm).
eval_metrics_df_ = eval_metrics_df[eval_metrics_df['Metric'] != 'perplexity']
# Grouped bars: colour = language, hatch pattern = lemmatized or not
fig = px.bar(eval_metrics_df_, x='Metric', y='Value', color='Language', pattern_shape='Lemmatized', barmode='group', range_y=[0.55, 1.])
fig.show()

The following table shows some metrics of the model evaluation for the Basque data, on both non-lemmatized and lemmatized data. It can be observed that the model performs much better on lemmatized data.

The following table shows some metrics of the model evaluation for the Japanese data, on both non-lemmatized and lemmatized data. It can be observed that, as with Basque, the model performs better on lemmatized data, but the difference is smaller than for Basque.

### Evaluation based on the length of the sentences

##### Confusion matrix

In [17]:


# Per-sentence metrics of the four models (Basque and Japanese, plain and lemmatized).
# The records are deep-copied because the flattening step below edits them in place.
_datasets = [
    {'language': _language, 'lemmatized': _lemmatized, 'dataset': deepcopy(_metrics['per_sentence']), 'sentences': defaultdict(list)}
    for _language, _lemmatized, _metrics in (
        ('basque', False, eval_metrics_basque),
        ('basque', True, eval_metrics_basque_lemmatized),
        ('japanese', False, eval_metrics_japanese),
        ('japanese', True, eval_metrics_japanese_lemmatized),
    )
]

# Flatten every per-sentence record so that it only holds scalar columns
for _entry in _datasets:
    _sentences_by_length = _entry['sentences']
    for _record in _entry['dataset']:
        # Keep the raw sentence aside, indexed by its length, then drop the
        # fields that must not end up in the dataframe
        _sentences_by_length[_record['length']].append(_record.pop('sentence'))
        del _record['per_tag']
        # Replace each nested metric dict by '<metric>-<submetric>' keys
        _nested = [(_key, _value) for _key, _value in _record.items() if isinstance(_value, dict)]
        for _key, _sub_metrics in _nested:
            for _sub_key, _sub_value in _sub_metrics.items():
                _record[f'{_key}-{_sub_key}'] = _sub_value
            del _record[_key]

# One dataframe per model, tagged with its language / lemmatization
_dfs = [
    pd.DataFrame(_entry['dataset']).assign(language=_entry['language'], lemmatized=_entry['lemmatized'])
    for _entry in _datasets
]

# Mean and standard deviation of every metric per (length, lemmatized, language)
df = (
    pd.concat(_dfs)
    .groupby(['length', 'lemmatized', 'language'])
    .agg(['mean', 'std'])
    .reset_index()
)
# Collapse the two-level column index into '<metric>-<stat>' names; the grouping
# keys have an empty second level and keep their plain name
df.columns = [f'{_col[0]}-{_col[1]}' if _col[1] else _col[0] for _col in df.columns if len(_col) > 1]

In [18]:
# Mean accuracy against sentence length: one facet row per language,
# dash style distinguishes lemmatized from non-lemmatized
fig = px.line(
    data_frame=df,
    x='length', y='accuracy-mean',
    line_dash='lemmatized',
    color='language',
    facet_row='language',
)
fig.show()

In [19]:
from pprint import pprint

# Get the Basque predictions for length 3 sentences
# _datasets[0] is Basque non-lemmatized and _datasets[1] is Basque lemmatized, so each
# printed pair shows the original sentence next to its lemmatized form
pprint(list(zip(_datasets[0]['sentences'][3], _datasets[1]['sentences'][3])))

[('Zer egin ?', 'zer egin ?'),
 ('Zer egin ?', 'zer egin ?'),
 ('Zeu zara !', 'zeu izan !'),
 ('Urrutiegi zeuden .', 'urruti egon .'),
 ('Bukatua da !', 'bukatu izan !'),
 ('Asetu da .', 'ase izan .'),
 ('Polita litzateke :', 'polit izan :'),
 ('Zaude lasai .', 'egon lasai .'),
 ('Zer egin ?', 'zer egin ?'),
 ('Erranen dautzuet .', 'erran edun .'),
 ('Harriturik nago .', 'harritu egon .'),
 ('Bota ezan .', 'botatu ezan .')]


### Error analysis

#### Confusion matrix

In [47]:
def get_confusion_matrix_plots(dataset, predictions, dataset_name, lemmatized):
    """Build the raw and the row-normalised confusion-matrix heatmaps of a model.

    Rows of the matrix are gold tags and columns are predicted tags, which is the
    orientation announced by the axis labels (y = "Gold tag", x = "Predicted tag").

    Args:
        dataset: object exposing ``test.data``, an iterable of sentences where each
            sentence is a sequence of ``(word, tag)`` pairs with the gold tags.
        predictions: indexable whose first element holds the predicted sentences,
            in the same ``(word, tag)`` layout and order as ``dataset.test.data``.
        dataset_name: dataset label used in the figure titles.
        lemmatized: whether the dataset is the lemmatized variant (titles only).

    Returns:
        Tuple ``(heatmap, normalised_heatmap)`` of plotly figures. The normalised
        one shows, for each gold tag, the percentage of its tokens assigned to
        each predicted tag (rounded to one decimal).
    """
    _gold_tags = [tag for sentence in dataset.test.data for _, tag in sentence]
    _pred_tags = [tag for sentence in predictions[0] for _, tag in sentence]

    # Predicted tags are included too, so a tag that the model emits but that never
    # occurs in the gold test data cannot raise a KeyError. Sorting makes the axis
    # order identical from run to run.
    _pos_tags = sorted(set(_gold_tags) | set(_pred_tags), key=str)
    _confusion_matrix = pd.DataFrame(
        data=0,
        index=_pos_tags,
        columns=_pos_tags,
    )
    for _gold, _pred in zip(_gold_tags, _pred_tags):
        # .at[row, column]: a single explicit assignment. Chained indexing
        # (frame[a][b] += 1) addresses column a / row b and does not update the
        # frame when pandas Copy-on-Write is active.
        _confusion_matrix.at[_gold, _pred] += 1

    # Normalize each gold-tag row to percentages; rows without tokens become 0
    _confusion_matrix_norm = _confusion_matrix.div(_confusion_matrix.sum(axis=1), axis=0).fillna(0).mul(100).round(1)

    _variant = 'lemmatized' if lemmatized else 'non-lemmatized'
    _axis_labels = {
        'x': "Predicted tag",
        'y': "Gold tag",
        'color': "Freq"
    }

    # Generate the confusion matrix plots
    confusion_matrix_heatmap = px.imshow(
        img=_confusion_matrix,
        text_auto=True,
        aspect="auto",
        labels=dict(_axis_labels),
        title=f"Confusion matrix for {dataset_name} {_variant} dataset"
    )
    confusion_matrix_heatmap.update_xaxes(tickangle=45)

    confusion_matrix_heatmap_norm = px.imshow(
        img=_confusion_matrix_norm,
        text_auto=True,
        aspect="auto",
        labels=dict(_axis_labels),
        title=f"Confusion matrix for {dataset_name} {_variant} dataset (normalized)"
    )
    confusion_matrix_heatmap_norm.update_xaxes(tickangle=45)

    return confusion_matrix_heatmap, confusion_matrix_heatmap_norm


# Generate the confusion matrix plots
# Basque
# Raw and normalised heatmaps for each of the four models
# Basque
(basque_confusion_matrix_heatmap,
 basque_confusion_matrix_norm) = get_confusion_matrix_plots(basque_dataset, basque_predictions, 'Basque', False)
(basque_lemmatized_confusion_matrix_heatmap,
 basque_lemmatized_confusion_matrix_norm) = get_confusion_matrix_plots(basque_lemmatized_dataset, basque_lemmatized_predictions, 'Basque', True)
# Japanese
(japanese_confusion_matrix_heatmap,
 japanese_confusion_matrix_norm) = get_confusion_matrix_plots(japanese_dataset, japanese_predictions, 'Japanese', False)
(japanese_lemmatized_confusion_matrix_heatmap,
 japanese_lemmatized_confusion_matrix_norm) = get_confusion_matrix_plots(japanese_lemmatized_dataset, japanese_lemmatized_predictions, 'Japanese', True)

# Side-by-side comparisons (left: non-lemmatized, right: lemmatized):
# first the raw counts for both languages, then the normalised matrices
_confusion_figures = (
    ("Basque non-lemmatized vs lemmatized",
     basque_confusion_matrix_heatmap, basque_lemmatized_confusion_matrix_heatmap),
    ("Japanese non-lemmatized vs lemmatized",
     japanese_confusion_matrix_heatmap, japanese_lemmatized_confusion_matrix_heatmap),
    ("Basque non-lemmatized vs lemmatized (normalized)",
     basque_confusion_matrix_norm, basque_lemmatized_confusion_matrix_norm),
    ("Japanese non-lemmatized vs lemmatized (normalized)",
     japanese_confusion_matrix_norm, japanese_lemmatized_confusion_matrix_norm),
)
for _title, _left_heatmap, _right_heatmap in _confusion_figures:
    fig = make_subplots(1, 2, horizontal_spacing=0.1)
    # Only the heatmap trace of each figure is reused; the layout is rebuilt here
    fig.add_trace(_left_heatmap.data[0], row=1, col=1)
    fig.add_trace(_right_heatmap.data[0], row=1, col=2)
    fig.update_layout(title_text=_title)
    fig.show()

#### Bigram Error Rates

In [48]:
def get_bigram_error_df(dataset, predictions):
    _pos_tags = set([tag for sentence in dataset.test.data for _, tag in sentence])
    _failed_tag_bigrams_df = pd.DataFrame(
        data=0,
        index=['<BEGIN>'] + list(_pos_tags),
        columns=list(_pos_tags) + ['<END>'],
    )
    _tag_bigrams_apparitions_df = pd.DataFrame(
        data=0,
        index=['<BEGIN>'] + list(_pos_tags),
        columns=list(_pos_tags) + ['<END>'],
    )
    _gold_bigrams = [(tag_1, tag_2) for sentence in dataset.test.data for (_, tag_1), (_, tag_2) in pairwise([(None, '<BEGIN>')] + sentence + [(None, '<END>')])]
    _pred_bigrams = [(tag_1, tag_2) for sentence in predictions[0] for (_, tag_1), (_, tag_2) in pairwise([(None, '<BEGIN>')] + sentence + [(None, '<END>')])]
    for _gold, _pred in zip(_gold_bigrams, _pred_bigrams):
        _tag_bigrams_apparitions_df[_gold[1]][_gold[0]] += 1
        if _gold != _pred:
            _failed_tag_bigrams_df[_gold[1]][_gold[0]] += 1
    # Normalize
    _failed_tag_bigrams_df_norm = _failed_tag_bigrams_df.div(_tag_bigrams_apparitions_df).fillna(0)

    return _failed_tag_bigrams_df, _failed_tag_bigrams_df_norm.mul(100).round(1)


def generate_error_matrix_plot(bigram_error_df, bigram_error_df_norm, language, lemmatized):
    """Create the heatmaps of failed tag bigrams (absolute and normalised).

    Args:
        bigram_error_df: table of failed bigram counts, first tag on the rows and
            second tag on the columns.
        bigram_error_df_norm: same table normalised by bigram occurrences.
        language: language name used in the figure titles.
        lemmatized: whether the dataset is the lemmatized variant (titles only).

    Returns:
        Tuple ``(heatmap, normalised_heatmap)`` of plotly figures.
    """
    _variant = 'lemmatized' if lemmatized else 'non-lemmatized'
    # Options shared by both figures
    _common = dict(
        text_auto=True,
        aspect="auto",
        color_continuous_scale='blues',
    )
    _axis_labels = {
        'x': "Second tag",
        'y': "First tag",
        'color': "Bigram Freq"
    }

    bigram_errors_heatmap = px.imshow(
        img=bigram_error_df,
        labels=dict(_axis_labels),
        title=f"Number of failed tag bigrams in {language} {_variant} dataset",
        **_common
    )

    bigram_errors_norm_heatmap = px.imshow(
        img=bigram_error_df_norm,
        labels=dict(_axis_labels),
        # f-string: without the prefix the placeholders were rendered literally
        title=f"Number of failed tag bigrams in {language} {_variant} dataset normalized by tag bigram apparitions",
        **_common
    )

    return bigram_errors_heatmap, bigram_errors_norm_heatmap

In [None]:
# Generate dataframes
# Each set of predictions is compared with the dataset variant it was produced from,
# as in the other evaluation cells of this notebook.
# Basque
basque_failed_tag_bigrams_df, basque_failed_tag_bigrams_df_norm = get_bigram_error_df(basque_dataset, basque_predictions)
basque_lemmatized_failed_tag_bigrams_df, basque_lemmatized_failed_tag_bigrams_df_norm = get_bigram_error_df(basque_lemmatized_dataset, basque_lemmatized_predictions)
# Japanese
japanese_failed_tag_bigrams_df, japanese_failed_tag_bigrams_df_norm = get_bigram_error_df(japanese_dataset, japanese_predictions)
japanese_lemmatized_failed_tag_bigrams_df, japanese_lemmatized_failed_tag_bigrams_df_norm = get_bigram_error_df(japanese_lemmatized_dataset, japanese_lemmatized_predictions)

# Generate the error matrix plots
basque_failed_tag_bigrams_heatmap, basque_failed_tag_bigrams_norm_heatmap = generate_error_matrix_plot(basque_failed_tag_bigrams_df, basque_failed_tag_bigrams_df_norm, 'Basque', False)
basque_lemmatized_failed_tag_bigrams_heatmap, basque_lemmatized_failed_tag_bigrams_norm_heatmap = generate_error_matrix_plot(basque_lemmatized_failed_tag_bigrams_df, basque_lemmatized_failed_tag_bigrams_df_norm, 'Basque', True)
japanese_failed_tag_bigrams_heatmap, japanese_failed_tag_bigrams_norm_heatmap = generate_error_matrix_plot(japanese_failed_tag_bigrams_df, japanese_failed_tag_bigrams_df_norm, 'Japanese', False)
japanese_lemmatized_failed_tag_bigrams_heatmap, japanese_lemmatized_failed_tag_bigrams_norm_heatmap = generate_error_matrix_plot(japanese_lemmatized_failed_tag_bigrams_df, japanese_lemmatized_failed_tag_bigrams_df_norm, 'Japanese', True)

# Plot in grid subplots (left: non-lemmatized, right: lemmatized)
_bigram_figures = (
    ("Basque non-lemmatized vs lemmatized (not normalized)",
     basque_failed_tag_bigrams_heatmap, basque_lemmatized_failed_tag_bigrams_heatmap),
    ("Basque non-lemmatized vs lemmatized (normalized)",
     basque_failed_tag_bigrams_norm_heatmap, basque_lemmatized_failed_tag_bigrams_norm_heatmap),
    ("Japanese non-lemmatized vs lemmatized (not normalized)",
     japanese_failed_tag_bigrams_heatmap, japanese_lemmatized_failed_tag_bigrams_heatmap),
    ("Japanese non-lemmatized vs lemmatized (normalized)",
     japanese_failed_tag_bigrams_norm_heatmap, japanese_lemmatized_failed_tag_bigrams_norm_heatmap),
)
for _title, _left_heatmap, _right_heatmap in _bigram_figures:
    fig = make_subplots(1, 2, horizontal_spacing=0.1)
    # Only the heatmap trace of each figure is reused; the colour scale is set on the new layout
    fig.add_trace(_left_heatmap.data[0], row=1, col=1)
    fig.add_trace(_right_heatmap.data[0], row=1, col=2)
    fig.update_layout(title_text=_title, coloraxis=dict(colorscale='blues'))
    # Every figure is shown explicitly, including the last one
    fig.show()


#### Error analysis

The following tables show how many times each word has been mislabelled, together with its correct label and the label it was confused with. Looking at the dataset statistics, it can be seen that all the errors originate from annotation differences: because the same word appears in different contexts in different sentences, it is sometimes annotated with different tags, which leads the model to make mistakes with it.

In [None]:
def get_word_error_df(_dataset, predictions):
    """Collect the wrongly tagged words of the test split.

    Args:
        _dataset: object exposing ``test.data``, an iterable of sentences where each
            sentence is a sequence of ``(word, tag)`` pairs with the gold tags.
        predictions: indexable whose first element holds the predicted sentences,
            in the same ``(word, tag)`` layout and order as ``_dataset.test.data``.

    Returns:
        Dataframe with one row per distinct ``(Word, Gold POS tag, Predicted POS tag)``
        error, the number of times it happened (``Count``) and the list of sentences
        in which it happened (``Sentences``), sorted by decreasing ``Count``.
    """
    errors_word = []

    for gold_sentence, pred_sentence in zip(_dataset.test.data, predictions[0]):
        # Full sentence text, kept with every error for later inspection
        sentence = ' '.join([word[0] for word in gold_sentence])
        errors_word.extend(
            (gold[0], gold[1], pred[1], sentence, 1)
            for gold, pred in zip(gold_sentence, pred_sentence)
            if gold[1] != pred[1]
        )

    # Group and add counts
    errors_df = (
        pd.DataFrame(errors_word, columns=['Word', 'Gold POS tag', 'Predicted POS tag', 'Sentences', 'Count'])
        .groupby(['Word', 'Gold POS tag', 'Predicted POS tag'])
        .agg({'Count': 'count', 'Sentences': list})
        .reset_index()
    )
    # Order by count
    return errors_df.sort_values(by='Count', ascending=False)


def plot_error_wordcloud(error_word_df, title, min_freq=1, min_words=5, jp_font=False):
    """Draw one word cloud per (gold tag, predicted tag) confusion.

    Args:
        error_word_df: dataframe with the columns ``'Word'``, ``'Gold POS tag'``,
            ``'Predicted POS tag'`` and ``'Count'``.
        title: overall title of the figure.
        min_freq: only errors with a ``Count`` strictly greater than this value are drawn.
        min_words: tag pairs with fewer distinct words than this are skipped.
        jp_font: when true, render with ``./fonts/NotoSansJP-Medium.ttf`` instead of
            the default word cloud font.

    Returns:
        None; the figure is displayed with ``plt.show()``.
    """
    # Figure size (note: this changes the global matplotlib setting)
    plt.rcParams['figure.figsize'] = [20, 10]

    # Mask for the clouds: 255 outside a circle of radius 130 centred in a 300x300 grid, 0 inside
    grid_rows, grid_cols = np.ogrid[:300, :300]
    outside_circle = (grid_rows - 150) ** 2 + (grid_cols - 150) ** 2 > 130 ** 2
    mask = 255 * outside_circle.astype(int)

    font = './fonts/NotoSansJP-Medium.ttf' if jp_font else None
    wordcloud = WordCloud(
        font_path=font,
        background_color="white",
        mask=mask,
        contour_width=0.1,
        contour_color="black",
        max_font_size=150,
        random_state=42,
        colormap='RdYlGn',
    )

    # Tag inventories come from the unfiltered table; they fix the subplot order
    gold_tags = error_word_df['Gold POS tag'].unique()
    pred_tags = error_word_df['Predicted POS tag'].unique()

    # Drop the low frequency errors
    frequent_errors = error_word_df[error_word_df['Count'] > min_freq]

    # Word -> count dictionary of every tag pair that has enough distinct words
    word_freqs = {}
    for gold_tag in gold_tags:
        for pred_tag in pred_tags:
            is_pair = (frequent_errors['Gold POS tag'] == gold_tag) & (frequent_errors['Predicted POS tag'] == pred_tag)
            pair_errors = frequent_errors[is_pair]
            frequencies = dict(zip(pair_errors['Word'].tolist(), pair_errors['Count'].tolist()))
            if len(frequencies) >= min_words:
                word_freqs[(gold_tag, pred_tag)] = frequencies

    # Four clouds per row; the same WordCloud object is regenerated for every pair
    n_rows = math.ceil(len(word_freqs) / 4)
    for position, ((gold_tag, pred_tag), frequencies) in enumerate(word_freqs.items(), start=1):
        wordcloud.generate_from_frequencies(frequencies)

        ax = plt.subplot(n_rows, 4, position)
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.set_title(f'{gold_tag} -> {pred_tag}')
        ax.axis("off")

    # Overall title
    plt.suptitle(title, fontsize=20)

    plt.show()

In [None]:
# Basque
basque_word_errors_df = get_word_error_df(basque_dataset, basque_predictions)
basque_lemmatized_word_errors_df = get_word_error_df(basque_lemmatized_dataset, basque_lemmatized_predictions)
# Japanese
japanese_word_errors_df = get_word_error_df(japanese_dataset, japanese_predictions)
japanese_lemmatized_word_errors_df = get_word_error_df(japanese_lemmatized_dataset, japanese_lemmatized_predictions)

In [None]:
# Print the word error dataframes
print('Basque non-lemmatized')
display(basque_word_errors_df)
print('Basque lemmatized')
display(basque_lemmatized_word_errors_df)
print('Japanese non-lemmatized')
display(japanese_word_errors_df)
print('Japanese lemmatized')
display(japanese_lemmatized_word_errors_df)

In [None]:
# Generate the wordclouds
plot_error_wordcloud(basque_word_errors_df, 'Basque non-lemmatized')
plot_error_wordcloud(basque_lemmatized_word_errors_df, 'Basque lemmatized')
plot_error_wordcloud(japanese_word_errors_df, 'Japanese non-lemmatized', jp_font=True)
# plot_error_wordcloud(japanese_lemmatized_word_errors_df, 'Japanese lemmatized', jp_font=True)