In [1]:
from transformers import AutoTokenizer

# One tokeniser per Hugging Face checkpoint, keyed by the short model name:
# the part of the identifier after the last "/" (the whole identifier when it
# contains no "/").
tokenisers = {
    checkpoint.split("/")[-1]: AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (
        "MLRS/BERTu",
        "MLRS/mBERTu",
        "bert-base-multilingual-cased",
        "xlm-roberta-base",
        "cis-lmu/Glot500",
    )
}

  from .autonotebook import tqdm as notebook_tqdm


In [48]:
from datasets import load_dataset

# Corpora to compare, keyed by variant name. "none" is read from the
# mapa_simplified_downsampled directory; every other key is the train.json
# found under datasets/<key>/ANERcorp-CamelLabSplits/.
datasets = dict(
    none=load_dataset(
        "json",
        split="train",
        data_dir="datasets/mapa_simplified_downsampled",
    ),
)

for data_name in (
    "original_cleaned",
    "buckwalter",
    "uroman",
    "rules_simple",
    "rules_diacritised",
    "rules",
    "google_translate",
):
    datasets[data_name] = load_dataset(
        "json",
        split="train",
        data_files={"train": f"datasets/{data_name}/ANERcorp-CamelLabSplits/train.json"},
    )

Generating train split: 3973 examples [00:00, 69580.49 examples/s]
Generating train split: 3973 examples [00:00, 79404.04 examples/s]
Generating train split: 3973 examples [00:00, 65957.52 examples/s]
Generating train split: 3973 examples [00:00, 61915.85 examples/s]
Generating train split: 3973 examples [00:00, 69680.86 examples/s]
Generating train split: 3973 examples [00:00, 78914.82 examples/s]


In [None]:
from datasets import load_dataset

# Sentiment-analysis corpora, keyed by variant name. Each CSV is read without
# a header row, using the column names "label" and "text".
# "none" is downloaded from the typology_of_crosslingual GitHub repository;
# the host must be raw.githubusercontent.com (it was previously misspelt, so
# the download could not resolve).
datasets = {
    "none": load_dataset("csv", names=["label", "text"],
                         data_files={"train": "https://raw.githubusercontent.com/jerbarnes/typology_of_crosslingual/master/data/sentiment/mt/train.csv"},
                         split="train")
}

# Every other variant is read from its local datasets/<key>/sentiment_analysis
# directory.
for data_name in ("original_cleaned", "buckwalter", "uroman", "rules_simple", "rules_diacritised", "rules", "google_translate"):
    datasets[data_name] = load_dataset("csv", names=["label", "text"],
                                       data_dir=f"datasets/{data_name}/sentiment_analysis",
                                       split="train")

In [40]:
from datasets import load_dataset

# Sentiment-analysis corpora, keyed by variant name. Each CSV is read without
# a header row, using the column names "label" and "text". "none" comes from
# the remote CSV; the other keys come from datasets/<key>/sentiment_analysis.
datasets = dict(
    none=load_dataset(
        "csv",
        names=["label", "text"],
        split="train",
        data_files={"train": "https://raw.githubusercontent.com/jerbarnes/typology_of_crosslingual/master/data/sentiment/mt/train.csv"},
    ),
)

for data_name in (
    "original_cleaned",
    "buckwalter",
    "uroman",
    "rules_simple",
    "rules_diacritised",
    "rules",
    "google_translate",
):
    datasets[data_name] = load_dataset(
        "csv",
        names=["label", "text"],
        split="train",
        data_dir=f"datasets/{data_name}/sentiment_analysis",
    )

In [49]:
from statistics import mean
from camel_tools.tokenizers.word import simple_word_tokenize

def tokeniser_statistics(data, tokeniser):
    """Compute per-word sub-token statistics for ``tokeniser`` over ``data``.

    Args:
        data: Iterable of mapping-like instances. An instance that has a
            ``"tokens"`` entry contributes those words as-is; otherwise its
            ``"text"`` entry is split into words with ``simple_word_tokenize``.
        tokeniser: Object exposing ``batch_encode_plus`` and ``unk_token_id``.

    Returns:
        A ``(mean_length, unk_percentage)`` tuple: the mean of the encoded
        lengths reported for each word, and the percentage of words whose
        encoded ids contain ``tokeniser.unk_token_id``.

    Raises:
        statistics.StatisticsError: If no lengths are produced (``mean`` of an
            empty sequence).
    """
    words = []
    for instance in data:
        if "tokens" in instance:
            words.extend(instance["tokens"])
        else:
            # Raw-text corpora (e.g. label/text CSVs) carry no pre-split words.
            words.extend(simple_word_tokenize(instance["text"]))

    # Each word is encoded as its own input, without special tokens, so the
    # reported lengths count only the pieces produced for that word.
    encoded = tokeniser.batch_encode_plus(words, return_length=True, add_special_tokens=False)

    unk_id = tokeniser.unk_token_id
    unk_words = sum(unk_id in ids for ids in encoded["input_ids"])
    lengths = encoded["length"]
    return mean(lengths), ((unk_words / len(lengths)) * 100)

# Emit one comma-separated row per (dataset, tokeniser) pair:
# tokeniser name, dataset name, mean encoded length per word, unknown-word percentage.
for dataset_name, dataset in datasets.items():
    for tokeniser_name, tokeniser in tokenisers.items():
        fertility, unks = tokeniser_statistics(dataset, tokeniser)
        print(tokeniser_name, dataset_name, fertility, unks, sep=",")

BERTu,none,1.2621276265195944,0.0
mBERTu,none,2.037596613197654,0.47189963879286906
bert-base-multilingual-cased,none,2.0165261972268613,5.029712199479551
xlm-roberta-base,none,2.1492989474501885,0.0
Glot500,none,1.6110420631529887,0.0
BERTu,original_cleaned,4.293781060684876,3.081860159895004
mBERTu,original_cleaned,1.991309010299543,0.0
bert-base-multilingual-cased,original_cleaned,1.991309010299543,0.0
xlm-roberta-base,original_cleaned,1.60828124874957,0.0
Glot500,original_cleaned,1.5027809566490873,0.0
BERTu,buckwalter,2.8560784910009045,0.0008002752947013773
mBERTu,buckwalter,2.789727666317213,0.0
bert-base-multilingual-cased,buckwalter,2.789727666317213,0.0
xlm-roberta-base,buckwalter,2.7039541602311195,0.0
Glot500,buckwalter,2.4733308258040765,0.0
BERTu,uroman,2.7900237681762525,0.0008002752947013773
mBERTu,uroman,2.6848275806877564,0.0
bert-base-multilingual-cased,uroman,2.6848275806877564,0.0
xlm-roberta-base,uroman,2.6406683899261347,0.0
Glot500,uroman,2.4005297822450924,0.0
