# Prepare

In [1]:
!pip install -q tqdm
!pip install -q datasets
!pip install -q conllu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
from collections import defaultdict
from functools import lru_cache
import math
import re
import string
from tqdm.auto import tqdm
import datasets

In [3]:
# Lowercase ASCII letters 'a'-'z', taken from string.ascii_lowercase.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
ENGLISH_LETTERS = string.ascii_lowercase

# Functions

In [4]:
def mask_vowels(text, mask=""):
    """Substitute every vowel in ``text`` with ``mask``.

    Matching uses the pattern ``[AEIOU]`` with ``re.IGNORECASE``, so both
    upper- and lower-case vowels are replaced. With the default empty
    ``mask`` the vowels are simply removed.

    Args:
        text: The string to process.
        mask: Replacement handed to the regex substitution for each match.

    Returns:
        A new string in which each matched vowel has been substituted.
    """
    vowel_pattern = re.compile(r"[AEIOU]", flags=re.IGNORECASE)
    return vowel_pattern.sub(mask, text)

In [5]:
def tokens_frequency(dataset, use_tqdm=True):
    """Count how often each whitespace-separated token occurs.

    Args:
        dataset: Iterable of document strings.
        use_tqdm: When true, wrap the iteration in a ``tqdm`` progress bar.

    Returns:
        A plain ``dict`` mapping each token to its occurrence count, with
        keys in order of first appearance.
    """
    documents = tqdm(dataset) if use_tqdm else dataset
    counts = {}
    for text in documents:
        for word in text.split():
            counts[word] = counts.get(word, 0) + 1
    return counts


In [6]:
def characters_frequency(dataset, use_tqdm=True):
    """Count how often each non-whitespace character occurs.

    Each document is split on whitespace first, so whitespace characters
    themselves are never counted.

    Args:
        dataset: Iterable of document strings.
        use_tqdm: When true, wrap the iteration in a ``tqdm`` progress bar.

    Returns:
        A plain ``dict`` mapping each character to its occurrence count,
        with keys in order of first appearance.
    """
    documents = tqdm(dataset) if use_tqdm else dataset
    counts = {}
    for text in documents:
        # Joining the tokens yields their characters in the original order.
        for symbol in "".join(text.split()):
            counts[symbol] = counts.get(symbol, 0) + 1
    return counts

In [7]:
def calculate_entropy(tokens_frequency):
    """Return the Shannon entropy, in bits, of a frequency table.

    Computes ``-sum(p * log2(p))`` where ``p`` is each count divided by the
    sum of all counts. Background:
    https://stackoverflow.com/q/43419803/4412324
    https://stackoverflow.com/a/40496783/4412324

    Args:
        tokens_frequency: Mapping from an item (word, character, ...) to its
            occurrence count.

    Returns:
        The entropy in bits. ``0.0`` is returned when the table is empty or
        holds only zero counts.
    """
    # A zero count contributes nothing (p * log2(p) -> 0 as p -> 0), but
    # math.log2(0) raises ValueError, so such entries are skipped explicitly.
    nonzero_frequencies = [
        frequency for frequency in tokens_frequency.values() if frequency != 0
    ]
    if not nonzero_frequencies:
        return 0.0
    total_number_of_tokens = sum(nonzero_frequencies)
    entropy = -sum(
        (word_frequency / total_number_of_tokens)
        * math.log2(word_frequency / total_number_of_tokens)
        for word_frequency in nonzero_frequencies
    )
    return entropy

In [8]:
def preparation_map(example):
  """Add the two vowel-processed variants of ``example['text']``.

  Writes ``example['consonants']`` (vowels removed) and
  ``example['masked_consonants']`` (every vowel replaced by ``'a'``),
  then returns the same ``example`` object.
  """
  original_text = example['text']
  example['consonants'] = mask_vowels(text=original_text)
  example['masked_consonants'] = mask_vowels(text=original_text, mask='a')
  return example

# Download datasets and calculate entropy

## Wikitext

### Train

In [9]:
# Merge the wikitext-2-v1 train and validation splits, then add the derived text columns.
dataset_train = datasets.concatenate_datasets(
    [
        datasets.load_dataset("wikitext", "wikitext-2-v1", split=split_name)
        for split_name in ('train', 'validation')
    ]
)
dataset_train = dataset_train.map(preparation_map)
dataset_train

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-2-v1 to /root/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /root/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.




Map:   0%|          | 0/40478 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'consonants', 'masked_consonants'],
    num_rows: 40478
})

#### Normal text

In [10]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_train_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_tokens_frequenceis),
    sum(dataset_train_tokens_frequenceis.values()),
)

  0%|          | 0/40478 [00:00<?, ?it/s]

(33277, 2265796)

In [11]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_train_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_tokens_frequenceis,
)
dataset_train_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_words_entropy, dataset_train_chars_entropy)

  0%|          | 0/40478 [00:00<?, ?it/s]

(10.210429390952958, 4.7925732182346055)

#### Consonants

In [12]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_train_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_consonants_tokens_frequenceis),
    sum(dataset_train_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/40478 [00:00<?, ?it/s]

(25538, 2221913)

In [13]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_train_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_consonants_tokens_frequenceis,
)
dataset_train_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_consonants_words_entropy, dataset_train_consonants_chars_entropy)

  0%|          | 0/40478 [00:00<?, ?it/s]

(9.752561120678752, 4.657900736946544)

#### Masked Consonants

In [14]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_train_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_masked_consonants_tokens_frequenceis),
    sum(dataset_train_masked_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/40478 [00:00<?, ?it/s]

(30090, 2265796)

In [15]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_train_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_masked_consonants_tokens_frequenceis,
)
dataset_train_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_masked_consonants_words_entropy, dataset_train_masked_consonants_chars_entropy)

  0%|          | 0/40478 [00:00<?, ?it/s]

(9.99171839397435, 3.9588486884932976)

### Test

In [16]:
# Load the wikitext-2-v1 test split (kept inside a one-element concatenation) and add the derived columns.
dataset_test = datasets.concatenate_datasets(
    [datasets.load_dataset("wikitext", "wikitext-2-v1", split='test')]
)
dataset_test = dataset_test.map(preparation_map)
dataset_test



Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'consonants', 'masked_consonants'],
    num_rows: 4358
})

#### Normal text

In [17]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_test_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_tokens_frequenceis),
    sum(dataset_test_tokens_frequenceis.values()),
)

  0%|          | 0/4358 [00:00<?, ?it/s]

(14142, 241211)

In [18]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_test_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_tokens_frequenceis,
)
dataset_test_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_words_entropy, dataset_test_chars_entropy)

  0%|          | 0/4358 [00:00<?, ?it/s]

(9.523078537786553, 4.8100252172961335)

#### Consonants

In [19]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_test_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_consonants_tokens_frequenceis),
    sum(dataset_test_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/4358 [00:00<?, ?it/s]

(11493, 236879)

In [20]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_test_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_consonants_tokens_frequenceis,
)
dataset_test_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_consonants_words_entropy, dataset_test_consonants_chars_entropy)

  0%|          | 0/4358 [00:00<?, ?it/s]

(9.132135460311597, 4.6617011161850215)

#### Masked Consonants

In [21]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_test_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_masked_consonants_tokens_frequenceis),
    sum(dataset_test_masked_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/4358 [00:00<?, ?it/s]

(13120, 241211)

In [22]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_test_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_masked_consonants_tokens_frequenceis,
)
dataset_test_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_masked_consonants_words_entropy, dataset_test_masked_consonants_chars_entropy)

  0%|          | 0/4358 [00:00<?, ?it/s]

(9.33702533888673, 3.9894762519701445)

## IMDB

### Train

In [None]:
# Load the imdb train split, then add the derived text columns via preparation_map.
dataset_train = datasets.load_dataset(
    "imdb",
    split='train',
)
dataset_train = dataset_train.map(preparation_map)
dataset_train

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_train_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_tokens_frequenceis),
    sum(dataset_train_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_train_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_tokens_frequenceis,
)
dataset_train_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_words_entropy, dataset_train_chars_entropy)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_train_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_consonants_tokens_frequenceis),
    sum(dataset_train_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_train_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_consonants_tokens_frequenceis,
)
dataset_train_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_consonants_words_entropy, dataset_train_consonants_chars_entropy)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_train_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_masked_consonants_tokens_frequenceis),
    sum(dataset_train_masked_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_train_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_masked_consonants_tokens_frequenceis,
)
dataset_train_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_masked_consonants_words_entropy, dataset_train_masked_consonants_chars_entropy)

### Test

In [None]:
# Load the imdb test split, then add the derived text columns via preparation_map.
dataset_test = datasets.load_dataset(
    "imdb",
    split='test',
)
dataset_test = dataset_test.map(preparation_map)
dataset_test

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_test_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_tokens_frequenceis),
    sum(dataset_test_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_test_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_tokens_frequenceis,
)
dataset_test_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_words_entropy, dataset_test_chars_entropy)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_test_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_consonants_tokens_frequenceis),
    sum(dataset_test_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_test_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_consonants_tokens_frequenceis,
)
dataset_test_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_consonants_words_entropy, dataset_test_consonants_chars_entropy)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_test_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_masked_consonants_tokens_frequenceis),
    sum(dataset_test_masked_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_test_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_masked_consonants_tokens_frequenceis,
)
dataset_test_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_masked_consonants_words_entropy, dataset_test_masked_consonants_chars_entropy)

## AGNews

### Train

In [None]:
# Load the ag_news train split, then add the derived text columns via preparation_map.
dataset_train = datasets.load_dataset(
    "ag_news",
    split='train',
)
dataset_train = dataset_train.map(preparation_map)
dataset_train

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_train_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_tokens_frequenceis),
    sum(dataset_train_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_train_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_tokens_frequenceis,
)
dataset_train_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_words_entropy, dataset_train_chars_entropy)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_train_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_consonants_tokens_frequenceis),
    sum(dataset_train_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_train_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_consonants_tokens_frequenceis,
)
dataset_train_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_consonants_words_entropy, dataset_train_consonants_chars_entropy)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_train_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_masked_consonants_tokens_frequenceis),
    sum(dataset_train_masked_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_train_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_masked_consonants_tokens_frequenceis,
)
dataset_train_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_masked_consonants_words_entropy, dataset_train_masked_consonants_chars_entropy)

### Test

In [None]:
# Load the ag_news test split, then add the derived text columns via preparation_map.
dataset_test = datasets.load_dataset(
    "ag_news",
    split='test',
)
dataset_test = dataset_test.map(preparation_map)
dataset_test



Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'consonants', 'masked_consonants'],
    num_rows: 7600
})

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_test_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_tokens_frequenceis),
    sum(dataset_test_tokens_frequenceis.values()),
)

  0%|          | 0/7600 [00:00<?, ?it/s]

(40819, 286690)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_test_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_tokens_frequenceis,
)
dataset_test_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_words_entropy, dataset_test_chars_entropy)

  0%|          | 0/7600 [00:00<?, ?it/s]

(11.799924745419073, 4.8390095425459)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_test_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_consonants_tokens_frequenceis),
    sum(dataset_test_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/7600 [00:00<?, ?it/s]

(34178, 279573)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_test_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_consonants_tokens_frequenceis,
)
dataset_test_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_consonants_words_entropy, dataset_test_consonants_chars_entropy)

  0%|          | 0/7600 [00:00<?, ?it/s]

(11.371770233733072, 4.679847172790962)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_test_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_masked_consonants_tokens_frequenceis),
    sum(dataset_test_masked_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/7600 [00:00<?, ?it/s]

(38046, 286690)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_test_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_masked_consonants_tokens_frequenceis,
)
dataset_test_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_masked_consonants_words_entropy, dataset_test_masked_consonants_chars_entropy)

  0%|          | 0/7600 [00:00<?, ?it/s]

(11.559819202364775, 3.943109869136)

## EWT

### Train

In [None]:
# Merge the universal_dependencies en_ewt train and validation splits, then add the derived text columns.
dataset_train = datasets.concatenate_datasets(
    [
        datasets.load_dataset("universal_dependencies", "en_ewt", split=split_name)
        for split_name in ('train', 'validation')
    ]
)
dataset_train = dataset_train.map(preparation_map)
dataset_train

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_train_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_tokens_frequenceis),
    sum(dataset_train_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_train_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_tokens_frequenceis,
)
dataset_train_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_words_entropy, dataset_train_chars_entropy)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_train_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_consonants_tokens_frequenceis),
    sum(dataset_train_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_train_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_consonants_tokens_frequenceis,
)
dataset_train_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_consonants_words_entropy, dataset_train_consonants_chars_entropy)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_train_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_masked_consonants_tokens_frequenceis),
    sum(dataset_train_masked_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_train_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_masked_consonants_tokens_frequenceis,
)
dataset_train_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_masked_consonants_words_entropy, dataset_train_masked_consonants_chars_entropy)

### Test

In [None]:
# Load the universal_dependencies en_ewt test split (kept inside a one-element concatenation) and add the derived columns.
dataset_test = datasets.concatenate_datasets(
    [datasets.load_dataset("universal_dependencies", "en_ewt", split='test')]
)
dataset_test = dataset_test.map(preparation_map)
dataset_test

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_test_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_tokens_frequenceis),
    sum(dataset_test_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_test_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_tokens_frequenceis,
)
dataset_test_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_words_entropy, dataset_test_chars_entropy)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_test_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_consonants_tokens_frequenceis),
    sum(dataset_test_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_test_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_consonants_tokens_frequenceis,
)
dataset_test_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_consonants_words_entropy, dataset_test_consonants_chars_entropy)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_test_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_masked_consonants_tokens_frequenceis),
    sum(dataset_test_masked_consonants_tokens_frequenceis.values()),
)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_test_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_masked_consonants_tokens_frequenceis,
)
dataset_test_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_masked_consonants_words_entropy, dataset_test_masked_consonants_chars_entropy)

## CoNLL++ (conllpp)

### Train

In [None]:
# Merge the conllpp train and validation splits into one corpus.
dataset_train = datasets.concatenate_datasets([
    datasets.load_dataset("conllpp",split='train'),
    datasets.load_dataset("conllpp",split='validation'),
  ]
)
# preparation_map reads example['text']. Per its dataset card, conllpp provides
# pre-tokenised sentences in a 'tokens' column and no 'text' column, so the
# sentence is rebuilt by joining the tokens with single spaces when needed.
if 'text' not in dataset_train.column_names:
    dataset_train = dataset_train.map(
        lambda example: {'text': ' '.join(example['tokens'])}
    )
dataset_train = dataset_train.map(preparation_map)
dataset_train

Downloading builder script:   0%|          | 0.00/87.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.33M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/191k [00:00<?, ?B/s]

Downloading and preparing dataset universal_dependencies/en_ewt to /root/.cache/huggingface/datasets/universal_dependencies/en_ewt/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/12543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2077 [00:00<?, ? examples/s]

Dataset universal_dependencies downloaded and prepared to /root/.cache/huggingface/datasets/universal_dependencies/en_ewt/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7. Subsequent calls will reuse this data.




Map:   0%|          | 0/14545 [00:00<?, ? examples/s]

Dataset({
    features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc', 'consonants', 'masked_consonants'],
    num_rows: 14545
})

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_train_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_tokens_frequenceis),
    sum(dataset_train_tokens_frequenceis.values()),
)

  0%|          | 0/14545 [00:00<?, ?it/s]

(32273, 199040)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_train_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_tokens_frequenceis,
)
dataset_train_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_words_entropy, dataset_train_chars_entropy)

  0%|          | 0/14545 [00:00<?, ?it/s]

(11.118721024275626, 4.761688566082719)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_train_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_consonants_tokens_frequenceis),
    sum(dataset_train_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/14545 [00:00<?, ?it/s]

(27305, 191482)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_train_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_consonants_tokens_frequenceis,
)
dataset_train_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_consonants_words_entropy, dataset_train_consonants_chars_entropy)

  0%|          | 0/14545 [00:00<?, ?it/s]

(10.644075077018593, 4.602155349196826)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_train_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_train['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_train_masked_consonants_tokens_frequenceis),
    sum(dataset_train_masked_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/14545 [00:00<?, ?it/s]

(30376, 199040)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_train_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_train_masked_consonants_tokens_frequenceis,
)
dataset_train_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_train['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_train_masked_consonants_words_entropy, dataset_train_masked_consonants_chars_entropy)

  0%|          | 0/14545 [00:00<?, ?it/s]

(10.826853462818253, 3.8712313242205165)

### Test

In [None]:
# Load the conllpp test split (kept inside a one-element concatenation).
dataset_test = datasets.concatenate_datasets([
    datasets.load_dataset("conllpp", split='test'),
  ]
)
# preparation_map reads example['text']. Per its dataset card, conllpp provides
# pre-tokenised sentences in a 'tokens' column and no 'text' column, so the
# sentence is rebuilt by joining the tokens with single spaces when needed.
if 'text' not in dataset_test.column_names:
    dataset_test = dataset_test.map(
        lambda example: {'text': ' '.join(example['tokens'])}
    )
dataset_test = dataset_test.map(preparation_map)
dataset_test



Map:   0%|          | 0/2077 [00:00<?, ? examples/s]

Dataset({
    features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc', 'consonants', 'masked_consonants'],
    num_rows: 2077
})

#### Normal text

In [None]:
# Count whitespace-separated tokens in the unmodified 'text' column.
dataset_test_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['text'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_tokens_frequenceis),
    sum(dataset_test_tokens_frequenceis.values()),
)

  0%|          | 0/2077 [00:00<?, ?it/s]

(6900, 21533)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'text' column.
dataset_test_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_tokens_frequenceis,
)
dataset_test_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['text']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_words_entropy, dataset_test_chars_entropy)

  0%|          | 0/2077 [00:00<?, ?it/s]

(10.500824859789402, 4.873337926633139)

#### Consonants

In [None]:
# Count tokens in the 'consonants' column (text with vowels removed).
dataset_test_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_consonants_tokens_frequenceis),
    sum(dataset_test_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/2077 [00:00<?, ?it/s]

(6088, 20636)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'consonants' column.
dataset_test_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_consonants_tokens_frequenceis,
)
dataset_test_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_consonants_words_entropy, dataset_test_consonants_chars_entropy)

  0%|          | 0/2077 [00:00<?, ?it/s]

(10.111984459697462, 4.747560587518187)

#### Masked Consonants

In [None]:
# Count tokens in the 'masked_consonants' column (text with every vowel replaced by 'a').
dataset_test_masked_consonants_tokens_frequenceis = tokens_frequency(
    dataset=dataset_test['masked_consonants'],
)
# Cell output: (distinct tokens, total tokens).
(
    len(dataset_test_masked_consonants_tokens_frequenceis),
    sum(dataset_test_masked_consonants_tokens_frequenceis.values()),
)

  0%|          | 0/2077 [00:00<?, ?it/s]

(6578, 21533)

In [None]:
# Word entropy reuses the token counts; character entropy recounts the 'masked_consonants' column.
dataset_test_masked_consonants_words_entropy = calculate_entropy(
    tokens_frequency=dataset_test_masked_consonants_tokens_frequenceis,
)
dataset_test_masked_consonants_chars_entropy = calculate_entropy(
    tokens_frequency=characters_frequency(dataset=dataset_test['masked_consonants']),
)
# Cell output: (word entropy, character entropy).
(dataset_test_masked_consonants_words_entropy, dataset_test_masked_consonants_chars_entropy)

  0%|          | 0/2077 [00:00<?, ?it/s]

(10.237897280837727, 3.9930801646494034)