In [2]:
import datasets
from english_consonants.processing import (
    characters_frequency,
    calculate_entropy,
    tokens_frequency,
    mask_vowels,
)

In [3]:
dataset = datasets.load_dataset('bentrevett/multi30k')
dataset

Found cached dataset json (/home/majed_alshaibani/.cache/huggingface/datasets/bentrevett___json/bentrevett--multi30k-951fb90011086fcf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [4]:
def consonants_only_map(example):
    example['consonants_en'] = mask_vowels(text=example['en'])
    return example

def masked_vowels_map(example):
    example['masked_vowels_en'] = mask_vowels(text=example['en'],mask="#")
    return example

dataset = dataset.map(consonants_only_map)
dataset = dataset.map(masked_vowels_map)
dataset

Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/bentrevett___json/bentrevett--multi30k-951fb90011086fcf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-7f99eec438466d6f.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/bentrevett___json/bentrevett--multi30k-951fb90011086fcf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-c113fa8925857989.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/bentrevett___json/bentrevett--multi30k-951fb90011086fcf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-0d2c27cf330b1faa.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/bentrevett___json/bentrevett--multi30k-951fb90011086fcf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-720bb1849589dd19.arrow
Loading cached processed dat

DatasetDict({
    train: Dataset({
        features: ['en', 'de', 'consonants_en', 'masked_vowels_en'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de', 'consonants_en', 'masked_vowels_en'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de', 'consonants_en', 'masked_vowels_en'],
        num_rows: 1000
    })
})

In [5]:
dataset['train']['en'][0],dataset['train']['consonants_en'][0],dataset['train']['masked_vowels_en'][0]

('Two young, White males are outside near many bushes.',
 'Tw yng, Wht mls r tsd nr mny bshs.',
 'Tw# y##ng, Wh#t# m#l#s #r# ##ts#d# n##r m#ny b#sh#s.')

## Tokens Stats

In [6]:
normal_english_tokens_frequencies = tokens_frequency(dataset=tuple(dataset['train']['en']))
len(normal_english_tokens_frequencies.keys()),sum(normal_english_tokens_frequencies.values())

  0%|          | 0/29000 [00:00<?, ?it/s]

(15456, 345020)

In [7]:
normal_english_chars_frequencies = characters_frequency(dataset=tuple(dataset['train']['en']))
len(normal_english_chars_frequencies.keys()),sum(normal_english_chars_frequencies.values())

  0%|          | 0/29000 [00:00<?, ?it/s]

(79, 1456216)

In [8]:
consonants_only_english_tokens_frequencies = tokens_frequency(
    dataset=tuple(dataset["train"]["consonants_en"])
)
len(consonants_only_english_tokens_frequencies.keys()), sum(
    consonants_only_english_tokens_frequencies.values()
)

  0%|          | 0/29000 [00:00<?, ?it/s]

(12844, 295833)

In [9]:
consonants_only_english_chars_frequencies = characters_frequency(
    dataset=tuple(dataset["train"]["consonants_en"])
)
len(consonants_only_english_chars_frequencies.keys()), sum(
    consonants_only_english_chars_frequencies.values()
)

  0%|          | 0/29000 [00:00<?, ?it/s]

(69, 912011)

In [10]:
masked_vowels_english_tokens_frequencies = tokens_frequency(
    dataset=tuple(dataset["train"]["masked_vowels_en"])
)
len(masked_vowels_english_tokens_frequencies.keys()), sum(
    masked_vowels_english_tokens_frequencies.values()
)

  0%|          | 0/29000 [00:00<?, ?it/s]

(14394, 345020)

In [11]:
masked_vowels_english_chars_frequencies = characters_frequency(
    dataset=tuple(dataset["train"]["masked_vowels_en"])
)
len(masked_vowels_english_chars_frequencies.keys()), sum(
    masked_vowels_english_chars_frequencies.values()
)

  0%|          | 0/29000 [00:00<?, ?it/s]

(69, 1456216)

## Entropy

In [12]:
calculate_entropy(tokens_frequency=normal_english_tokens_frequencies),\
calculate_entropy(tokens_frequency=normal_english_chars_frequencies)

(8.856201849300316, 4.386301597936478)

In [13]:
calculate_entropy(tokens_frequency=consonants_only_english_tokens_frequencies),\
calculate_entropy(tokens_frequency=consonants_only_english_chars_frequencies)

(9.025648540722566, 4.075140430496584)

In [14]:
calculate_entropy(tokens_frequency=masked_vowels_english_tokens_frequencies),\
calculate_entropy(tokens_frequency=masked_vowels_english_chars_frequencies)

(8.509751097224115, 3.5055741938619325)

## More extrapolation

In [19]:
from collections import defaultdict

vowels_only_words = defaultdict(int)

for document in dataset['train']['en']:
    for word in document.split():
        if set(word.lower()).issubset('aioue'):
            print(document)

Several men in hard hats are operating a giant pulley system.
A little girl climbing into a wooden playhouse.
A little girl climbing into a wooden playhouse.
A man in a blue shirt is standing on a ladder cleaning a window.
A man in a blue shirt is standing on a ladder cleaning a window.
A man in a blue shirt is standing on a ladder cleaning a window.
A man in a blue shirt is standing on a ladder cleaning a window.
A man in green holds a guitar while the other man observes his shirt.
A man in green holds a guitar while the other man observes his shirt.
A man is smiling at a stuffed lion
A man is smiling at a stuffed lion
A trendy girl talking on her cellphone while gliding slowly down the street.
A woman with a large purse is walking by a gate.
A woman with a large purse is walking by a gate.
A woman with a large purse is walking by a gate.
A ballet class of five girls jumping in sequence.
Four guys three wearing hats one not are jumping at the top of a staircase.
A black dog and a spot

In [18]:
vowels_only_words

defaultdict(int,
            {'a': 31704,
             'A': 17457,
             'i': 3,
             'U': 1,
             'I': 16,
             'OU': 1,
             'Au': 1,
             'e': 1,
             'AAA': 1,
             'O': 2})