# Decode the Latin characters

This notebook uses [Unidecode](https://pypi.org/project/Unidecode/) to produce
ASCII transliterations of Unicode text.

In [1]:
import collections

import pandas as pd
from unidecode import unidecode

## Download the table of Latin [graphemes](https://en.wikipedia.org/wiki/Grapheme)

In [2]:
# Scrape every HTML table carrying the CSS class "wikitable" from the page.
# NOTE: despite its name, `df` is a *list* of DataFrames (one per matching
# table), which is why it is indexed positionally in the cells that use it.
df = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_Latin-script_letters',
    attrs={'class': 'wikitable'},
)

### Basic Latin characters [&#x1F517;](https://en.wikipedia.org/wiki/List_of_Latin-script_letters#Basic_Latin)

In [3]:
# First table returned by pd.read_html. The preview shows a single row whose
# 26 cells each hold an upper/lower-case pair ('Aa', 'Bb', ...).
# NOTE(review): the index is positional, so it relies on the table order of
# the live Wikipedia page staying the same.
raw_base = df[0]
raw_base.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,Aa,Bb,Cc,Dd,Ee,Ff,Gg,Hh,Ii,Jj,...,Qq,Rr,Ss,Tt,Uu,Vv,Ww,Xx,Yy,Zz


In [4]:
# Flatten the Basic Latin table into individual letters.
# Every cell holds an upper/lower-case pair such as 'Aa'; iterating over a
# cell string yields its characters, so the nested comprehension produces
# ['A', 'a', 'B', 'b', ...].
# - `.ravel()` replaces `reshape(1, 26)`, so the column count is no longer
#   hard-coded.
# - The comprehension replaces `sum(map(list, ...), [])`, which re-copies the
#   accumulated list on every addition, and avoids binding `base` to two
#   different shapes of data.
base = [
    letter
    for pair in raw_base.T.to_numpy().ravel()
    for letter in pair
]
' '.join(base)

'A a B b C c D d E e F f G g H h I i J j K k L l M m N n O o P p Q q R r S s T t U u V v W w X x Y y Z z'

### Extensions [&#x1F517;](https://en.wikipedia.org/wiki/List_of_Latin-script_letters#Extensions)

In [5]:
# Second table returned by pd.read_html; the preview shows the columns
# 'Letter', 'Name' and 'Notes'.
# NOTE(review): the index is positional, so it relies on the table order of
# the live Wikipedia page staying the same.
raw_extensions = df[1]
raw_extensions.head()

Unnamed: 0,Letter,Name,Notes
0,ᴀ,Small capital A,Nonstandard phonetic symbol; Uralic Phonetic A...
1,Ɐ ɐ ᵄ,Turned A,Near-open central vowel[3]
2,Ɑ ɑ ᵅ,Alpha (script A),"IPA (open back unrounded vowel); Duka, Fe'fe, ..."
3,ꬰ,Barred alpha,Otto Bremer's phonetic transcription system fo...
4,ꭤ,Inverted alpha,Americanist phonetics[5]


In [6]:
# Each 'Letter' cell lists the variants of one letter separated by single
# spaces (e.g. 'Ɐ ɐ ᵄ'). Split every cell and flatten the pieces into one
# list, preserving table order.
# The nested comprehension replaces `sum(list_of_lists, [])`, which re-copies
# the accumulated list on every addition (quadratic in the number of letters),
# and avoids binding `extensions` first to a list of lists and then to a flat
# list.
extensions = [
    letter
    for letters in raw_extensions['Letter'].str.split(' ')
    for letter in letters
]
extensions[:10]

['ᴀ', 'Ɐ', 'ɐ', 'ᵄ', 'Ɑ', 'ɑ', 'ᵅ', 'ꬰ', 'ꭤ', 'Ɒ']

### Letters with diacritics [&#x1F517;](https://en.wikipedia.org/wiki/List_of_Latin-script_letters#Letters_with_diacritics)

In [7]:
# Third table returned by pd.read_html; the preview shows the columns
# 'Letter', 'Name' and 'Notes'.
# NOTE(review): the index is positional, so it relies on the table order of
# the live Wikipedia page staying the same.
raw_diacritics = df[2]
raw_diacritics.head()

Unnamed: 0,Letter,Name,Notes
0,ẚ,A with right half ring,
1,À à,A with grave,"Aghem, Ahlon, Arammba, Awing, Baka, Bali (Adam..."
2,Á á,A with acute,"Afrikaans, Aghem, Ahlon, Arammba, Awing, Bafia..."
3,Â â,A with circumflex,"Awing, Bangolan, Berber, Dutch, Emilian, Ewond..."
4,Ầ ầ,A with circumflex and grave,Vietnamese


In [8]:
# Each 'Letter' cell lists the variants of one letter separated by single
# spaces (e.g. 'À à'). Split every cell and flatten the pieces into one list,
# preserving table order.
# The nested comprehension replaces `sum(list_of_lists, [])`, which re-copies
# the accumulated list on every addition (quadratic in the number of letters),
# and avoids binding `diacritics` first to a list of lists and then to a flat
# list.
diacritics = [
    letter
    for letters in raw_diacritics['Letter'].str.split(' ')
    for letter in letters
]
diacritics[:10]

['ẚ', 'À', 'à', 'Á', 'á', 'Â', 'â', 'Ầ', 'ầ', 'Ấ']

### Ligatures [&#x1F517;](https://en.wikipedia.org/wiki/List_of_Latin-script_letters#Ligatures)

In [9]:
# Fourth table returned by pd.read_html; the preview shows the columns
# 'Letter', 'Name' and 'Notes'.
# NOTE(review): the index is positional, so it relies on the table order of
# the live Wikipedia page staying the same.
raw_ligatures = df[3]
raw_ligatures.head()

Unnamed: 0,Letter,Name,Notes
0,Ꜳ ꜳ 𐞀,AA,Medieval Nordic vowel /aː/;[9] Superscript for...
1,Ꜳ́ ꜳ́,AA with acute,
2,Ꜳ̋ ꜳ̋,AA with double acute,
3,Ꜳ̇ ꜳ̇,AA with dot above,
4,Ꜳ̈ ꜳ̈,AA with diaeresis,


In [10]:
# Each 'Letter' cell lists the variants of one ligature separated by single
# spaces (e.g. 'Ꜳ ꜳ 𐞀'). Split every cell and flatten the pieces into one
# list, preserving table order.
# The nested comprehension replaces `sum(list_of_lists, [])`, which re-copies
# the accumulated list on every addition (quadratic in the number of letters),
# and avoids binding `ligatures` first to a list of lists and then to a flat
# list.
ligatures = [
    letter
    for letters in raw_ligatures['Letter'].str.split(' ')
    for letter in letters
]
ligatures[:10]

['Ꜳ', 'ꜳ', '\U00010780', 'Ꜳ́', 'ꜳ́', 'Ꜳ̋', 'ꜳ̋', 'Ꜳ̇', 'ꜳ̇', 'Ꜳ̈']

## Decode all characters

In [11]:
# Pool the four letter lists, in section order, into the single work list
# that is transliterated next.
to_decode = [*base, *extensions, *diacritics, *ligatures]
print('Number of characters', len(to_decode))

Number of characters 4139


In [12]:
# Bucket every letter under its ASCII transliteration: keys are the strings
# returned by unidecode(), values list the original letters in the order they
# occur in `to_decode`. The result is a plain dict.
decoded = {}

for char in to_decode:
    decoded.setdefault(unidecode(char), []).append(char)

In [15]:
# Print one banner line per ASCII transliteration, followed by every original
# letter that maps to it (space-separated) and a blank line.
for ascii_form, originals in decoded.items():
    banner = ('-- ', ascii_form, ' -- ')
    print(*banner)
    print(' '.join(originals), '\n')

--  A  -- 
A ᴀ Ɐ À Á Â Ầ Ấ Ẫ Ẩ Ã Ã̀ Ã́ Ã̂ Ã̌ Ã̍ Ã̎ Ā Ā̀ Ā́ Ā̂ Ā̃ Ā̃́ Ā̄ Ā̆ Ā̆́ Ā̈ Ā̊ Ā̌ Ă Ằ Ắ Ẵ Ẳ Ȧ Ȧ́ Ǡ Ä Ä́ Ä̀ Ä̂ Ä̃ Ǟ Ǟ̆ Ä̆ Ä̌ Ả Å Å Ǻ Å̂ Å̃ Å̄ Å̄̆ Å̆ A̋ Ǎ A̍ A̎ Ȁ Ȃ A̐ A̓ A̧ À̧ Á̧ Â̧ Ǎ̧ A̭ A̰ À̰ Á̰ Ā̰ Ä̰ Ä̰́ Ą Ą̀ Ą́ Ą̂ Ą̃ Ą̄ Ą̄̀ Ą̄́ Ą̄̂ Ą̄̌ Ą̇ Ą̈ Ą̈̀ Ą̈́ Ą̈̂ Ą̈̌ Ą̈̄ Ą̊ Ą̌ Ą̋ Ą̱ Ą̱̀ Ą̱́ A᷎ A̱ À̱ Á̱ Â̱ Ã̱ Ā̱ Ā̱̀ Ā̱́ Ā̱̂ Ä̱ Ä̱̀ Ä̱́ Ä̱̂ Ä̱̌ Å̱ Ǎ̱ A̱̥ Ạ Ạ́ Ạ̀ Ậ Ạ̃ Ạ̄ Ặ Ạ̈ Ạ̈̀ Ạ̈́ Ạ̈̂ Ạ̈̌ Ạ̌ Ạ̍ A̤ À̤ Á̤ Â̤ Ä̤ Ḁ Ḁ̂ Ḁ̈ A̯ A̩ À̩ Á̩ Â̩ Ã̩ Ā̩ Ǎ̩ A̩̍ A̩̓ A͔ Ā͔ Ⱥ Ⱥ̀ Ⱥ́ 

--  a  -- 
a ɐ ᵄ ɑ ᵅ ɒ ẚ à á â ầ ấ ẫ ẩ ã ã̀ ã́ ã̂ ã̌ ã̍ ã̎ ā ā̀ ā́ ā̂ ā̃ ā̃́ ā̄ ā̆ ā̆́ ā̈ ā̊ ā̌ ă ằ ắ ẵ ẳ ȧ ȧ́ ǡ ä ä́ ä̀ ä̂ ä̃ ǟ ǟ̆ ä̆ ä̌ ả å ǻ å̂ å̃ å̄ å̄̆ å̆ a̋ ǎ a̍ a̎ ȁ ȃ a̐ a̓ a̧ à̧ á̧ â̧ ǎ̧ a̭ a̰ à̰ á̰ ā̰ ä̰ ä̰́ ą ą̀ ą́ ą̂ ą̃ ą̄ ą̄̀ ą̄́ ą̄̂ ą̄̌ ą̇ ą̈ ą̈̀ ą̈́ ą̈̂ ą̈̌ ą̈̄ ą̊ ą̌ ą̋ ą̱ ą̱̀ ą̱́ a᷎ a̱ à̱ á̱ â̱ ã̱ ā̱ ā̱̀ ā̱́ ā̱̂ ä̱ ä̱̀ ä̱́ ä̱̂ ä̱̌ å̱ ǎ̱ a̱̥ ạ ạ́ ạ̀ ậ ạ̃ ạ̄ ặ ạ̈ ạ̈̀ ạ̈́ ạ̈̂ ạ̈̌ ạ̌ ạ̍ a̤ à̤ á̤ â̤ ä̤ ḁ ḁ̂ ḁ̈ a̯ a̩ à̩ á̩ â̩ ã̩ ā̩ ǎ̩ a̩̍ a̩̓ a͔ ā͔ ⱥ ⱥ̀ ⱥ́ ɑ̀ ɑ́ ɑ̂ ɑ̃ ɑ̄ ɑ̆ ɑ̇ ɑ̈ ɑ̊ ɑ̌ 

--  B  -- 
B ʙ ᴃ ᴯ B̀ B́ B̂ B̃