In [1]:
import iso639
import re

# Matches a parenthesised note of the form "(in <word>)" and captures <word>:
# one letter of either case followed by one or more lower-case letters.
lang_patt = re.compile(r'\(\s*in\s+([a-zA-Z][a-z]+)\s*\)')

# Each entry is [captured word (lower-cased), txt, aid, uuid], where
# txt/aid/uuid are the three tab-separated fields of one input line.
bibitems = []
n_unmatched = 0  # rows whose text contains no "(in <word>)" note
with open('in_lang.tsv') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) holds no record and
            # would otherwise break the three-way unpack below.
            continue
        txt, aid, uuid = line.split('\t')
        m = lang_patt.search(txt)
        if m is None:
            # search() returns None when nothing matches; calling .group()
            # on it would raise AttributeError, so skip and count the row.
            n_unmatched += 1
            continue
        lang = m.group(1).lower()
        bibitems.append([lang, txt, aid, uuid])
if n_unmatched:
    # Only reported when rows were actually dropped, so silent data loss is visible.
    print('{} lines skipped: no "(in <word>)" note found'.format(n_unmatched))

In [2]:
# Distinct raw tokens captured from the input, in alphabetical order,
# shown together with their count for manual inspection.
langs = sorted({item[0] for item in bibitems})
print('{} unique tokens'.format(len(langs)))
langs

207 unique tokens


['addition',
 'angstrom',
 'angstroms',
 'appear',
 'arbeit',
 'armenian',
 'basic',
 'beijing',
 'belarusian',
 'billions',
 'bulgarian',
 'bulk',
 'catalan',
 'china',
 'chines',
 'chinese',
 'chineses',
 'chinses',
 'components',
 'coq',
 'costruction',
 'cremonese',
 'croatian',
 'cubes',
 'czech',
 'danish',
 'degrees',
 'delta',
 'deutsch',
 'development',
 'dutch',
 'electrons',
 'energy',
 'english',
 'esperanto',
 'estonian',
 'exabytes',
 'facsimile',
 'farsi',
 'finish',
 'finnish',
 'fortran',
 'franch',
 'french',
 'gauss',
 'geman',
 'general',
 'georgian',
 'gerhardt',
 'germain',
 'german',
 'greek',
 'hebrew',
 'hindi',
 'hindsight',
 'hugarian',
 'hungarian',
 'hungary',
 'hz',
 'icelandic',
 'indonesian',
 'italian',
 'itallian',
 'janpanese',
 'japanase',
 'japanease',
 'japaneese',
 'japanes',
 'japanese',
 'japaneses',
 'japaniese',
 'japanse',
 'japansese',
 'japenese',
 'japense',
 'japnese',
 'java',
 'kelvin',
 'korean',
 'latin',
 'latvian',
 'macedonian',
 '

In [3]:
# Captured tokens that are not language names (for example units, places,
# programming languages and the many spellings of "in press" /
# "in preparation"). Records carrying one of these tokens are discarded.
not_langs = [
    'addition', 'angstrom', 'angstroms', 'appear', 'arbeit',
    'basic', 'beijing', 'billions', 'bulk',
    'china', 'components', 'coq', 'costruction', 'cubes', 'cremonese',
    'degrees', 'delta', 'development',
    'electrons', 'energy', 'exabytes',
    'facsimile', 'fortran',
    'gauss', 'general', 'gerhardt',
    'hindsight', 'hungary', 'hz',
    'java',
    'kelvin',
    'magma', 'magnitude', 'magnitudes', 'mass', 'matlab', 'mean', 'medien',
    'millions', 'minutes', 'modulus',
    'nmr', 'notes',
    'optics',
    'part', 'particular', 'peparation', 'percent', 'perspectives', 'pictures',
    'prees', 'prep', 'prepaparation', 'prepapartion', 'prepapation',
    'preparaion', 'preparaiton', 'preparasion', 'preparation',
    'preparationa', 'preparationb', 'preparations', 'preparaton', 'prepare',
    'preparetion', 'preparing', 'prepartion', 'prepatation', 'prepation',
    'preperation', 'prepration', 'preprint', 'press', 'pressa', 'pressb',
    'pressing', 'presss', 'principle', 'print', 'printa', 'printb',
    'printing', 'process', 'production', 'progress', 'proofs', 'ptess',
    'publication', 'publish', 'python',
    'referee', 'review', 'revision', 'rpess', 'rusia', 'russia',
    'saturation', 'seales', 'section', 'space', 'spacetime', 'spirit',
    'submission', 'supercell',
    'time', 'turn',
    'ukraine',
    'vacuum',
    'wh', 'work', 'writing',
    'years',
]
# Canonical language name -> list of variant spellings (typos and
# alternative names) that should be rewritten to that canonical name.
lang_errors = {
    'chinese': ['chines', 'chineses', 'chinses'],
    'german': ['deutsch', 'germain', 'geman'],
    'finnish': ['finish'],
    'french': ['franch'],
    'hungarian': ['hugarian'],
    'italian': ['itallian'],
    'japanese': [
        'janpanese', 'japanease', 'japaneese', 'japanes', 'japanase',
        'japaneses', 'japaniese', 'japanse', 'japansese', 'japenese',
        'japense', 'japnese',
    ],
    'macedonian': ['makedonian'],
    'farsi': ['persian'],
    'portuguese': ['portuguise', 'protuguese'],
    'russian': [
        'rassian', 'rissian', 'rrussian', 'ruassian', 'rus', 'rusian',
        'russ', 'russain', 'russan', 'russe', 'russiam', 'russian',
        'russiand', 'russin', 'russina', 'russinan', 'russion', 'russisan',
        'russsian', 'rudssian',
    ],
    'romanian': ['roumanian'],
    'slovenian': ['slovene'],
    'ukrainian': ['ukainian', 'ukraininian', 'ukranian', 'ukrainain'],
}

# Flatten into a direct variant -> canonical lookup table.
lang_fixes = {
    variant: canonical
    for canonical, variants in lang_errors.items()
    for variant in variants
}

# Build the cleaned records: drop every record whose captured token is listed
# in not_langs, and replace variant spellings by their canonical name.
# Membership is tested against a set so each record costs a hash lookup
# rather than a scan of the whole not_langs list.
not_lang_tokens = frozenset(not_langs)
bibitems_clean = [
    # Tokens without an entry in lang_fixes are kept unchanged.
    [lang_fixes.get(lang, lang), txt, aid, uuid]
    for lang, txt, aid, uuid in bibitems
    if lang not in not_lang_tokens
]

In [4]:
# Distinct language names that remain after cleaning, in alphabetical
# order, shown together with their count.
langs_clean = sorted({item[0] for item in bibitems_clean})
print('{} unique tokens'.format(len(langs_clean)))
langs_clean

44 unique tokens


['armenian',
 'belarusian',
 'bulgarian',
 'catalan',
 'chinese',
 'croatian',
 'czech',
 'danish',
 'dutch',
 'english',
 'esperanto',
 'estonian',
 'farsi',
 'finnish',
 'french',
 'georgian',
 'german',
 'greek',
 'hebrew',
 'hindi',
 'hungarian',
 'icelandic',
 'indonesian',
 'italian',
 'japanese',
 'korean',
 'latin',
 'latvian',
 'macedonian',
 'marathi',
 'norwegian',
 'polish',
 'portuguese',
 'romanian',
 'russian',
 'sanskrit',
 'serbian',
 'slovak',
 'slovenian',
 'spanish',
 'swedish',
 'turkish',
 'ukrainian',
 'vietnamese']

In [9]:
# Convert every cleaned language name with iso639.to_iso639_1 and print the
# resulting codes as one sorted, comma-separated line.
lang_codes = list(map(iso639.to_iso639_1, langs_clean))
print(', '.join(sorted(lang_codes)))

be, bg, ca, cs, da, de, el, en, eo, es, et, fa, fi, fr, he, hi, hr, hu, hy, id, is, it, ja, ka, ko, la, lv, mk, mr, nl, no, pl, pt, ro, ru, sa, sk, sl, sr, sv, tr, uk, vi, zh


In [6]:
# Write the cleaned data to three files.

# 1) Every cleaned record as one tab-separated row.
with open('in_lang_cleaned.tsv', 'w') as f:
    for bi in bibitems_clean:
        line = '\t'.join(bi)
        f.write(f'{line}\n')

# 2) The distinct values of the third record field, one per line.
#    Iterating a bare set yields an arbitrary order that can differ from one
#    kernel run to the next; sorting makes the file reproducible.
with open('in_lang_cleaned_ids', 'w') as f:
    for aid in sorted({bi[2] for bi in bibitems_clean}):
        f.write(f'{aid}\n')

# 3) The fourth record field of every record, in record order (not de-duplicated).
with open('in_lang_cleaned_uuids', 'w') as f:
    for bi in bibitems_clean:
        uuid = bi[3]
        f.write(f'{uuid}\n')