In [1]:
# Load data
import iso639
import pandas as pd

# File name of the DOAJ metadata CSV export, kept in one named place.
DOAJ_METADATA_CSV = 'DOAJ_metadata_20200703_0835_utf8.csv'

# One row per record of the export; later cells read its
# 'Full text language' column.
df = pd.read_csv(DOAJ_METADATA_CSV)

In [4]:
# Build set / dict
#
# Walks the comma-separated 'Full text language' column of `df` and fills:
#   doaj_lang_set                     - every distinct lower-cased language name
#   doaj_lang_counts                  - rows per language, English always counted
#   doaj_lang_counts_no_en_secnd      - rows per language, English dropped from
#                                       rows that also list another language
#   doaj_lang_count_total             - sum of all doaj_lang_counts values
#   doaj_lang_count_total_no_en_secnd - sum of all doaj_lang_counts_no_en_secnd values
doaj_lang_set = set()
doaj_lang_counts = dict()
doaj_lang_counts_no_en_secnd = dict()
doaj_lang_count_total = 0
doaj_lang_count_total_no_en_secnd = 0
for lang_str in df['Full text language'].to_list():
    # Only text cells can be parsed; anything else (presumably NaN for a
    # missing value) is skipped.
    if not isinstance(lang_str, str):
        continue
    lang_str = lang_str.replace('Modern Greek (1453-)', 'Greek')  # Greek fix
    lang_str = lang_str.replace(' (macrolanguage)', '')  # Swahili, Malay, Nepali fix
    # Capitalization is inconsistent, so compare lower-cased names. Empty
    # tokens (e.g. from a trailing or doubled comma) are not languages and
    # are dropped so they cannot inflate the counts.
    stripped_names = (part.strip().lower() for part in lang_str.split(','))
    lang_set_curr = {name for name in stripped_names if name}
    # In-place update avoids rebuilding the accumulated set on every row.
    doaj_lang_set.update(lang_set_curr)
    # count English always
    for lang in lang_set_curr:
        doaj_lang_counts[lang] = doaj_lang_counts.get(lang, 0) + 1
    doaj_lang_count_total += len(lang_set_curr)
    # only count English if it's not an additional language
    # (a derived set is used so the row's full language set stays intact)
    if len(lang_set_curr) > 1:
        lang_set_no_en_secnd = lang_set_curr - {'english'}
    else:
        lang_set_no_en_secnd = lang_set_curr
    for lang in lang_set_no_en_secnd:
        doaj_lang_counts_no_en_secnd[lang] = doaj_lang_counts_no_en_secnd.get(lang, 0) + 1
    doaj_lang_count_total_no_en_secnd += len(lang_set_no_en_secnd)

In [5]:
# Convert to language codes
# Two-letter codes of the cited languages; they are compared below against
# the codes that iso639.to_iso639_1 returns for the DOAJ language names.
cited_codes = {
    'be', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'et',
    'fa', 'fi', 'fr', 'he', 'hi', 'hr', 'hu', 'hy', 'id', 'is', 'it',
    'ja', 'ka', 'ko', 'la', 'lv', 'mk', 'mr', 'nl', 'no', 'pl', 'pt',
    'ro', 'ru', 'sa', 'sk', 'sl', 'sr', 'sv', 'tr', 'uk', 'vi', 'zh',
}

# Map every DOAJ language name to its code. Names the iso639 package
# rejects are printed for inspection and left out of the code set.
doaj_codes = set()
for lang in doaj_lang_set:
    try:
        code = iso639.to_iso639_1(lang)
    except iso639.NonExistentLanguageError:
        print(lang)
    else:
        doaj_codes.add(code)

serbo-croatian


In [6]:
# Language coverage
# Unweighted view: how many distinct DOAJ languages have a code that is
# also in the cited set, relative to all distinct DOAJ language names
# (including names for which no code was found).
shared_codes = cited_codes & doaj_codes
missing_codes = cited_codes - doaj_codes
n_shared = len(shared_codes)
n_doaj_langs = len(doaj_lang_set)

print('DOAJ langs: {}'.format(n_doaj_langs))
print('DOAJ langs w/ iso639 code: {}'.format(len(doaj_codes)))
print('cited langs: {}'.format(len(cited_codes)))
print('Intersection: {}'.format(n_shared))
print('Difference: {}'.format(missing_codes))
print('{}/{} = {:.2f}%'.format(n_shared, n_doaj_langs, (n_shared / n_doaj_langs) * 100))

DOAJ langs: 78
DOAJ langs w/ iso639 code: 77
cited langs: 44
Intersection: 43
Difference: {'mr'}
43/78 = 55.13%


In [7]:
# Weighted language coverage
# Each language contributes its occurrence count from doaj_lang_counts;
# the covered share is taken over all counted occurrences.
covered_count = 0
for lang, count in doaj_lang_counts.items():
    try:
        code = iso639.to_iso639_1(lang)
    except iso639.NonExistentLanguageError:
        # No code for this name, so it cannot be matched against cited_codes.
        pass
    else:
        if code in cited_codes:
            covered_count += count

print('{}/{} = {:.2f}%'.format(
    covered_count, doaj_lang_count_total, (covered_count / doaj_lang_count_total) * 100))

22412/22861 = 98.04%


In [8]:
# Weighted language coverage (English not counted when additional language)
# Same measure as the plain weighted coverage, but driven by the counts in
# which English was dropped from rows that list another language as well.
covered_count_ne = 0
for lang, count in doaj_lang_counts_no_en_secnd.items():
    try:
        code = iso639.to_iso639_1(lang)
    except iso639.NonExistentLanguageError:
        continue
    if code not in cited_codes:
        continue
    covered_count_ne += count

summary_template = '{}/{} = {:.2f}%'
print(summary_template.format(
    covered_count_ne,
    doaj_lang_count_total_no_en_secnd,
    (covered_count_ne / doaj_lang_count_total_no_en_secnd) * 100,
))

17790/18239 = 97.54%


In [46]:
# Inspect the raw 'Full text language' values that mention Greek
# (shows how Greek is spelled in the source data, e.g. 'Modern Greek (1453-)').
for lang in df['Full text language'].unique():
    # Skip every non-text value, not just floats: this matches the string
    # check used when the language counts are built and avoids calling
    # .lower() on something that is not a string.
    if not isinstance(lang, str):
        continue
    if 'greek' in lang.lower():
        print(lang)

Modern Greek (1453-)
English, French, German, Modern Greek (1453-), Italian
English, Modern Greek (1453-), Latin, Slovenian
English, Modern Greek (1453-)
Albanian, Bulgarian, Catalan, Croatian, Czech, Dutch, English, Estonian, Finnish, French, Galician, German, Modern Greek (1453-), Hungarian, Italian, Lithuanian, Macedonian, Maltese, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swedish, Turkish, Ukrainian
English, French, Modern Greek (1453-), Russian
English, French, German, Italian, Modern Greek (1453-)
English, French, German, Modern Greek (1453-)
English, French, Modern Greek (1453-)
Modern Greek (1453-), Italian
