# Linking Languages

To create the graph between countries, one of the parameters that can link countries is language. One could simply use a binary parameter: same language or different language. However, many languages share a phylogeny (a common root) and are more or less closely related. Dutch and German, for example, share a similarity that French and German do not. That is why we wanted to define a distance between languages. As no such distance existed in any open-source database, we decided to create one ourselves from the genealogical trees of the languages, which can be found here: http://glottolog.org/glottolog/family. Using the trees of all high-level families, we loaded each of them with the Phylo module of Biopython. We then exported each tree as a networkx graph and computed the shortest path between all languages of that family. This was done iteratively over all the major families, filling in a dataframe of distances between the different languages along the way.

In [1]:
import pandas as pd
import json
from matplotlib import pyplot as plt
import networkx as nx
from scipy import spatial
import numpy as np
from Bio import Phylo
import re
import os
from tqdm import tqdm
import nltk
from nltk.metrics import edit_distance

In [2]:
def get_all_languages(df):
    """Collect the distinct languages listed in ``df.languages``.

    Args:
        df: Object exposing an iterable ``languages`` attribute. Each entry
            is expected to be a list of language names; entries that are not
            lists (for example missing values) are skipped.

    Returns:
        A list of the unique language names, in order of first appearance.
    """
    unique = {}
    for entry in df.languages:
        # isinstance (instead of an exact type comparison) also accepts
        # list subclasses.
        if isinstance(entry, list):
            # A dict drops duplicates while keeping first-seen order.
            unique.update(dict.fromkeys(entry))
    return list(unique)

def find_closest_lang(lang, lang_list):
    """Return the entry of ``lang_list`` closest to ``lang`` by edit distance.

    Args:
        lang: Name to compare against.
        lang_list: Non-empty list of candidate names.

    Returns:
        The candidate with the smallest ``edit_distance`` to ``lang``; when
        several candidates tie, the earliest one in ``lang_list`` wins.
    """
    # min() returns the first of several equally small items, which matches
    # looking up the index of the minimum distance.
    return min(lang_list, key=lambda candidate: edit_distance(lang, candidate))
        
    
def match_languages(filename, country_languages, languages, simp_languages, do_prints=False):
    """Match country languages against the node names of one phylogeny tree.

    For each language the lookup tries, in this order: the exact name, the
    name followed by "ic", the name followed by "n", and finally the tree
    names that contain the language as a substring (when several do, the one
    picked by ``find_closest_lang``).

    Args:
        filename: Unused; kept so that existing callers keep working.
        country_languages: Iterable of language names to look up.
        languages: Per-node entries of the tree, index-aligned with
            ``simp_languages``.
        simp_languages: List of simplified node names of the tree.
        do_prints: When true, print the bad-match and no-match summaries.

    Returns:
        A ``(match, bad_match, no_match)`` tuple. ``match`` maps every
        matched language to
        ``[index, simplified name, languages[index], language]``, where
        ``index`` is the first position of the simplified name.
        ``bad_match`` is always an empty dict and is only returned to keep
        the interface stable. ``no_match`` lists the languages for which
        nothing was found.
    """
    match = {}
    # The former "bad match" branch was unreachable: it sat behind
    # `len(match2) == 1` / `len(match2) > 1` inside a branch that already
    # required a non-empty match2, so this dict never received an entry.
    bad_match = {}
    no_match = []

    # First position of every simplified name, built once so each lookup does
    # not rescan the list (same result as list.index, which also returns the
    # first occurrence).
    first_index = {}
    for position, name in enumerate(simp_languages):
        first_index.setdefault(name, position)

    for lang in country_languages:
        chosen = None
        for suffix in ("", "ic", "n"):
            if lang + suffix in first_index:
                chosen = lang + suffix
                break

        if chosen is None:
            # Fall back to names containing the language as a substring.
            partial = [name for name in simp_languages if lang in name]
            if not partial:
                no_match.append(lang)
                continue
            if len(partial) == 1:
                chosen = partial[0]
            else:
                chosen = find_closest_lang(lang, partial)

        position = first_index[chosen]
        match[lang] = [position, chosen, languages[position], lang]

    if do_prints:
        print("Bad match ({}): ".format(len(bad_match)), bad_match)
        print()
        print("No match at all ({}): ".format(len(no_match)), no_match)

    return match, bad_match, no_match

def load_tree(filename, country_languages, vis_adj = False):
    """Read a Newick tree and return it as a networkx graph with node names.

    Args:
        filename: Path of the Newick file to read.
        country_languages: Unused by this function.
        vis_adj: When true, plot the sparsity pattern of the graph's
            adjacency matrix.

    Returns:
        A ``(net, languages, simp_languages)`` tuple: the networkx graph, a
        list of ``[node, node.name]`` pairs, and the index-aligned list of
        names with everything from the first "[" or "{" onward removed and
        trailing spaces stripped.
    """
    graph = Phylo.to_networkx(Phylo.read(filename, 'newick'))

    # Optional visual check of the adjacency structure
    if vis_adj:
        adjacency = nx.adjacency_matrix(graph)
        plt.spy(adjacency.todense(),  markersize=1)
        plt.show()

    # One [node, name] pair per node of the graph
    languages = [[node, node.name] for node in graph.nodes()]

    # Strip the bracketed/braced suffixes (the language codes) and then any
    # trailing spaces; the patterns are applied in this order to each name.
    cleanup_patterns = (r'\[.*$', r'\{.*$', r' +$')
    simp_languages = []
    for _, label in languages:
        for pattern in cleanup_patterns:
            label = re.sub(pattern, '', label)
        simp_languages.append(label)

    return graph, languages, simp_languages
    
def compute_language_dist(country_languages, phylogeny_files, lang_dist, do_prints=False):
    """Fill ``lang_dist`` with tree distances between matched languages.

    For every phylogeny file the tree is loaded, the country languages are
    matched against its node names, and the shortest path length between
    each pair of matched nodes is written to ``lang_dist``.

    Args:
        country_languages: Language names to look up in each tree.
        phylogeny_files: Paths of the Newick files, one per language family.
        lang_dist: DataFrame with the country languages as index and columns;
            it is modified in place.
        do_prints: When true, print extra diagnostics (defaults to the
            previously hard-coded ``False``).

    Returns:
        A ``(lang_dist, all_bad_match, all_no_match)`` tuple. The two dicts
        are returned empty: per-file bad/no matches are not aggregated.
    """
    # NOTE: these are never filled; they are kept so the return shape does
    # not change for callers that unpack three values.
    all_bad_match = dict()
    all_no_match = dict()
    match = {}

    for file in tqdm(phylogeny_files):
        net, languages, simp_languages = load_tree(file, country_languages)

        # Match the country languages against the node names of this tree
        match, bad_match, no_match = match_languages(
            file, country_languages, languages, simp_languages, do_prints=do_prints
        )

        if len(bad_match):
            for lang, candidates in bad_match.items():
                print(lang, candidates)
                print()
            if do_prints:
                print(bad_match)

        entries = list(match.values())
        for src in entries:
            # Each entry is [index, simplified name, [node, name], language].
            # Paths are computed from the matched nodes only: since networkx
            # 2.0, nx.shortest_path_length(net) without source/target is an
            # iterator and can no longer be subscripted like a dict, and the
            # all-pairs table is far larger than what is needed here.
            # dict() makes the result subscriptable whether the call returns
            # a dict or an iterator of (node, length) pairs.
            lengths = dict(nx.single_source_shortest_path_length(net, src[2][0]))
            for dst in entries:
                lang_dist.loc[src[3], dst[3]] = lengths[dst[2][0]]

    if do_prints:
        print(len(country_languages), len(match))

    return lang_dist, all_bad_match, all_no_match


In [3]:
# Load the dataframe holding the official languages of every country
data = pd.read_pickle(r'../DataEnriching/data.pickle')
country_languages = get_all_languages(data)

# Manual renames (original name -> replacement), applied in this order.
# Each rename replaces the first occurrence of the original name and raises
# ValueError if that name is absent from the list.
language_renames = (
    ('Belizean Creole', "Belize Kriol English"),
    ('Māori', "Maori"),
    ('Tongan', "Rarotongan"),
    ('Kirundi', "Rundi"),
    ('Tshiluba', "Luba-Lulua"),
    ('Cook Islands Māori', "Maori"),
    ('Jamaican Patois', "Jamaican Creole English"),
    ('Hassaniya', "Hassaniyya"),
    ('Slovene', "Slovenian"),
    ('Khoisan', "Afrikaans"),
    ('Chibarwe', "Zimbabwean Ndebele"),
    ('Swazi', "Swati"),
    ('Sorani', "Central Kurdish"),
    ('Northern Ndebele', "Zimbabwean Ndebele"),
    ('Maldivian', "Dhivehi"),
    ('Zimbabwean Sign Language', "Zimbabwe Sign Language"),
    ('Montenegrin', "Karashevski"),
    ('Mauritian Creole', "Morisyen"),
    ('Seychellois Creole', "Seselwa Creole French"),
    ('Kyrgyz', "Kirghiz"),
    ('Norfuk', "Pitcairn-Norfolk"),
    ('Jèrriais', "Jerriais"),
    ('Guernésiais', "Dgernesiais"),
)
for old_name, new_name in language_renames:
    country_languages[country_languages.index(old_name)] = new_name

# Language distance dataframe: every distance starts out as infinity
lang_dist = pd.DataFrame(np.inf, index=country_languages, columns=country_languages)

# Collect the paths of all phylogeny files (names containing "newick")
phylogeny_dir = os.path.join(os.getcwd(), "PhylogenyFiles")
phylogeny_files = [
    os.path.join(phylogeny_dir, entry)
    for entry in os.listdir(phylogeny_dir)
    if "newick" in entry
]

print("There are {} high level families".format(len(phylogeny_files)))

There are 178 high level families


Computing the distance between languages based on the phylogenetic trees

In [4]:
lang_dist, bad_match, no_match = compute_language_dist(country_languages, phylogeny_files, lang_dist)

# Derive the distances of 'Austro-Bavarian German' from those of 'German',
# one step further away.
austro_bavarian = lang_dist.loc['German', :] + 1
lang_dist.loc['Austro-Bavarian German', :] = austro_bavarian
# Mirror the values into the column as well: filling only the row leaves the
# distance matrix asymmetric. The values are assigned positionally, which is
# valid because index and columns hold the same labels in the same order.
lang_dist.loc[:, 'Austro-Bavarian German'] = austro_bavarian.values
lang_dist.loc['Austro-Bavarian German', 'Austro-Bavarian German'] = 0


100%|██████████| 178/178 [00:44<00:00,  4.03it/s]


Determining elements which have not yet been assigned. 

In [5]:
# Per-language summary statistics of the distance columns. A language whose
# minimum distance is infinite has no finite distance to any language.
df = lang_dist.describe().T
never_assigned = df["min"] == np.inf
unassigned = df.index[never_assigned].tolist()
print(len(unassigned))
print(unassigned)

  x2 = take(ap, indices_above, axis=axis) * weights_above


0
[]


In [6]:

# Display the language distance dataframe
lang_dist

Unnamed: 0,Dutch,Papiamento,Dari,Pashto,Turkmen,Portuguese,English,Swedish,Albanian,Catalan,...,Xhosa,Zulu,Zimbabwean Ndebele,Kalanga,Afrikaans,Ndau,Zimbabwean Ndebele.1,Shona,Tonga,Zimbabwe Sign Language
Dutch,0.000000,23.000000,18.000000,12.000000,inf,4.000000,3.000000,3.000000,3.000000,2.000000,...,inf,2.000000,inf,16.000000,6.000000,inf,inf,inf,inf,3.000000
Papiamento,23.000000,0.000000,23.000000,17.000000,inf,4.000000,23.000000,21.000000,15.000000,6.000000,...,inf,inf,inf,21.000000,21.000000,inf,inf,inf,inf,inf
Dari,18.000000,23.000000,0.000000,8.000000,inf,23.000000,18.000000,16.000000,10.000000,19.000000,...,inf,inf,inf,14.000000,16.000000,inf,inf,inf,inf,inf
Pashto,12.000000,17.000000,8.000000,0.000000,inf,17.000000,12.000000,10.000000,4.000000,13.000000,...,inf,inf,inf,8.000000,10.000000,inf,inf,inf,inf,inf
Turkmen,inf,inf,inf,inf,0.000000,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
Portuguese,4.000000,4.000000,23.000000,17.000000,inf,0.000000,3.000000,1.000000,3.000000,4.000000,...,inf,2.000000,inf,21.000000,21.000000,inf,inf,inf,inf,3.000000
English,3.000000,23.000000,18.000000,12.000000,inf,3.000000,0.000000,3.000000,10.000000,19.000000,...,inf,3.000000,inf,16.000000,10.000000,inf,inf,inf,inf,inf
Swedish,3.000000,21.000000,16.000000,10.000000,inf,1.000000,3.000000,0.000000,2.000000,3.000000,...,inf,2.000000,inf,14.000000,10.000000,inf,inf,inf,inf,2.000000
Albanian,3.000000,15.000000,10.000000,4.000000,inf,3.000000,10.000000,2.000000,0.000000,3.000000,...,inf,inf,inf,8.000000,8.000000,inf,inf,inf,inf,2.000000
Catalan,2.000000,6.000000,19.000000,13.000000,inf,4.000000,19.000000,3.000000,3.000000,0.000000,...,inf,inf,inf,17.000000,17.000000,inf,inf,inf,inf,3.000000
