In [22]:
import numpy as np
import pandas as pd
import seaborn as sb
from pandas import DataFrame
from scipy.spatial.distance import squareform, pdist
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import defaultdict
import random
import statistics

## Correlations using random samples of treebanks at different levels (language, genus, family)
Spearman rank correlations between LAS scores and:
- Probes: Probe-A, Probe-L
- BPE: BPE-based vectors, using squared Euclidean distance
- msl, mwl: Mean Sentence Length, Mean Word Length, using squared Euclidean distance
- Uriel (Lang2vec) 

**Random sampling:**
1. For each language associated with more than one treebank, we randomly select just one treebank (the languages that initially had only one treebank remain the same)
2. We adapt the dataframes of all the measures so that they contain only the treebanks selected in the random sample (we sample first the columns, i.e. the target languages, and then the rows, i.e. the transfer languages)
3. We calculate the correlations between the measures.
4. We repeat the process using different random seeds (different random sampling of treebanks), and we average the correlations obtained in the different random samples. 

---
Functions that we're going to need:

In [23]:

###################

def correlation_flatten(df1, df2):
    """Spearman rank correlation between two flattened distance matrices.

    Each frame is transposed and flattened into a 1-D array, and the two
    arrays are compared position by position. The frames therefore have to
    share the same shape and the same row/column ordering for the result to
    be meaningful. Pairs in which a value is NaN are left out of the
    computation (``nan_policy='omit'``).

    Returns only the correlation coefficient; the p-value is discarded.
    """
    left_values = df1.T.to_numpy().ravel()
    right_values = df2.T.to_numpy().ravel()
    rho, _ = stats.spearmanr(left_values, right_values, nan_policy='omit')  # Spearman correlation
    return rho
##########################################3

def assign_nans(df1):
    """Replace the score with NaN wherever transfer and target are the same UD treebank.

    Input: a dataframe whose row labels and column labels are treebank names.
    Output: the same dataframe object (it is modified in place and also
            returned), with NaN in every cell whose row label equals its
            column label.
    """
    # Only labels that occur on both axes can satisfy "row == column", so it is
    # enough to walk over the labels the two axes have in common.
    for label in df1.index.intersection(df1.columns):
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        df1.loc[label, label] = np.nan
    return df1
            

# Idea: first sample the columns, then the rows
##################################################################################
def sample_treebanks(original, seed, mode):
    """Draw one treebank per language.

    Input:  original -- a dataframe labelled with UD treebank names
                        (columns: target treebanks, rows: transfer treebanks)
            seed     -- seed for Python's global `random` generator
            mode     -- "columns" samples among the column labels; any other
                        value samples among the row labels
    Output: a list with one treebank name per language, in order of first
            appearance of each language.

    The language name is the text between "UD_" and the first "-" that
    follows it, e.g. "UD_Ancient_Greek-PROIEL" -> "Ancient_Greek".
    """
    labels = original.columns if mode == "columns" else original.index

    # language name -> every treebank of that language, in label order
    treebanks_by_language = {}
    for treebank in labels:
        language = treebank.split("UD_")[1].split("-")[0]
        treebanks_by_language.setdefault(language, []).append(treebank)

    # Seed the global generator so that the same seed always yields the same sample.
    random.seed(a=seed)

    # Languages with several treebanks get a random pick; languages with a
    # single treebank keep it (no random draw is consumed for them).
    return [
        random.choice(candidates) if len(candidates) > 1 else candidates[0]
        for candidates in treebanks_by_language.values()
    ]

###################

def sample_phylogenetic(field, df_genus, k, s):
    """Sample one language per phylogenetic group and return treebank-name prefixes.

    field    -- column of `df_genus` that defines the groups (the notebook
                passes 'genus' or 'family')
    df_genus -- dataframe with the column `field` and a 'long_name' column
    k        -- number of rows drawn from each group
    s        -- random_state handed to DataFrame.sample

    Returns a list with one prefix per group, e.g.
    'UD_Old_Church_Slavonic-', 'UD_German-', 'UD_Romanian-', ...
    Groups are visited in the order given by value_counts().

    NOTE(review): only the first of the k drawn rows is turned into a prefix,
    so k > 1 does not yield more prefixes per group -- confirm this is intended.
    """
    prefixes = []
    for group_name in df_genus[field].value_counts().index:
        # Filter the frame that was passed in; the previous version read a
        # global variable named `genus` here and ignored `df_genus`.
        members = df_genus[df_genus[field] == group_name]
        drawn = members.sample(n=k, random_state=s)
        language = drawn["long_name"].tolist()[0]
        prefixes.append("UD_" + language.replace(" ", "_") + "-")
    return prefixes

Reading the distance matrices for each measure (columns: target languages; rows: transfer languages)

In [24]:
# Locations of the distance matrices, one TSV file per measure.
las_path = "../data/las_distances.tsv"
bpe_path = "../data/bpe_distances.tsv"
deprel_path = "../data/deprel_distances.tsv"
struct_path = "../data/struct_distances.tsv"
mwl_path = "../data/mwl_distances.tsv"
msl_path = "../data/msl_distances.tsv"
l2v_path = "../data/l2v_distances.tsv"

# Read each matrix (first column of the file is the row index) and blank out
# the cells where the transfer and the target treebank are the same.
bpe = assign_nans(pd.read_csv(bpe_path, index_col=0, sep="\t"))
las = assign_nans(pd.read_csv(las_path, index_col=0, sep="\t"))
mwl = assign_nans(pd.read_csv(mwl_path, index_col=0, sep="\t"))
msl = assign_nans(pd.read_csv(msl_path, index_col=0, sep="\t"))
l2v = assign_nans(pd.read_csv(l2v_path, index_col=0, sep="\t"))
deprel = assign_nans(pd.read_csv(deprel_path, index_col=0, sep="\t"))
struct = assign_nans(pd.read_csv(struct_path, index_col=0, sep="\t"))

# Report the shape of every matrix.
for _label, _frame in (
    ("LAS", las),
    ("BPE", bpe),
    ("Probe-L", deprel),
    ("Probe-A", struct),
    ("MWL", mwl),
    ("MSL", msl),
    ("L2V", l2v),
):
    print(_label, _frame.shape)


LAS (78, 116)
BPE (78, 116)
Probe-L (78, 116)
Probe-A (78, 116)
MWL (78, 116)
MSL (78, 116)
L2V (78, 116)


In [25]:
# Preview of the LAS matrix (rows: transfer treebanks, columns: target treebanks).
# Cells where the transfer and the target treebank coincide were set to NaN by assign_nans.
las

Unnamed: 0,UD_Afrikaans-AfriBooms,UD_Ancient_Greek-PROIEL,UD_Ancient_Greek-Perseus,UD_Arabic-PADT,UD_Armenian-ArmTDP,UD_Basque-BDT,UD_Belarusian-HSE,UD_Bulgarian-BTB,UD_Catalan-AnCora,UD_Chinese-GSD,...,UD_Turkish-Penn,UD_Turkish-Tourism,UD_Turkish_German-SAGT,UD_Ukrainian-IU,UD_Urdu-UDTB,UD_Uyghur-UDT,UD_Vietnamese-VTB,UD_Welsh-CCG,UD_Western_Armenian-ArmTDP,UD_Wolof-WTB
UD_Ancient_Greek-PROIEL,0.239045,,0.394669,0.096090,0.192595,0.137580,0.276568,0.247747,0.192340,0.075811,...,0.195167,0.101013,0.188286,0.266921,0.085591,0.035231,0.120809,0.163681,0.204588,0.029701
UD_Ancient_Greek-Perseus,0.167200,0.444038,,0.092242,0.214099,0.191201,0.267717,0.244826,0.139874,0.118850,...,0.240921,0.222576,0.192453,0.266205,0.082642,0.083145,0.172573,0.137470,0.232473,0.051575
UD_Arabic-PADT,0.218732,0.108263,0.101107,,0.208489,0.169081,0.402737,0.391634,0.276109,0.083472,...,0.177295,0.060396,0.155413,0.410721,0.075372,0.073281,0.229894,0.315085,0.205379,0.074152
UD_Basque-BDT,0.317848,0.117199,0.130834,0.166166,0.472700,,0.411713,0.398098,0.319768,0.279713,...,0.467115,0.349831,0.326183,0.410165,0.230711,0.088031,0.326646,0.254811,0.449224,0.081778
UD_Belarusian-HSE,0.442731,0.215646,0.204518,0.259601,0.502431,0.344594,,0.714153,0.531711,0.249625,...,0.412210,0.199614,0.375029,0.795355,0.203415,0.046787,0.344016,0.414289,0.465441,0.111780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UD_Turkish-Penn,0.257476,0.064240,0.078834,0.078004,0.305348,0.231998,0.324776,0.315930,0.204929,0.223565,...,,0.482972,0.300486,0.321164,0.223373,0.134254,0.137137,0.136474,0.296351,0.043849
UD_Turkish-Tourism,0.040624,0.029593,0.045358,0.007581,0.071055,0.058145,0.063649,0.045932,0.021022,0.041696,...,0.166714,,0.055097,0.046369,0.056580,0.188181,0.028140,0.024884,0.074261,0.022577
UD_Ukrainian-IU,0.473951,0.206636,0.190919,0.262757,0.516455,0.364266,0.753688,0.725030,0.602642,0.280186,...,0.458965,0.334298,0.385678,,0.223647,0.062758,0.363210,0.460186,0.489865,0.097933
UD_Urdu-UDTB,0.404363,0.106578,0.114931,0.110944,0.447270,0.419797,0.387358,0.393312,0.249689,0.234463,...,0.495139,0.373951,0.286751,0.420743,,0.177377,0.191767,0.197412,0.410758,0.035922


 - **Language level**
 
Calculating correlations for the treebanks random samples (one treebank per language):

In [26]:
# One list per measure; each entry is the correlation with LAS for one random seed.
deprel_means = []
struct_means = []
bpe_means = []
msl_means = []
mwl_means = []
l2v_means = []
# These lists are not filled in this cell; they are kept so that any code
# relying on the names still finds them.
l2vvslas_detailed = []
mwlvslas_detailed = []
mslvslas_detailed = []
bpevslas_detailed = []
deprelvslas_detailed = []
structvslas_detailed = []

for s in range(1, 31):  # random seeds 1..30
    # One randomly selected treebank per language, drawn separately for the
    # target treebanks (columns) and the transfer treebanks (rows).
    randomsample_columns = sample_treebanks(las, s, "columns")
    randomsample_rows = sample_treebanks(las, s, "rows")

    # Restrict LAS to the sampled treebanks, keeping its original label order.
    keep_columns = set(randomsample_columns)
    las_sampled = las[[c for c in las.columns if c in keep_columns]]
    las_sampled = las_sampled.loc[las_sampled.index.intersection(randomsample_rows)]

    # Take exactly the same cells, in the same order, from every measure.
    # correlation_flatten compares values by position, so selecting by the LAS
    # labels guarantees that each pair of compared cells refers to the same
    # (transfer, target) pair even if a file stores its labels in another order.
    rows, cols = las_sampled.index, las_sampled.columns
    bpe_sampled = bpe.loc[rows, cols]
    l2v_sampled = l2v.loc[rows, cols]
    msl_sampled = msl.loc[rows, cols]
    mwl_sampled = mwl.loc[rows, cols]
    struct_sampled = struct.loc[rows, cols]
    deprel_sampled = deprel.loc[rows, cols]

    # Store the result for this random sample (seed).
    for _sampled, _means in (
        (deprel_sampled, deprel_means),
        (struct_sampled, struct_means),
        (bpe_sampled, bpe_means),
        (msl_sampled, msl_means),
        (mwl_sampled, mwl_means),
        (l2v_sampled, l2v_means),
    ):
        _means.append(correlation_flatten(_sampled, las_sampled))

# Mean and standard deviation over all the random samples of treebanks.
print("######")
print ("Mean and standard deviation over the different samples:")
for _label, _means in (
    ("r(Probe-L,las)", deprel_means),
    ("r(Probe-A,las)", struct_means),
    ("r(bpe,las)", bpe_means),
    ("r(msl,las)", msl_means),
    ("r(mwl,las)", mwl_means),
    ("r(l2v,las)", l2v_means),
):
    print(_label, statistics.mean(_means), "SD:", statistics.stdev(_means))

######
Mean and standard deviation over the different samples:
r(Probe-L,las) -0.5731838078042815 SD: 0.026800966222526614
r(Probe-A,las) -0.6614008678924165 SD: 0.015241501589277208
r(bpe,las) -0.39355601697022224 SD: 0.014751773954518614
r(msl,las) -0.12461642784867656 SD: 0.02680313409677038
r(mwl,las) -0.3836948878682873 SD: 0.018316005021113052
r(l2v,las) -0.4800453500752769 SD: 0.009048928510701418


 - **Genus level**
 
Calculating correlations for the treebanks random samples (one treebank per genus):

In [6]:
# Genus/family metadata table (first column of the file is used as the index).
# Other cells of this notebook read its 'genus', 'family' and 'long_name' columns.
genus_path="../data/genus_family.tsv"
genus = pd.read_csv(genus_path,index_col=0, sep="\t")

Language families:

In [7]:
# Number of rows of the metadata table in each language family.
genus['family'].value_counts()

Indo-European     47
Afro-Asiatic       4
Uralic             3
Sino-Tibetan       2
Turkic             2
Dravidian          2
Atlantic-Congo     1
Basque             1
Austroasiatic      1
Austronesian       1
Koreanic           1
Japonic            1
Sign Language      1
Name: family, dtype: int64

In [8]:
# Number of rows of the metadata table in each genus.
genus['genus'].value_counts()

Balto-Slavic              14
Germanic                  13
Italic                     9
Indo-Iranian               4
Celtic                     3
Semitic                    3
Common Turkic              2
Graeco-Phrygian            2
South Dravidian            2
Armenic                    2
Finnic                     2
Sinitic                    2
Egyptian                   1
Korean                     1
North-Central Atlantic     1
Japanesic                  1
Swedish Sign               1
Basque                     1
Hungarian                  1
Vietic                     1
Malayo-Polynesian          1
Name: genus, dtype: int64

Genus:

In [9]:
# One list per measure; each entry is the correlation with LAS for one random seed.
deprel_means = []
struct_means = []
bpe_means = []
msl_means = []
mwl_means = []
l2v_means = []

for s in range(1, 91):  # random seeds 1..90
    # One randomly selected treebank per language, drawn separately for the
    # target treebanks (columns) and the transfer treebanks (rows).
    tree_sample_columns = sample_treebanks(las, s, "columns")
    tree_sample_rows = sample_treebanks(las, s, "rows")

    # One randomly selected language per genus, as "UD_<Language>-" prefixes.
    prefixes = sample_phylogenetic('genus', genus, 1, s)
    prefix_tuple = tuple(prefixes)

    # Keep the sampled treebanks that belong to one of the sampled languages.
    # str.startswith with a tuple tests every prefix at once and lists each
    # treebank at most once (the earlier loop used a substring test and a bare
    # `next`, which does nothing, where a `break` was intended).
    randomsample_columns = [t for t in tree_sample_columns if t.startswith(prefix_tuple)]
    randomsample_rows = [t for t in tree_sample_rows if t.startswith(prefix_tuple)]

    # Restrict LAS to the sampled treebanks, keeping its original label order.
    keep_columns = set(randomsample_columns)
    las_sampled = las[[c for c in las.columns if c in keep_columns]]
    las_sampled = las_sampled.loc[las_sampled.index.intersection(randomsample_rows)]

    # Take exactly the same cells, in the same order, from every measure.
    # correlation_flatten compares values by position, so selecting by the LAS
    # labels guarantees that each pair of compared cells refers to the same
    # (transfer, target) pair even if a file stores its labels in another order.
    rows, cols = las_sampled.index, las_sampled.columns
    bpe_sampled = bpe.loc[rows, cols]
    l2v_sampled = l2v.loc[rows, cols]
    msl_sampled = msl.loc[rows, cols]
    mwl_sampled = mwl.loc[rows, cols]
    struct_sampled = struct.loc[rows, cols]
    deprel_sampled = deprel.loc[rows, cols]

    # Store the result for this random sample (seed).
    for _sampled, _means in (
        (deprel_sampled, deprel_means),
        (struct_sampled, struct_means),
        (bpe_sampled, bpe_means),
        (msl_sampled, msl_means),
        (mwl_sampled, mwl_means),
        (l2v_sampled, l2v_means),
    ):
        _means.append(correlation_flatten(_sampled, las_sampled))

# Mean and standard deviation over all the random samples of treebanks.
print("######")
print ("Mean and standard deviation over the different samples:")
for _label, _means in (
    ("r(Probe-L,las)", deprel_means),
    ("r(Probe-A,las)", struct_means),
    ("r(bpe,las)", bpe_means),
    ("r(msl,las)", msl_means),
    ("r(mwl,las)", mwl_means),
    ("r(l2v,las)", l2v_means),
):
    print(_label, statistics.mean(_means), "SD:", statistics.stdev(_means))


######
Mean and standard deviation over the different samples:
r(Probe-L,las) -0.38182535755097863 SD: 0.11888433556930283
r(Probe-A,las) -0.5273456827921336 SD: 0.09996750663056896
r(bpe,las) -0.26195625642009585 SD: 0.09251593661171925
r(msl,las) -0.13904044925133746 SD: 0.12073535811566322
r(mwl,las) -0.3588640049679678 SD: 0.10401714822700062
r(l2v,las) -0.38986011293399087 SD: 0.06740711116237684


 - **Family level**
 
Calculating correlations for the treebanks random samples (one treebank per linguistic family):

In [15]:
# One list per measure; each entry is the correlation with LAS for one random seed.
deprel_means = []
struct_means = []
bpe_means = []
msl_means = []
mwl_means = []
l2v_means = []

for s in range(1, 91):  # random seeds 1..90
    # One randomly selected treebank per language, drawn separately for the
    # target treebanks (columns) and the transfer treebanks (rows).
    tree_sample_columns = sample_treebanks(las, s, "columns")
    tree_sample_rows = sample_treebanks(las, s, "rows")

    # One randomly selected language per family, as "UD_<Language>-" prefixes.
    prefixes = sample_phylogenetic('family', genus, 1, s)
    prefix_tuple = tuple(prefixes)

    # Keep the sampled treebanks that belong to one of the sampled languages.
    # str.startswith with a tuple tests every prefix at once and lists each
    # treebank at most once (the earlier loop used a substring test and a bare
    # `next`, which does nothing, where a `break` was intended).
    randomsample_columns = [t for t in tree_sample_columns if t.startswith(prefix_tuple)]
    randomsample_rows = [t for t in tree_sample_rows if t.startswith(prefix_tuple)]

    # Restrict LAS to the sampled treebanks, keeping its original label order.
    keep_columns = set(randomsample_columns)
    las_sampled = las[[c for c in las.columns if c in keep_columns]]
    las_sampled = las_sampled.loc[las_sampled.index.intersection(randomsample_rows)]

    # Take exactly the same cells, in the same order, from every measure.
    # correlation_flatten compares values by position, so selecting by the LAS
    # labels guarantees that each pair of compared cells refers to the same
    # (transfer, target) pair even if a file stores its labels in another order.
    rows, cols = las_sampled.index, las_sampled.columns
    bpe_sampled = bpe.loc[rows, cols]
    l2v_sampled = l2v.loc[rows, cols]
    msl_sampled = msl.loc[rows, cols]
    mwl_sampled = mwl.loc[rows, cols]
    struct_sampled = struct.loc[rows, cols]
    deprel_sampled = deprel.loc[rows, cols]

    # Store the result for this random sample (seed).
    for _sampled, _means in (
        (deprel_sampled, deprel_means),
        (struct_sampled, struct_means),
        (bpe_sampled, bpe_means),
        (msl_sampled, msl_means),
        (mwl_sampled, mwl_means),
        (l2v_sampled, l2v_means),
    ):
        _means.append(correlation_flatten(_sampled, las_sampled))

# Mean and standard deviation over all the random samples of treebanks.
print("######")
print ("Mean and standard deviation over the different samples:")
for _label, _means in (
    ("r(Probe-L,las)", deprel_means),
    ("r(Probe-A,las)", struct_means),
    ("r(bpe,las)", bpe_means),
    ("r(msl,las)", msl_means),
    ("r(mwl,las)", mwl_means),
    ("r(l2v,las)", l2v_means),
):
    print(_label, statistics.mean(_means), "SD:", statistics.stdev(_means))

######
Mean and standard deviation over the different samples:
r(Probe-L,las) -0.3234887293325139 SD: 0.12697717037461523
r(Probe-A,las) -0.5023880172839862 SD: 0.09328706249790328
r(bpe,las) -0.10309658502713762 SD: 0.09195194518140425
r(msl,las) -0.1641456815316146 SD: 0.1659228652546703
r(mwl,las) -0.3366129749372456 SD: 0.11314834784055636
r(l2v,las) -0.34817651671308103 SD: 0.05537272537558211
