In [1]:
%matplotlib widget
import os
import sys

import numpy as np
from collections import defaultdict, Counter
#from nltk.util import ngrams

import pandas as pd
#import seaborn as sb

import sklearn

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 6]
from pandas import DataFrame
import lang2vec.lang2vec as l2v
from statistics import mean

# 1. Typological index using syntactic features from lang2vec (TI_syn)

In [2]:
# Load one language-code table per dataset. Every file is read with its first
# column as the row index (index_col=0). Later cells read and overwrite an
# "ISO_6393" column in these frames, so each file is expected to provide it.
# NOTE(review): the row labels look like file / corpus names (see the manual
# overrides in the next cell) -- confirm against the mapping files.
xcopa_codes=pd.read_csv('../Data/isomappings/xcopa-processed.10000.csv', index_col=0)
xquad_codes=pd.read_csv('../Data/isomappings/xquad-processed.10000.csv', index_col=0)
tydiqa_codes=pd.read_csv('../Data/isomappings/tydiqa-processed.10000.csv', index_col=0)
xnli_codes=pd.read_csv('../Data/isomappings/xnli-processed.10000.csv', index_col=0)
xtreme_codes=pd.read_csv('../Data/isomappings/xtreme-processed.10000.csv', index_col=0)
xglue_codes=pd.read_csv('../Data/isomappings/xglue-processed.10000.csv', index_col=0)
# The UD mapping is the only tab-separated file in this group.
ud_codes=pd.read_csv('../Data/isomappings/ud-processed.tsv', sep='\t', index_col=0)
teddi_codes=pd.read_csv('../Data/isomappings/teddi500.csv', index_col=0)
mbert_codes=pd.read_csv('../Data/isomappings/mbertwiki-processed.10000.csv', index_col=0)
bible_codes=pd.read_csv('../Data/isomappings/biblecorpus100-processed.10000.csv', index_col=0)

In [3]:
# Manual substitutions for problematic cases.
#
# Each override is written with a single `.loc[row, column] = value` call.
# The chained form `frame.loc[row].at[column] = value` assigns into an
# intermediate Series that pandas may hand back as a copy (and always does
# under Copy-on-Write), in which case the frame itself is never updated.

# `.loc[row, column] = value` would silently append a new row for an unknown
# label, so fail loudly first if any expected row is missing.
for _frame, _label in (
    (ud_codes, "UD_Western_Armenian-ArmTDP.txt"),
    (mbert_codes, "armenian"),
    (mbert_codes, "vowiki-latest-pages-articles"),
    (bible_codes, "jap.txt"),
):
    if _label not in _frame.index:
        raise KeyError(_label)

ud_codes.loc["UD_Western_Armenian-ArmTDP.txt", "ISO_6393"] = "hy"
mbert_codes.loc["armenian", "ISO_6393"] = "hy"
mbert_codes.loc["vowiki-latest-pages-articles", "ISO_6393"] = "vol"
# errors="ignore" keeps this cell re-runnable: on a second execution the row
# has already been removed and a plain drop would raise KeyError.
bible_codes = bible_codes.drop(["crp.txt"], errors="ignore")
bible_codes.loc["jap.txt", "ISO_6393"] = "jpn"


- Function for extracting the syntactic feature vectors according to the ISO language codes:

In [4]:
def get_l2v(dataset_codes):
    """Fetch the lang2vec "syntax_knn" vectors for the languages of a dataset.

    Parameters
    ----------
    dataset_codes : pandas.DataFrame
        Table with an ``ISO_6393`` column of language codes; the codes are
        lower-cased before being passed to lang2vec.

    Returns
    -------
    pandas.DataFrame
        The mapping returned by ``l2v.get_features`` turned into a frame and
        transposed, i.e. one row per returned key and one column per feature.
    """
    # Lower-cased ISO codes used as the lang2vec query.
    iso_codes = list(dataset_codes["ISO_6393"].str.lower())

    # "syntax_knn" is the feature set queried here ("syntax_average" is the
    # other set that was tried in an earlier version of this cell).
    vectors_by_code = l2v.get_features(iso_codes, "syntax_knn")

    # from_dict puts one language per column; flip so languages become rows.
    return pd.DataFrame.from_dict(vectors_by_code).T

- Function for calculating the entropy (TI measure):

In [6]:
def get_entropy(df): #frame with features
    """Return the base-2 entropy of the 0/1 distribution of every column.

    Columns are visited by position, so the frame may carry any column labels
    (the previous ``df[0], df[1], ...`` lookup required labels 0..n-1).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per language, one column per binary feature.

    Returns
    -------
    list
        One entropy value per column, in column order. A column in which
        every entry is the same value yields 0.

    Notes
    -----
    Probabilities are relative frequencies over *all* rows of the column:
    entries that are neither 0 nor 1 count in the denominator but belong to
    neither class. A frame with columns but no rows raises ZeroDivisionError.
    """
    entropies=[]
    for _, column in df.items(): #We are processing column by column
        freqs=column.to_numpy() #We convert the column to a Numpy array
        #We extract the number of zeros, and the number of ones:
        ones=len(freqs[freqs == 1])
        zeros=len(freqs[freqs == 0])
        #Relative frequencies, e.g., 8/11 and 3/11:
        p=np.array([ones/len(freqs), zeros/len(freqs)])
        #Drop zero probabilities: 0*log2(0) would otherwise produce a nan.
        p=p[p != 0]
        H=-(p*np.log2(p)).sum()  #Entropy calculation
        entropies.append(H)      #We store the entropy of each column
    return(entropies) #entropy feature-wise
  

In [7]:
# TI_syn: for every dataset, compute the feature-wise entropies of its
# lang2vec vectors and print their mean next to the dataset name.
for corpus_name, corpus_codes in [
    ("xcopa", xcopa_codes),
    ("xquad", xquad_codes),
    ("tydiqa", tydiqa_codes),
    ("xnli", xnli_codes),
    ("xtreme", xtreme_codes),
    ("xglue", xglue_codes),
    ("ud", ud_codes),
    ("teddi", teddi_codes),
    ("mbert", mbert_codes),
    ("bibles", bible_codes),
]:
    entropies_features = get_entropy(get_l2v(corpus_codes))  # entropies feature-wise
    print(corpus_name, mean(entropies_features))  # mean of entropies

xcopa 0.5862770789920277
xquad 0.5231360114362537
tydiqa 0.626102007988479
xnli 0.5570648270071721
xtreme 0.6116017688923218
xglue 0.5167326017164338
ud 0.5674507124838225
teddi 0.7063106049093542
mbert 0.5590294778394355
bibles 0.649122540960284


# 2. Typological index using word length as features (TI_morph)

In [8]:
#***All the code to do the sampling, calculating measures, and finally obtain these tsvs, is not included in this notebook***
# Load the precomputed word-length statistics, one tab-separated table per
# dataset, using the first column as the row index. The cells below read the
# 'Median_length' and 'Avg_length' columns, so each file is expected to have them.
# NOTE(review): "10k"/"10000" presumably refers to the sample size used when the
# statistics were computed -- confirm with the (external) sampling code.
# The TeDDi file name (sample10000.tsv) does not follow the
# "<dataset>-processed.10000.stats.tsv" pattern used by the other datasets.
teddi_10k=pd.read_csv('../Data/wordlength_results/RESULTS_teddi_tokens/sample10000.tsv', sep='\t',index_col=0)
ud_10k=pd.read_csv('../Data/wordlength_results/RESULTS_ud-processed_tokens/ud-processed.10000.stats.tsv', sep='\t',index_col=0)
bibles_10k=pd.read_csv('../Data/wordlength_results/RESULTS_biblecorpus100-processed_tokens/biblecorpus100-processed.10000.stats.tsv', sep='\t',index_col=0)
xcopa_10k=pd.read_csv('../Data/wordlength_results/RESULTS_xcopa-processed_tokens/xcopa-processed.10000.stats.tsv', sep='\t',index_col=0)
tydiqa_10k=pd.read_csv('../Data/wordlength_results/RESULTS_tydiqa-processed_tokens/tydiqa-processed.10000.stats.tsv', sep='\t',index_col=0)
xquad_10k=pd.read_csv('../Data/wordlength_results/RESULTS_xquad-processed_tokens/xquad-processed.10000.stats.tsv', sep='\t',index_col=0)
xnli_10k=pd.read_csv('../Data/wordlength_results/RESULTS_xnli-processed_tokens/xnli-processed.10000.stats.tsv', sep='\t',index_col=0)
xglue_10k=pd.read_csv('../Data/wordlength_results/RESULTS_xglue-processed_tokens/xglue-processed.10000.stats.tsv', sep='\t',index_col=0)
xtreme_10k=pd.read_csv('../Data/wordlength_results/RESULTS_xtreme-processed_tokens/xtreme-processed.10000.stats.tsv', sep='\t',index_col=0)
mbert_10k=pd.read_csv('../Data/wordlength_results/RESULTS_mbertwiki-processed_tokens/mbertwiki-processed.10000.stats.tsv', sep='\t',index_col=0)

- Function for creating binary vectors, using the word lengths as features


In [12]:
def get_wordlength_vectors(dataset):
    """Build one 11-bin one-hot vector per language from its median word length.

    NOTE(review): a later cell of this notebook defines
    ``get_wordlength_vectors(dataset, binsize)`` under the same name; once
    that cell has run, this one-argument version is shadowed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per language (the index labels become the row labels of the
        result); must contain a ``Median_length`` column.

    Returns
    -------
    pandas.DataFrame
        One row per language and 11 columns. The single 1 is placed at
        0-based position ``int(median) - 1``, so a length of 4.9 marks the
        4th column. Lengths below 1 are placed in the first column.

    Raises
    ------
    IndexError
        If the integer part of a length exceeds the 11 available bins.
    """
    n_bins = 11  # one bin per integer word length
    vectors_hash = {}
    for lang in dataset.index.tolist():
        wordlength = dataset.loc[lang]['Median_length']  # for median

        # Binarization uses the integer part of the length. Clamp at 0: for a
        # length below 1 the raw index is -1, which Python would interpret as
        # "last bin" instead of "first bin".
        bin_index = max(int(wordlength) - 1, 0)

        binary_vector = np.zeros(n_bins)
        binary_vector[bin_index] = 1
        vectors_hash[lang] = binary_vector

    # from_dict yields one language per column; transpose to one vector per row.
    return pd.DataFrame.from_dict(vectors_hash).transpose()

The binarization is based on splitting the word length into bins. Example:

If a language has word_length 4.9, its vector will have a "1" in position 4 when the bins are split into 1-2, 2-3, ... (the first bin starts at length 1):

0 0 0 1 0 0 0 ...

In [13]:
def get_wordlength_vectors(dataset, binsize):
    """Build one one-hot vector per language from its average word length.

    The bins are ``np.arange(1, 11.2, binsize)`` (11 bins for ``binsize=1``).
    A language's bin is found by counting how many steps of ``binsize`` fit
    between 1 and its average length.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per language (the index labels become the row labels of the
        result); must contain an ``Avg_length`` column.
    binsize : float
        Width of a word-length bin.

    Returns
    -------
    pandas.DataFrame
        One row per language, one column per bin, with a single 1 per row.
        With ``binsize=1`` a length of 4.9 marks the 4th column. Lengths of
        1 or less are placed in the first column.

    Raises
    ------
    IndexError
        If a length lies beyond the last bin.
    """
    bins=np.arange(1, 11.2, binsize) #[ 1. ,  1.1,  1.2,  1.3... ]

    vectors_hash={}
    for lang in dataset.index.tolist():
        wordlength=dataset.loc[lang]['Avg_length']

        #We partition the word length with the same step as the bins; the
        #number of steps is the (1-based) position of the bin to mark.
        steps=len(np.arange(1, wordlength, binsize))

        #For a length <= 1 there are no steps, and the raw index steps-1 == -1
        #would mark the LAST bin through negative indexing; use the first bin.
        bin_index=max(steps, 1)-1

        binary_vector=np.zeros(len(bins))
        binary_vector[bin_index]=1
        vectors_hash[lang]=binary_vector

    return(pd.DataFrame.from_dict(vectors_hash).transpose()) #a dataframe with one vector per row

In [14]:
# Report the bin width used below and the resulting vector length
# (the number of bins produced by np.arange(1, 11.2, 1)).
print("Bin Size: 1")

n_vector_features = len(np.arange(1, 11.2, 1))
print("Vector features:", str(n_vector_features))

Bin Size: 1
Vector features: 11


In [15]:
# TI_morph: for every dataset, compute the feature-wise entropies of its
# word-length vectors (bin size 1) and print their mean next to the dataset name.
for corpus_name, corpus_stats in [
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
]:
    entropies_features = get_entropy(get_wordlength_vectors(corpus_stats, 1))  # entropies feature-wise
    print(corpus_name, mean(entropies_features))  # mean of entropies

xcopa 0.36103779596241575
xquad 0.34093043571111287
tydiqa 0.3433145652165989
xnli 0.33874266714204293
xtreme 0.31093208189005345
xglue 0.3072073812912939
ud 0.34890225383819307
teddi 0.36874805539335226
mbert 0.32349322087652765
bibles 0.31080272059778274
