In [11]:
%matplotlib widget
import os
import sys

import numpy as np
from collections import defaultdict, Counter
#from nltk.util import ngrams

import pandas as pd
#import seaborn as sb

import sklearn

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 6]
from pandas import DataFrame
import lang2vec.lang2vec as l2v
from statistics import mean

# 1. Typological index using syntactic features (lang2vec)

In [12]:
# Language-code mapping tables, one per dataset.
def _read_codes(path, **read_options):
    """Read one mapping file, using its first column as the index."""
    return pd.read_csv(path, index_col=0, **read_options)


xcopa_codes = _read_codes('../mappings/xcopa-processed.10000.csv')
xquad_codes = _read_codes('../mappings/xquad-processed.10000.csv')
tydiqa_codes = _read_codes('../mappings/tydiqa-processed.10000.csv')
xnli_codes = _read_codes('../mappings/xnli-processed.10000.csv')
xtreme_codes = _read_codes('../mappings/xtreme-processed.10000.csv')
xglue_codes = _read_codes('../mappings/xglue-processed.10000.csv')
ud_codes = _read_codes('../mappings/ud-processed.tsv', sep='\t')  # the only tab-separated mapping
teddi_codes = _read_codes('../mappings/sample500.csv')
mbert_codes = _read_codes('../mappings/mbertwiki-processed.10000.csv')
bible_codes = _read_codes('../mappings/biblecorpus100-processed.10000.csv')

In [13]:
#manual substitutions for problematic cases
def _set_iso_code(codes, row_label, iso_code):
    """Overwrite the ISO_6393 value of one existing row of ``codes`` in place.

    A single ``.loc[row, column]`` assignment is used on purpose: the chained
    form ``codes.loc[row].at[column] = value`` assigns into an intermediate
    Series, which is not guaranteed to write through to the DataFrame.

    Raises
    ------
    KeyError
        If ``row_label`` is not in the index (otherwise ``.loc`` assignment
        would silently append a new row).
    """
    if row_label not in codes.index:
        raise KeyError(row_label)
    codes.loc[row_label, "ISO_6393"] = iso_code


_set_iso_code(ud_codes, "UD_Western_Armenian-ArmTDP.txt", "hy")
_set_iso_code(mbert_codes, "armenian", "hy")
_set_iso_code(mbert_codes, "vowiki-latest-pages-articles", "vol")
# errors="ignore" keeps the cell re-runnable after "crp.txt" has already been dropped.
bible_codes = bible_codes.drop(["crp.txt"], errors="ignore")
_set_iso_code(bible_codes, "jap.txt", "jpn")


Function for extracting the vectors according to the language codes:

In [14]:
def get_l2v(dataset_codes):
    """Query lang2vec for the ``syntax_knn`` vectors of a dataset's languages.

    Parameters
    ----------
    dataset_codes : pandas.DataFrame
        Mapping table with an ``ISO_6393`` column; the codes are lower-cased
        before being sent to lang2vec.

    Returns
    -------
    pandas.DataFrame
        The mapping returned by ``l2v.get_features`` turned into a frame and
        transposed, so each key of that mapping becomes one row
        (presumably one row per language code -- confirm against lang2vec).
    """
    lowercase_codes = dataset_codes["ISO_6393"].str.lower()
    vectors_by_code = l2v.get_features(lowercase_codes.tolist(), "syntax_knn")
    return pd.DataFrame.from_dict(vectors_by_code).transpose()

Function for calculating the entropy:

In [15]:
def get_entropy(df): #frame with features
    """Compute the binary Shannon entropy (in bits) of every column of ``df``.

    Each column is treated as a binary feature: the relative frequencies of
    the values 1 and 0 are used as the probability distribution. Values other
    than 0 and 1 are not counted (but still contribute to the denominator).

    Columns are accessed by position, so the frame may carry any column
    labels (not only 0..n-1).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per language, one column per feature.

    Returns
    -------
    list
        One entropy value per column, in column order.

    Raises
    ------
    ZeroDivisionError
        If ``df`` has columns but no rows.
    """
    n_rows = len(df)
    entropies = []
    for position in range(df.shape[1]):  # process column by column
        values = df.iloc[:, position].to_numpy()
        ones = int((values == 1).sum())
        zeros = int((values == 0).sum())
        # Relative frequencies, e.g. 8/11 and 3/11.
        p = np.array([ones / n_rows, zeros / n_rows])
        # Drop zero probabilities: 0 * log2(0) would otherwise produce NaN.
        p = p[p != 0]
        entropies.append(-(p * np.log2(p)).sum())
    return entropies  # entropy feature-wise
  

In [16]:
# Mean of the feature-wise entropies of the lang2vec vectors, one line per dataset.
for dataset_label, code_table in (
    ("xcopa", xcopa_codes),
    ("xquad", xquad_codes),
    ("tydiqa", tydiqa_codes),
    ("xnli", xnli_codes),
    ("xtreme", xtreme_codes),
    ("xglue", xglue_codes),
    ("ud", ud_codes),
    ("teddi", teddi_codes),
    ("mbert", mbert_codes),
    ("bibles", bible_codes),
):
    entropies_features = get_entropy(get_l2v(code_table))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.5862770789920277
xquad 0.5231360114362537
tydiqa 0.626102007988479
xnli 0.5570648270071721
xtreme 0.6116017688923218
xglue 0.5167326017164338
ud 0.5674507124838225
teddi 0.7063106049093542
mbert 0.5590294778394355
bibles 0.649122540960284


# 2. Typological index using word length as features 

---
For Word Length: 

We first have to create one vector per language, e.g.: eng 1 0 0 0 0 ...

In [19]:
#***All the code to do the sampling, calculating measures, and finally obtain these tsvs, is not included in this notebook***
def _read_stats(path):
    """Read one tab-separated word-length statistics file, indexed by its first column."""
    return pd.read_csv(path, sep='\t', index_col=0)


teddi_10k = _read_stats('../wordlength_results/RESULTS_100LC_tokens/sample10000_logographicadjusted.tsv')
ud_10k = _read_stats('../wordlength_results/RESULTS_ud-processed_tokens/ud-processed.10000.stats_adjusted.tsv')
bibles_10k = _read_stats('../wordlength_results/RESULTS_biblecorpus100-processed_tokens/biblecorpus100-processed.10000.stats_adjusted.tsv')
xcopa_10k = _read_stats('../wordlength_results/RESULTS_xcopa-processed_tokens/xcopa-processed.10000.stats_adjusted.tsv')
tydiqa_10k = _read_stats('../wordlength_results/RESULTS_tydiqa-processed_tokens/tydiqa-processed.10000.stats_adjusted.tsv')
xquad_10k = _read_stats('../wordlength_results/RESULTS_xquad-processed_tokens/xquad-processed.10000.stats_adjusted.tsv')
xnli_10k = _read_stats('../wordlength_results/RESULTS_xnli-processed_tokens/xnli-processed.10000.stats_adjusted.tsv')
xglue_10k = _read_stats('../wordlength_results/RESULTS_xglue-processed_tokens/xglue-processed.10000.stats_adjusted.tsv')
xtreme_10k = _read_stats('../wordlength_results/RESULTS_xtreme-processed_tokens/xtreme-processed.10000.stats_adjusted.tsv')
mbert_10k = _read_stats('../wordlength_results/RESULTS_mbertwiki-processed_tokens/mbertwiki-processed.10000.stats_adjusted.tsv')

Function for creating binary vectors, using the word length. Example:



In [20]:
def get_wordlength_vectors(dataset, length_column='Median_length', use_round=False, n_bins=11):
    """One-hot encode each language's word length into unit-width bins.

    A length whose integer value is ``k`` (after truncation or rounding) is
    marked at 0-based position ``k - 1``: position 0 holds lengths 1.x,
    position 1 holds lengths 2.x, and so on.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Indexed by language; must contain ``length_column``.
    length_column : str, default 'Median_length'
        Column holding the word length (e.g. 'Median_length' or 'Avg_length').
    use_round : bool, default False
        If True the length is rounded to the nearest integer before binning;
        otherwise only its integer part is used.
    n_bins : int, default 11
        Length of every binary vector (the maximum word length expected
        across all languages).

    Returns
    -------
    pandas.DataFrame
        One row per language, ``n_bins`` columns, a single 1 per row.

    Raises
    ------
    IndexError
        If a length falls beyond the last bin.
    """
    to_integer = round if use_round else int
    vectors_hash = {}
    for lang, wordlength in dataset[length_column].items():
        binary_vector = np.zeros(n_bins)
        # Lengths below 1 would give index -1 and silently wrap around to the
        # last bin; clamp them into the first bin instead.
        position = max(int(to_integer(wordlength)) - 1, 0)
        binary_vector[position] = 1
        vectors_hash[lang] = binary_vector

    return pd.DataFrame.from_dict(vectors_hash).transpose()  # a dataframe with one vector per row

Results using binarization based on the integer part of the word length. Example:

If a language has word_length 4.9, its vector will have "1" in position 4 (counting from 1). The bins are 1 - 2, 2 - 3, etc., so position 1 holds lengths from 1 up to (but not including) 2.

0 0 0 1 0 0 0 ...

In [21]:
# Mean of the feature-wise entropies of the word-length vectors, one line per dataset.
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors(length_stats))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.33574829724076866
xquad 0.3198532291498096
tydiqa 0.36103779596241575
xnli 0.33874266714204293
xtreme 0.330476369909514
xglue 0.31592521410599766
ud 0.3423811073664347
teddi 0.3543710835203777
mbert 0.3467294224091484
bibles 0.31059402896766475


---

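- Results using the rounded version of the word length for the binarization. Note: this variant needs `get_wordlength_vectors` to round the average length; when the cell below is run with the function's median / integer-part settings, its output simply repeats the previous cell (as it does in the output shown here). Example: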

If a language has word_length 4.9, its vector will have "1" in the position 5.

0 0 0 0 1 0 0 ...

If a language has word_length 4.1, its vector will have "1" in the position 4.

0 0 0 1 0 0 0 ...




In [22]:
# Same report as the previous cell: mean feature-wise entropy of the vectors
# produced by get_wordlength_vectors (with whatever settings it currently has).
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors(length_stats))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.33574829724076866
xquad 0.3198532291498096
tydiqa 0.36103779596241575
xnli 0.33874266714204293
xtreme 0.330476369909514
xglue 0.31592521410599766
ud 0.3423811073664347
teddi 0.3543710835203777
mbert 0.3467294224091484
bibles 0.31059402896766475


----
Binarization using the median:

In [23]:
# Mean feature-wise entropy of the word-length vectors, one line per dataset.
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors(length_stats))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.33574829724076866
xquad 0.3198532291498096
tydiqa 0.36103779596241575
xnli 0.33874266714204293
xtreme 0.330476369909514
xglue 0.31592521410599766
ud 0.3423811073664347
teddi 0.3543710835203777
mbert 0.3467294224091484
bibles 0.31059402896766475


---
Using bins of varying size:

In [24]:
def get_wordlength_vectors2(dataset, binsize):
    """One-hot encode each language's average word length into bins of width ``binsize``.

    The bin edges are ``np.arange(1, 11.2, binsize)``; every vector has one
    element per edge. A length ``w`` is marked at position
    ``len(np.arange(1, w, binsize)) - 1``, i.e. the number of edges strictly
    below ``w`` minus one, so a length lying exactly on an edge falls into
    the bin below that edge.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Indexed by language; must contain an ``Avg_length`` column.
    binsize : float
        Width of each bin.

    Returns
    -------
    pandas.DataFrame
        One row per language, ``len(np.arange(1, 11.2, binsize))`` columns,
        a single 1 per row.

    Raises
    ------
    IndexError
        If a length falls beyond the last bin.
    """
    bins = np.arange(1, 11.2, binsize)  # [ 1. ,  1.1,  1.2,  1.3... ] for binsize 0.1

    vectors_hash = {}
    for lang, wordlength in dataset['Avg_length'].items():
        binary_vector = np.zeros(len(bins))

        # Partition [1, wordlength) with the same step; the number of
        # elements obtained locates the bin of this language.
        index = len(np.arange(1, wordlength, binsize))

        # A length <= 1 gives index 0; without the clamp, position -1 would
        # wrap around and mark the LAST bin instead of the first one.
        binary_vector[max(index - 1, 0)] = 1

        vectors_hash[lang] = binary_vector

    return pd.DataFrame.from_dict(vectors_hash).transpose()  # a dataframe with one vector per row

In [25]:
# Announce the bin size and the resulting number of features (bin edges).
print("Bin Size: 1")
n_features = np.arange(1, 11.2, 1).size
print("features:", str(n_features))

Bin Size: 1
features: 11


In [26]:
# Mean feature-wise entropy of the binned word-length vectors (bin size 1).
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors2(length_stats, 1))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.33574829724076866
xquad 0.31797033139358055
tydiqa 0.3433145652165989
xnli 0.32074805380686433
xtreme 0.31093208189005345
xglue 0.29725431428286975
ud 0.3370770502591743
teddi 0.3611454857345919
mbert 0.31640987282092636
bibles 0.30242304086676636


In [27]:
# Announce the bin size and the resulting number of features (bin edges).
print("Bin Size: 0.5")
n_features = np.arange(1, 11.2, 0.5).size
print("features:", str(n_features))

Bin Size: 0.5
features: 21


In [28]:
# Mean feature-wise entropy of the binned word-length vectors (bin size 0.5).
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors2(length_stats, 0.5))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.20236191626308056
xquad 0.2110937317508031
tydiqa 0.19307831920574794
xnli 0.20959064100879377
xtreme 0.20837922613984278
xglue 0.19360859635126332
ud 0.2236461098354233
teddi 0.23474913097437664
mbert 0.21359215870427253
bibles 0.21033574347935505


In [29]:
#get_wordlength_vectors2(teddi_10k, 0.5).to_csv("test.csv")

In [30]:
# Announce the bin size and the resulting number of features (bin edges).
print("Bin Size: 0.1")
n_features = np.arange(1, 11.2, 0.1).size
print("features:", str(n_features))

Bin Size: 0.1
features: 102


In [31]:
# Mean feature-wise entropy of the binned word-length vectors (bin size 0.1).
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors2(length_stats, 0.1))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.04739673388369262
xquad 0.04868433532983926
tydiqa 0.043574076271849775
xnli 0.05059000844932728
xtreme 0.05680398812695187
xglue 0.05004464749022719
ud 0.06485724184042974
teddi 0.06560651691019755
mbert 0.06344832411737715
bibles 0.06254494978857637


In [32]:
# Announce the bin size and the resulting number of features (bin edges).
print("Bin Size: 0.9")
n_features = np.arange(1, 11.2, 0.9).size
print("features:", str(n_features))

Bin Size: 0.9
features: 12


In [33]:
# Mean feature-wise entropy of the binned word-length vectors (bin size 0.9).
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors2(length_stats, 0.9))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies


xcopa 0.27950363755876206
xquad 0.3017539109227398
tydiqa 0.30776927247070457
xnli 0.29401904932295897
xtreme 0.3063741359892156
xglue 0.25649337673292716
ud 0.3187610112633042
teddi 0.33328568075800086
mbert 0.3006660186898331
bibles 0.30494797843982663


In [34]:
# Announce the bin size and the resulting number of features (bin edges).
print("Bin Size: 0.8")
n_features = np.arange(1, 11.2, 0.8).size
print("features:", str(n_features))

Bin Size: 0.8
features: 13


In [35]:
# Mean feature-wise entropy of the binned word-length vectors (bin size 0.8).
for dataset_label, length_stats in (
    ("xcopa", xcopa_10k),
    ("xquad", xquad_10k),
    ("tydiqa", tydiqa_10k),
    ("xnli", xnli_10k),
    ("xtreme", xtreme_10k),
    ("xglue", xglue_10k),
    ("ud", ud_10k),
    ("teddi", teddi_10k),
    ("mbert", mbert_10k),
    ("bibles", bibles_10k),
):
    entropies_features = get_entropy(get_wordlength_vectors2(length_stats, 0.8))  # entropies feature-wise
    print(dataset_label, mean(entropies_features))  # mean of entropies

xcopa 0.29049693979866065
xquad 0.2942449739504756
tydiqa 0.30549351966050564
xnli 0.27140219937503907
xtreme 0.28044117924825046
xglue 0.2776086630367018
ud 0.30436492366351614
teddi 0.3245412407719476
mbert 0.29564414532318517
bibles 0.28887418064142617
