In [3]:
%matplotlib widget
import os
import sys
import numpy as np
from collections import defaultdict, Counter
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 6]
from pandas import DataFrame
import lang2vec.lang2vec as l2v
from statistics import mean

## Jaccard Similarity (Jmm_syn)

1. Functions for calculating Jaccard's similarity:

In [8]:
def get_dict(arg1,arg2,arg3,sourcedata):
  """Bin the rows of ``sourcedata`` by their ``Avg_length`` value.

  Parameters
  ----------
  arg1, arg2 : number
      Lower and upper limit handed to ``np.arange``; the first bin starts at
      ``arg1`` and the last bin starts below ``arg2``.
  arg3 : number
      Bin width (the ``np.arange`` step).
  sourcedata : pandas.DataFrame
      Frame with (at least) an ``Avg_length`` column.

  Returns
  -------
  tuple
      ``(data_regions, data_regions_freq)``: the binned rows with an extra
      ``region`` column holding the bin label ``"<start>-<stop>"``, and a dict
      ``{bin label: number of rows in that bin}``.

  Notes
  -----
  Bins are half-open, ``start <= Avg_length < start + width``. The previous
  version excluded both edges, so a value lying exactly on a bin edge was
  silently dropped from every bin.
  """
  lower, upper, increment = arg1, arg2, arg3  # avoid shadowing the min/max builtins
  bin_starts = np.arange(lower, upper, increment)
  expected_columns = ['Avg_length', 'Median_length', 'Char_types', 'Types','Tokens','TTR','H','region']

  lengths = sourcedata['Avg_length']
  binned_pieces = []
  data_regions_freq = {}
  for start in bin_starts:
    stop = start + increment
    region = str(start)+"-"+str(stop)
    # .copy() so that adding the label never writes into (a view of) sourcedata
    in_bin = sourcedata.loc[(lengths >= start) & (lengths < stop)].copy()
    data_regions_freq[region] = len(in_bin)  # number of rows per region
    in_bin['region'] = region
    binned_pieces.append(in_bin)

  if binned_pieces:
    # One concat at the end instead of growing the frame inside the loop.
    data_regions = pd.concat(binned_pieces, axis=0)
    # Keep the documented columns first, followed by any other source columns.
    extra_columns = [col for col in data_regions.columns if col not in expected_columns]
    data_regions = data_regions.reindex(columns=expected_columns + extra_columns)
  else:
    data_regions = pd.DataFrame(columns=expected_columns)

  return (data_regions, data_regions_freq)
################################
def scaler1(data1, data2):
  """Bring two ``{bin: count}`` dictionaries to the same total.

  The dictionary whose values add up to the smaller total is multiplied by
  ``larger total / smaller total``; the other dictionary is returned as the
  very same object, untouched. When both totals are equal the first
  dictionary is the one that gets (trivially) rescaled.

  Returns ``(data1_version, data2_version)`` in the order of the arguments.
  """
  total1 = np.array(list(data1.values())).sum()
  total2 = np.array(list(data2.values())).sum()

  if total1 > total2:
    # data2 is the smaller set: stretch it up to data1's total.
    factor = total1 / total2
    return (data1, {key: value * factor for key, value in data2.items()})

  # data1 is the smaller (or equal) set: stretch it up to data2's total.
  factor = total2 / total1
  return ({key: value * factor for key, value in data1.items()}, data2)

###################################################
def jaccard_index(data1, data2): # Input two dictionaries  {bin:number of languages}
  """Weighted Jaccard index of two ``{bin: count}`` dictionaries.

  For every bin the union takes the larger of the two counts and the
  intersection takes the smaller one; the score is
  ``sum(intersection) / sum(union)``.

  A bin that is missing from one of the dictionaries is treated as a count
  of 0 there. (Previously a key missing from ``data2`` raised ``KeyError``
  and keys present only in ``data2`` were left out of the union.)

  Returns
  -------
  tuple
      ``(jaccard, union, intersection, unionvalues, intersectionvalues)``.
      ``intersection`` only lists bins where both counts are non-zero.
      ``jaccard`` is ``nan`` when the union sums to 0 (e.g. empty input).
  """
  union = {}
  intersection = {}
  # data1's bins first (original iteration order), then bins only data2 has.
  all_keys = list(data1) + [key for key in data2 if key not in data1]
  for key in all_keys:
    first = data1.get(key, 0)
    second = data2.get(key, 0)
    # Union: the larger count (ties resolve to data2's value).
    union[key] = first if first > second else second
    # Intersection: the smaller count, only where both sides are populated.
    if (first != 0) and (second != 0):
      intersection[key] = first if first < second else second

  unionvalues = list(union.values())
  intersectionvalues = list(intersection.values())
  jaccard = np.array(intersectionvalues).sum()/np.array(unionvalues).sum()

  return(jaccard, union, intersection, unionvalues, intersectionvalues)

###################################3
def draw_overlap_plot(frame1,frame2,label1,label2, list1, list2):
  """Overlay two single-column bar charts and annotate the Jaccard score.

  ``frame1`` is drawn on the left y-axis (orange) and ``frame2`` on a twin
  right y-axis (pale green). ``label1`` / ``label2`` become the column names
  of the frames and therefore the legend entries. ``list1`` / ``list2`` are
  the ``{bin: count}`` dictionaries passed to ``jaccard_index``; the resulting
  score is written onto the figure as ``J=<score rounded to 3 decimals>``.

  Note: the column rename is done in place, so the caller's frames keep the
  new column labels. Nothing is returned.
  """
  frame1.columns = [label1]
  frame2.columns = [label2]

  # Two bar plots sharing the x-axis, each with its own y-axis.
  fig, ax = plt.subplots()
  ax2 = ax.twinx()
  frame1.plot(kind='bar', ax=ax, color='orange', width=1, align="edge", alpha=0.4)
  frame2.plot(kind='bar', ax=ax2, alpha=0.5, width=1, color='palegreen', align="edge")

  # Tick layout is fixed to 13 positions labelled "1".."13" (to be improved later).
  tick_positions = tuple(range(13))
  tick_labels = tuple(str(position + 1) for position in tick_positions)
  plt.setp(ax, xticks=tick_positions, xticklabels=tick_labels)

  ax.tick_params(labelrotation=0, labelsize=12)
  ax2.tick_params(labelsize=12)
  ax.legend(fontsize=14)
  ax2.legend([label2], loc=('upper left'), fontsize=14)

  ax2.xaxis.set_visible(False)
  # Same fixed upper limit on both y-axes so the bars are comparable.
  for axis in (ax, ax2):
    axis.set_ylim(top=50)

  ax.set_xlabel('Mean word length', fontsize=14)

  # Print the Jaccard score inside the figure.
  jacc = jaccard_index(list1, list2)[0]
  fig.text(0.5, 0.8, "J=" + str(round(jacc, 3)), fontsize=14)


2. Obtain syntax feature vectors from lang2vec

In [9]:
# Load one language-code mapping table per dataset. The first column of each
# file becomes the index (index_col=0); the UD file is tab-separated, the
# others are comma-separated. Later cells read the "ISO_6393" column of these
# frames (presumably ISO 639-3 codes -- verify against the files).
xcopa_codes=pd.read_csv('../Data/isomappings/xcopa-processed.10000.csv', index_col=0)
xquad_codes=pd.read_csv('../Data/isomappings/xquad-processed.10000.csv', index_col=0)
tydiqa_codes=pd.read_csv('../Data/isomappings/tydiqa-processed.10000.csv', index_col=0)
xnli_codes=pd.read_csv('../Data/isomappings/xnli-processed.10000.csv', index_col=0)
xtreme_codes=pd.read_csv('../Data/isomappings/xtreme-processed.10000.csv', index_col=0)
xglue_codes=pd.read_csv('../Data/isomappings/xglue-processed.10000.csv', index_col=0)
ud_codes=pd.read_csv('../Data/isomappings/ud-processed.tsv', sep='\t', index_col=0)
teddi_codes=pd.read_csv('../Data/isomappings/teddi500.csv', index_col=0)
mbert_codes=pd.read_csv('../Data/isomappings/mbertwiki-processed.10000.csv', index_col=0)
bible_codes=pd.read_csv('../Data/isomappings/biblecorpus100-processed.10000.csv', index_col=0)

In [10]:
#manual substitutions for problematic cases
def _set_iso_code(codes_frame, row_label, iso_code):
    """Overwrite the ``ISO_6393`` entry of one existing row, in place.

    Uses a single ``.loc[row, column]`` assignment. The previous chained form
    ``frame.loc[row].at[column] = value`` writes into a temporary row object
    and is not guaranteed to reach the frame (it never does under pandas
    Copy-on-Write).

    Raises ``KeyError`` when ``row_label`` is not in the index, because
    ``.loc`` assignment would otherwise silently append a new row.
    """
    if row_label not in codes_frame.index:
        raise KeyError(row_label)
    codes_frame.loc[row_label, "ISO_6393"] = iso_code


_set_iso_code(ud_codes, "UD_Western_Armenian-ArmTDP.txt", "hy")
_set_iso_code(mbert_codes, "armenian", "hy")
_set_iso_code(mbert_codes, "vowiki-latest-pages-articles", "vol")
# NOTE: drop() raises KeyError if this cell is re-run without reloading bible_codes.
bible_codes=bible_codes.drop(["crp.txt"])
_set_iso_code(bible_codes, "jap.txt", "jpn")


Function for extracting the vectors according to the language codes:

In [11]:
def get_l2v(dataset_codes):
    """Fetch the lang2vec ``syntax_knn`` vectors for a dataset's languages.

    ``dataset_codes`` must have an ``ISO_6393`` column; its values are
    lower-cased and passed to ``l2v.get_features``. The mapping that comes
    back is turned into a DataFrame and transposed (presumably one row per
    language code and one column per syntactic feature -- confirm against the
    lang2vec documentation).
    """
    # ISO codes used to query lang2vec.
    iso_codes = dataset_codes["ISO_6393"].str.lower().tolist()

    # "syntax_knn" is the feature set in use; "syntax_average" was the
    # alternative tried earlier.
    per_language = l2v.get_features(iso_codes, "syntax_knn")

    return pd.DataFrame.from_dict(per_language).transpose()

Adapting to the variable names used by the Jaccard code above, so we can reuse it:

In [12]:
# Sum every feature column over the languages of a dataset, giving
# (feature, frequency) pairs. Each dataset is queried ONCE and both the
# DataFrame view and the dict view are derived from the same totals
# (previously get_l2v ran twice per dataset, i.e. 20 lang2vec lookups).
feature_sums = {
    "clc": get_l2v(teddi_codes).sum(),
    "xcopa": get_l2v(xcopa_codes).sum(),
    "xquad": get_l2v(xquad_codes).sum(),
    "tydiqa": get_l2v(tydiqa_codes).sum(),
    "xnli": get_l2v(xnli_codes).sum(),
    "xtreme": get_l2v(xtreme_codes).sum(),
    "xglue": get_l2v(xglue_codes).sum(),
    "ud": get_l2v(ud_codes).sum(),
    "mbert": get_l2v(mbert_codes).sum(),
    "bibles": get_l2v(bible_codes).sum(),
}

# One-column DataFrames (index = feature, value = frequency).
df_clc_10k_freqs=feature_sums["clc"].to_frame()
df_xcopa_10k_freqs=feature_sums["xcopa"].to_frame()
df_xquad_10k_freqs=feature_sums["xquad"].to_frame()
df_tydiqa_10k_freqs=feature_sums["tydiqa"].to_frame()
df_xnli_10k_freqs=feature_sums["xnli"].to_frame()
df_xtreme_10k_freqs=feature_sums["xtreme"].to_frame()
df_xglue_10k_freqs=feature_sums["xglue"].to_frame()
df_ud_10k_freqs=feature_sums["ud"].to_frame()
df_mbert_10k_freqs=feature_sums["mbert"].to_frame()
df_bibles_10k_freqs=feature_sums["bibles"].to_frame()

# {feature: frequency} dictionaries, the input format of scaler1 / jaccard_index.
bibles_10k_freqs=feature_sums["bibles"].to_dict()
clc_10k_freqs=feature_sums["clc"].to_dict()
xcopa_10k_freqs=feature_sums["xcopa"].to_dict()
xquad_10k_freqs=feature_sums["xquad"].to_dict()
tydiqa_10k_freqs=feature_sums["tydiqa"].to_dict()
xnli_10k_freqs=feature_sums["xnli"].to_dict()
xtreme_10k_freqs=feature_sums["xtreme"].to_dict()
xglue_10k_freqs=feature_sums["xglue"].to_dict()
ud_10k_freqs=feature_sums["ud"].to_dict()
mbert_10k_freqs=feature_sums["mbert"].to_dict()

---


**Teddi (100LC) vs UD**

In [13]:
# Scaling1: scaler1 multiplies the dictionary with the smaller feature total
# by (larger total / smaller total), so UD and TeDDi end up with equal totals.
# Note: clc_10k_freqs_norm1 / df_clc_10k_freqs_norm1 are reassigned by every
# dataset section below, so they always reflect the most recently run pair.
ud_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(ud_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_ud_10k_freqs_norm1=pd.DataFrame.from_dict(ud_10k_freqs_norm1, orient='index')

Jaccard's score between Teddi and UD:

In [15]:
# jaccard_index returns a tuple; element [0] is the score itself.
# First line: raw feature totals. Second line: totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(ud_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(ud_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.6545731044970672
Jaccard's index (scaling1) 0.736327204327871




---


**100 BIBLE CORPUS**

In [17]:
# normalizing1: rescale the smaller of the two feature totals (see scaler1).
# Reuses the totals already computed as bibles_10k_freqs / clc_10k_freqs
# instead of querying lang2vec again for both datasets, consistent with the
# other dataset sections.
bibles_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(bibles_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_bibles_10k_freqs_norm1=pd.DataFrame.from_dict(bibles_10k_freqs_norm1, orient='index')


Jaccard score between the 100 Bible corpus and 100LC:

In [18]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(bibles_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(bibles_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.7575830524795378
Jaccard's index (scaling1) 0.8109984079295679




---


**XCOPA**


In [19]:
# normalizing1: scaler1 scales up whichever dictionary has the smaller total
# so that XCOPA and TeDDi sum to the same value.
# clc_10k_freqs_norm1 / df_clc_10k_freqs_norm1 are overwritten here.
xcopa_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(xcopa_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_xcopa_10k_freqs_norm1=pd.DataFrame.from_dict(xcopa_10k_freqs_norm1, orient='index')

Jaccard score between Xcopa corpus and 100LC:

In [20]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(xcopa_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(xcopa_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.1189767995240928
Jaccard's index (scaling1) 0.7367013445714917




---


**TyDiQA**

In [21]:
# NOTE(review): df_tydiqa_10k_freqs was already built from the same totals in
# the cell that creates all *_10k_freqs tables; this line rebuilds it from the
# dict. No other dataset section does this -- looks redundant, confirm.
df_tydiqa_10k_freqs=pd.DataFrame.from_dict(tydiqa_10k_freqs, orient='index')
# normalizing1: scaler1 scales up whichever dictionary has the smaller total
# so that TyDiQA and TeDDi sum to the same value.
tydiqa_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(tydiqa_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_tydiqa_10k_freqs_norm1=pd.DataFrame.from_dict(tydiqa_10k_freqs_norm1, orient='index')

Jaccard's score between 100LC and TydiQA

In [22]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(tydiqa_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(tydiqa_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.12526033918476645
Jaccard's index (scaling1) 0.7512151925836588




---
**XQUAD**


In [23]:
# normalizing1: scaler1 scales up whichever dictionary has the smaller total
# so that XQuAD and TeDDi sum to the same value.
# clc_10k_freqs_norm1 / df_clc_10k_freqs_norm1 are overwritten here.
xquad_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(xquad_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_xquad_10k_freqs_norm1=pd.DataFrame.from_dict(xquad_10k_freqs_norm1, orient='index')

Jaccard's score between 100LC and Xquad

In [24]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(xquad_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(xquad_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.1430696014277216
Jaccard's index (scaling1) 0.68042417173084




---
**XNLI**


In [26]:
# normalizing1: scaler1 scales up whichever dictionary has the smaller total
# so that XNLI and TeDDi sum to the same value.
# clc_10k_freqs_norm1 / df_clc_10k_freqs_norm1 are overwritten here.
xnli_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(xnli_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_xnli_10k_freqs_norm1=pd.DataFrame.from_dict(xnli_10k_freqs_norm1, orient='index')

In [27]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(xnli_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(xnli_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.17846519928613921
Jaccard's index (scaling1) 0.7105759671292086




---
**XGLUE**

In [28]:

# normalizing1: scaler1 scales up whichever dictionary has the smaller total
# so that XGLUE and TeDDi sum to the same value.
# clc_10k_freqs_norm1 / df_clc_10k_freqs_norm1 are overwritten here.
xglue_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(xglue_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_xglue_10k_freqs_norm1=pd.DataFrame.from_dict(xglue_10k_freqs_norm1, orient='index')

Jaccard's score between 100LC and Xglue:

In [29]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(xglue_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(xglue_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.22665080309339677
Jaccard's index (scaling1) 0.6742992776098217




---
**XTREME**



In [30]:
# normalizing1: scaler1 scales up whichever dictionary has the smaller total
# so that XTREME and TeDDi sum to the same value.
# clc_10k_freqs_norm1 / df_clc_10k_freqs_norm1 are overwritten here.
xtreme_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(xtreme_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_xtreme_10k_freqs_norm1=pd.DataFrame.from_dict(xtreme_10k_freqs_norm1, orient='index')

Jaccard's score between 100LC and XTREME:

In [31]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
print("Jaccard's index (no scaling)", jaccard_index(xtreme_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(xtreme_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.46179006839131725
Jaccard's index (scaling1) 0.775188919443282




---
**Mbert (wiki data)**


In [32]:
# normalizing1: scaler1 scales up whichever dictionary has the smaller total
# so that the mBERT wiki data and TeDDi sum to the same value.
# clc_10k_freqs_norm1 / df_clc_10k_freqs_norm1 are overwritten here.
mbert_10k_freqs_norm1, clc_10k_freqs_norm1=scaler1(mbert_10k_freqs,clc_10k_freqs)
df_clc_10k_freqs_norm1=pd.DataFrame.from_dict(clc_10k_freqs_norm1, orient='index')
df_mbert_10k_freqs_norm1=pd.DataFrame.from_dict(mbert_10k_freqs_norm1, orient='index')

Jaccard's score between 100LC and MBERT:

In [33]:
# Element [0] of jaccard_index's return tuple is the score.
# Raw totals first, then the totals rescaled by scaler1.
# (A stray "Teddi" token after the second print made this cell a SyntaxError.)
print("Jaccard's index (no scaling)", jaccard_index(mbert_10k_freqs,clc_10k_freqs)[0])
print("Jaccard's index (scaling1)", jaccard_index(mbert_10k_freqs_norm1,clc_10k_freqs_norm1)[0])

Jaccard's index (no scaling) 0.6730947955390335
Jaccard's index (scaling1) 0.7104482110006572
