## Download data from Zenodo

In [None]:
import requests
import os
# Datasets hosted on Zenodo. Each entry is a 5-tuple
#   (name, language, version, link, path_to_data)
# where `name` is also used as the local file/folder name under data/, `link` is the
# direct download URL of the zip archive, and `path_to_data` is the sub-path inside the
# archive that holds the `data` folder (empty for all Zenodo archives listed here).
datasets_to_download_zenodo = [('refwug', 'German', '1.1.0', 'https://zenodo.org/records/5791269/files/refwug.zip?download=1', ''), 
                            ('durel', 'German', '3.0.0', 'https://zenodo.org/records/5784453/files/durel.zip?download=1', ''),
                            ('surel', 'German', '3.0.0', 'https://zenodo.org/records/5784569/files/surel.zip?download=1', ''), 
                            ('chiwug', 'Chinese', '1.0.0', 'https://zenodo.org/records/10023263/files/chiwug.zip?download=1', ''),
                            ('dwug_de', 'German', '3.0.0', 'https://zenodo.org/records/14028509/files/dwug_de.zip?download=1', ''),
                            ('dwug_en', 'English', '3.0.0', 'https://zenodo.org/records/14028531/files/dwug_en.zip?download=1', ''),
                            ('dwug_sv', 'Swedish', '3.0.0', 'https://zenodo.org/records/14028906/files/dwug_sv.zip?download=1', ''),
                            ('dwug_de_resampled', 'German', '1.0.0', 'https://zenodo.org/records/12670698/files/dwug_de_resampled.zip?download=1', ''),
                            ('dwug_en_resampled', 'English', '1.0.0', 'https://zenodo.org/records/14025941/files/dwug_en_resampled.zip?download=1', ''),
                            ('dwug_sv_resampled', 'Swedish', '1.0.0', 'https://zenodo.org/records/14026615/files/dwug_sv_resampled.zip?download=1', ''),
                            ('discowug', 'German', '2.0.0', 'https://zenodo.org/records/14028592/files/discowug.zip?download=1', ''),
                            ('dwug_es', 'Spanish', '4.0.2', 'https://zenodo.org/records/14891659/files/dwug_es.zip?download=1', ''),
                            ('diawug', 'Spanish', '1.1.2', 'https://zenodo.org/records/14891461/files/diawug.zip?download=1', ''),
                                ]

# Make sure the download directory exists (no error if it is already there).
os.makedirs('data/', exist_ok=True)

# Download every Zenodo archive to data/<name>.zip
for name, language, version, link, path_to_data in datasets_to_download_zenodo:
    r = requests.get(link, allow_redirects=True)
    # Fail loudly on HTTP errors instead of silently saving an error page as a .zip file.
    r.raise_for_status()
    f = 'data/' + name + '.zip'
    # Context manager guarantees the file is flushed and closed before it is unzipped later.
    with open(f, 'wb') as zip_file:
        zip_file.write(r.content)

## Download data from GitHub

In [None]:
# Datasets hosted on GitHub. Each entry is a 5-tuple
#   (name, language, version, link, path_to_data)
# `version` is left empty here; `link` points to a zip of a repository branch, and
# `path_to_data` is the sub-path inside that archive which contains the `data` folder
# (and, where present, the `clusters` folder). Several entries share the same archive link.
datasets_to_download_github = [('rusemshift_1', 'Russian', '', 'https://github.com/Garrafao/RuSemShift/archive/refs/heads/correct-indices.zip', 'RuSemShift-correct-indices/rusemshift_1/DWUG/'), 
                                ('rusemshift_2', 'Russian', '', 'https://github.com/Garrafao/RuSemShift/archive/refs/heads/correct-indices.zip', 'RuSemShift-correct-indices/rusemshift_2/DWUG/'), 
                                ('rushifteval1', 'Russian', '', 'https://github.com/Garrafao/rushifteval_public/archive/refs/heads/correct-indices.zip', 'rushifteval_public-correct-indices/durel/rushifteval1/'), 
                                ('rushifteval2', 'Russian', '', 'https://github.com/Garrafao/rushifteval_public/archive/refs/heads/correct-indices.zip', 'rushifteval_public-correct-indices/durel/rushifteval2/'), 
                                ('rushifteval3', 'Russian', '', 'https://github.com/Garrafao/rushifteval_public/archive/refs/heads/correct-indices.zip', 'rushifteval_public-correct-indices/durel/rushifteval3/'), 
                                ('rudsi', 'Russian', '', 'https://github.com/Garrafao/RuDSI/archive/refs/heads/correct-indices.zip', 'RuDSI-correct-indices/'), 
                                ('nordiachange1', 'Norwegian', '', 'https://github.com/Garrafao/nor_dia_change/archive/refs/heads/correct-indices.zip', 'nor_dia_change-correct-indices/subset1/'), 
                                ('nordiachange2', 'Norwegian', '', 'https://github.com/Garrafao/nor_dia_change/archive/refs/heads/correct-indices.zip', 'nor_dia_change-correct-indices/subset2/')
                              ]
import shutil

# The download directory may not exist yet if this cell is run on its own.
os.makedirs('data/', exist_ok=True)

# Several datasets live in the same repository archive; remember which links were
# already fetched (link -> local zip path) so each archive is downloaded only once.
downloaded_archives = {}
for name, language, version, link, path_to_data in datasets_to_download_github:
    f = 'data/' + name + '.zip'
    if link in downloaded_archives:
        # Same archive as an earlier dataset: copy the local file instead of re-downloading.
        shutil.copyfile(downloaded_archives[link], f)
        continue
    r = requests.get(link, allow_redirects=True)
    # Fail loudly on HTTP errors instead of silently saving an error page as a .zip file.
    r.raise_for_status()
    # Context manager guarantees the file is flushed and closed before it is unzipped later.
    with open(f, 'wb') as zip_file:
        zip_file.write(r.content)
    downloaded_archives[link] = f

In [None]:
# All datasets, Zenodo entries first, followed by the GitHub entries.
datasets_all = [*datasets_to_download_zenodo, *datasets_to_download_github]

# Languages covered by the datasets above.
languages_global = [
    'German',
    'English',
    'Swedish',
    'Spanish',
    'Chinese',
    'Russian',
    'Norwegian',
]

## Unzip data and remove superfluous files

In [None]:
import zipfile
import shutil

# Unpack every archive and keep only its `data` folder (plus `clusters` when present)
# under data/<name>/; everything else is discarded together with the temp folder.
for name, language, version, link, path_to_data in datasets_all:
    target_dir = 'data/' + name
    archive = target_dir + '.zip'

    # Always start from an empty target directory.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

    if path_to_data == '':
        # No sub-path given: the archive is expected to unpack into a folder called <name>.
        with zipfile.ZipFile(archive) as z:
            z.extractall('data/temp')
        unpacked = 'data/temp/' + name
        dest = shutil.move(unpacked + '/data', target_dir)
        if os.path.exists(unpacked + '/clusters'):
            dest = shutil.move(unpacked + '/clusters', target_dir)
    else:
        # Sub-path given: unpack into a dataset-specific temp folder and pick the sub-path.
        with zipfile.ZipFile(archive) as z:
            z.extractall('data/temp/' + name)
        unpacked = 'data/temp/' + name + '/' + path_to_data
        dest = shutil.move(unpacked + '/data/', target_dir + '/data')
        if os.path.exists(unpacked + '/clusters/'):
            dest = shutil.move(unpacked + '/clusters/', target_dir + '/clusters')

    # Drop whatever is left of the unpacked archive.
    shutil.rmtree('data/temp/' + name)

In [None]:
import pandas as pd
from pathlib import Path
import unicodedata
import numpy as np

# Load datasets into data frames (judgments, uses, clusters).
#
# Two running indices are used to make identifiers unique:
#   j - position of the dataset in `datasets_all`
#   i - position of the lemma within its dataset
# The per-lemma files are visited in sorted order because `Path.glob` does not guarantee
# any particular order, while `i` must refer to the same lemma in all three passes
# (this is also what the consistency assert below checks).

def _lemma_files(dataset_name, filename):
    """Return the sorted paths data/<dataset_name>/data/<lemma>/<filename>."""
    return sorted(Path('data/' + dataset_name + '/data').glob('*/' + filename))


def _concat_frames(frames):
    """Concatenate all frames in one call; an empty list yields an empty DataFrame."""
    return pd.concat(frames) if frames else pd.DataFrame()


# Datasets whose use identifiers get the dataset index appended.
datasets_with_dataset_suffix = ['rushifteval1', 'rushifteval2', 'rushifteval3', 'rusemshift_1', 'rusemshift_2']

# --- judgments ---
judgment_frames = []
i2lemma2name_judgments = []
for j, (name, language, version, link, path_to_data) in enumerate(datasets_all):
    print(name)
    for i, p in enumerate(_lemma_files(name, 'judgments.csv')):
        # The lemma is the name of the folder containing the file.
        lemma = unicodedata.normalize('NFC', p.parent.name)
        # quoting=3 is csv.QUOTE_NONE; na_filter=False keeps empty fields as strings
        df = pd.read_csv(p, delimiter='\t', quoting=3, na_filter=False)
        df['dataset'] = name
        df['language'] = language
        df['annotator'] = df['annotator'].astype(str) + '-' + name # make sure annotators are unique across datasets
        if name in ['chiwug']:
            df['identifier1'] = df['identifier1'].astype(str) + '-' + str(i) # make sure identifiers are unique across words
            df['identifier2'] = df['identifier2'].astype(str) + '-' + str(i) # make sure identifiers are unique across words
        if name in ['rusemshift_1', 'rusemshift_2']: # only done for judgments of those datasets which will not be mapped later
            # don't do this for the German data where same identifiers mean same use
            df['identifier1'] = df['identifier1'].astype(str) + '-' + str(j) # make sure identifiers are unique across datasets
            df['identifier2'] = df['identifier2'].astype(str) + '-' + str(j) # make sure identifiers are unique across datasets
        df['judgment'] = df['judgment'].astype(float)
        judgment_frames.append(df)
        i2lemma2name_judgments.append((i, lemma, name))
df_judgments = _concat_frames(judgment_frames)

# --- uses ---
use_frames = []
i2lemma2name_uses = []
for j, (name, language, version, link, path_to_data) in enumerate(datasets_all):
    for i, p in enumerate(_lemma_files(name, 'uses.csv')):
        lemma = unicodedata.normalize('NFC', p.parent.name)
        df = pd.read_csv(p, delimiter='\t', quoting=3, na_filter=False)
        df['dataset'] = name
        df['language'] = language
        if name in ['chiwug']:
            df['identifier'] = df['identifier'].astype(str) + '-' + str(i) # make sure identifiers are unique across words
            df['lemma'] = df['lemma'].apply(lambda x: unicodedata.normalize('NFC', x))
        if name in datasets_with_dataset_suffix:
            df['identifier'] = df['identifier'].astype(str) + '-' + str(j) # make sure identifiers are unique across datasets
        use_frames.append(df)
        i2lemma2name_uses.append((i, lemma, name))
df_uses = _concat_frames(use_frames)

# --- clusters ---
cluster_frames = []
i2lemma2name_clusters = []
for j, (name, language, version, link, path_to_data) in enumerate(datasets_all):
    clusters_dir = Path('data/' + name + '/clusters')
    has_clusters = clusters_dir.exists()
    # iterate over the uses files to have the same order of data for clusters
    for i, p in enumerate(_lemma_files(name, 'uses.csv')):
        lemma = unicodedata.normalize('NFC', p.parent.name)
        if has_clusters:
            df = pd.read_csv(clusters_dir / 'opt' / (lemma + '.csv'), delimiter='\t', quoting=3, na_filter=False)
            df['dataset'] = name
            df['language'] = language
            df['lemma'] = lemma
            if name in ['chiwug']:
                df['identifier'] = df['identifier'].astype(str) + '-' + str(i) # make sure identifiers are unique across words
            if name in datasets_with_dataset_suffix:
                df['identifier'] = df['identifier'].astype(str) + '-' + str(j) # make sure identifiers are unique across datasets
            cluster_frames.append(df)
        # recorded for every lemma, with or without cluster file, for the consistency check
        i2lemma2name_clusters.append((i, lemma, name))
df_clusters = _concat_frames(cluster_frames)

# All three passes must have seen the same lemmas in the same order.
assert i2lemma2name_judgments == i2lemma2name_uses == i2lemma2name_clusters
assert 'nan' not in df_judgments['identifier1'].astype(str).unique()
assert 'nan' not in df_judgments['identifier2'].astype(str).unique()
df_judgments_length_before_sorting = len(df_judgments)
df_judgments[['identifier1','identifier2']] = np.sort(df_judgments[['identifier1','identifier2']], axis=1) # sort within pairs to be able to aggregate
assert df_judgments_length_before_sorting == len(df_judgments)

display(len(df_judgments))
display(len(df_uses))

In [None]:
# Show ten random rows of the judgments and of the uses table as a sanity check
for frame in (df_judgments, df_uses):
    display(frame.sample(n=10))

## Clean and aggregate data

In [None]:
# Replace 0.0 judgments with nan so that they are ignored by the median below.
# (`np.nan` is used because the alias `np.NaN` no longer exists in NumPy 2.0.)
df = df_judgments.copy()
df['judgment'] = df['judgment'].replace(0.0, np.nan)

# Aggregate use pairs: collect all judgments of a pair in a list ...
df = df.groupby(['identifier1', 'identifier2', 'lemma', 'dataset'])['judgment'].apply(list).reset_index(name='judgments')
# ... and take the median over the non-nan judgments (nan if all judgments of the pair are nan).
df['median_judgment'] = df['judgments'].apply(lambda judgments: np.nanmedian(judgments))

# NOTE: pairs with nan median are deliberately kept in the aggregated table.
df_judgments_aggregated = df.copy()
display(df_judgments_aggregated)

## Make graphs from data and plot

In [None]:
import matplotlib.colors as mcolors
# Named matplotlib colors, keeping only those whose value is given as a string
nice_colors = []
for color_value in mcolors.get_named_colors_mapping().values():
    if isinstance(color_value, str):
        nice_colors.append(color_value)

# Global palette: color-blind friendly colors first, then the named colors, then black
color_blind_colors = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00']
colors_global = color_blind_colors + nice_colors + ['#000000']

In [None]:
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt

def draw_graph(graph, is_weighted=False, blocks=None):
    """Draw a graph with matplotlib and return the node positions that were used.

    Parameters
    ----------
    graph : networkx graph
        Graph to draw. The function works on a copy; the argument is not modified.
    is_weighted : bool
        If True, every edge must carry a 'weight' attribute. Edges are then shaded by
        weight (grey scale from 0.0 to 4.0) and nodes are placed by a spring layout.
    blocks : sequence of int, optional
        Indices into `colors_global` used as node colors (presumably one per node, in
        the graph's node order). Defaults to index 0 for every node.

    Returns
    -------
    dict
        Node -> (x, y) position in the weighted case, an empty dict otherwise.
    """
    graph = graph.copy()

    if blocks is None:
        blocks = [0] * graph.number_of_nodes()

    plt.figure(1, figsize=(12, 12), dpi=60)

    if not is_weighted:
        nx.draw(graph, node_size=30, node_color=np.array(colors_global)[blocks])
        pos = {}
    else:
        edge_list = graph.edges()
        edge_weights = [graph[u][v]['weight'] for (u, v) in graph.edges()]

        # Node positions are computed on a separate copy with transformed weights:
        # pure weights don't reveal cluster structure in 2D, so they are raised to the 5th power.
        layout_graph = graph.copy()
        powered_edges = [(u, v, graph[u][v]['weight'] ** 5) for (u, v) in graph.edges()]
        layout_graph.add_weighted_edges_from(powered_edges)

        # Edges with nan weight are ignored when finding positions.
        nan_edges = [(u, v) for (u, v, attrs) in layout_graph.edges(data=True) if np.isnan(attrs['weight'])]
        layout_graph.remove_edges_from(nan_edges)

        layout = nx.spring_layout(layout_graph)

        # Halve all coordinates to reduce the picture size.
        pos = {}
        for node, xy in layout.items():
            pos[node] = (xy[0] / 2, xy[1] / 2)

        nx.draw(
            graph,
            pos=pos,
            node_size=30,
            node_color=np.array(colors_global)[blocks],
            edgelist=edge_list,
            edge_color=edge_weights,
            edge_cmap=plt.cm.Greys,
            edge_vmin=0.0,
            edge_vmax=4.0,
        )

    plt.show()
    plt.close()
    return pos

# Select one data set for simplicity
dataset_name = 'dwug_de_resampled'
df_judgments_aggregated_dataset = df_judgments_aggregated[df_judgments_aggregated['dataset'] == dataset_name]
df_uses_dataset = df_uses[df_uses['dataset'] == dataset_name]
df_clusters_dataset = df_clusters[df_clusters['dataset'] == dataset_name]

# Prepare the data
# Group by lemma
gb = df_judgments_aggregated_dataset.groupby('lemma')
groups = gb.groups

gb_uses = df_uses_dataset.groupby('lemma')
gb_clusters = df_clusters_dataset.groupby('lemma')

# construct separately for each lemma
for word in groups:
    print(word)
    df_group = gb.get_group(word)
    df_uses_group = gb_uses.get_group(word)
    df_clusters_group = gb_clusters.get_group(word)

    # Map each use identifier to its cluster label via the identifier column of the
    # cluster table. Pairing the tables by row position would silently assign wrong
    # clusters whenever the uses and clusters files list the uses in a different order.
    id2c = dict(zip(df_clusters_group['identifier'], df_clusters_group['cluster']))

    graph = nx.Graph()
    graph.add_nodes_from(df_uses_group['identifier'])

    # Add edge data to graph
    edges_weighted = list(zip(df_group['identifier1'], df_group['identifier2'], df_group['median_judgment']))
    graph.add_weighted_edges_from(edges_weighted)

    enan = [(u, v) for (u, v, d) in graph.edges(data=True) if np.isnan(d['weight'])] # get nan edges to remove
    graph.remove_edges_from(enan)  # remove nan edges
    isolates = list(nx.isolates(graph)) # get isolates to remove
    graph.remove_nodes_from(isolates) # remove isolates
    noise = [identifier for identifier, cluster in id2c.items() if cluster == -1] # get nodes with many 0-judgments (see DURel paper for explanation)
    graph.remove_nodes_from(noise) # remove noise cluster (identifiers not in the graph are ignored)
    # Cluster label per remaining node, in node order; raises KeyError if a node has no cluster entry
    clusters_clean = [id2c[node] for node in graph.nodes()]

    # Draw graph
    draw_graph(graph, is_weighted=True, blocks=clusters_clean)