# ReferentialGym: Example Analysis of the Emerging Language

## Data processing:

In [1]:
import pickle
import os 
from tqdm import tqdm
import gc 

def dict_rec_update(d, u):
    if not(isinstance(u,dict)):
        #then d and u are lists:
        for el in u:
            d.append(el)
        return d 
    
    for k in u:
        if k in d:
            d[k] = dict_rec_update(d[k],u[k])
        else:
            d[k] = u[k]
    return d
    
def check_nbr_epoch_in_data(path):
    data = [dict()]
    dicts = [(int(fn[10:]), fn) for fn in os.listdir(path) if  'logs.dict.' in fn]
    dicts.sort(key=lambda x: x[0])
    
    d_path = dicts[-1][1]
    file = open(os.path.join(path, d_path), "rb")
    cdata = pickle.load(file)
    nbr_epoch = len(cdata)
    print("There are {} epochs (testing and training).".format(nbr_epoch))
    file.close()
    del cdata
    gc.collect()
    return nbr_epoch
    
def load_data(path, epoch=0, verbose=True):
    dicts = [(int(fn[10:]), fn) for fn in os.listdir(path) if  'logs.dict.' in fn]
    dicts.sort(key=lambda x: x[0])
    
    data = dict()
    for didx in tqdm(range(len(dicts))):
        d_path = dicts[didx][1]
        file = open(os.path.join(path, d_path), "rb")
        cdata = pickle.load(file)
        
        if verbose:
            print("Looking for epoch {}. Log {} contains {} epochs.".format(epoch,didx,len(cdata)), end='\r')
        if len(cdata) < epoch+1:
            file.close()
            gc.collect()
            continue      
        data = dict_rec_update(data, cdata[epoch])
        file.close()
        gc.collect()
        
    return data

In [2]:
path = './'

In [26]:
nbr_epoch = check_nbr_epoch_in_data(path)

There are 7 epochs (testing and training).


In [27]:
epoch = nbr_epoch-2
data = load_data(path, epoch=epoch, verbose=False)

100%|██████████| 11/11 [00:05<00:00,  1.77it/s]


In [28]:
len(data['os0']['decision'])

2703

In [29]:
for agent_id in data:
    for key in data[agent_id]:
        data[agent_id][key] = [ (d[0],d[1]) for d in data[agent_id][key] if d[1] is not None]

In [30]:
for k1 in data:
    it_decisions = [ (s[0],s[1]) for s in data[k1]['decision'] if s[1] is not None]
    it_sentences = [ (s[0],s[1]) for s in data[k1]['sentences_widx'] if s[1] is not None]
    k2 = 'decision'
    print(f"There are {len(it_decisions)} proprer decisions for agent {k1}.")
    k2 = 'sentences_widx'
    print(f"There are {len(it_sentences)} proprer sentences for agent {k1}.")
    #print(f"Final iteration idx: {data[k1][k2][-1][0]}")

There are 0 proprer decisions for agent ol0.
There are 2703 proprer sentences for agent ol0.
There are 2703 proprer decisions for agent os0.
There are 0 proprer sentences for agent os0.


In [31]:
agents = [ values for k,values in data.items()]

In [32]:
agents[0].keys()

dict_keys(['decision', 'sentences_widx', 'sentences_logits', 'sentences_one_hot', 'temporal_features'])

## Levenshtein Distance:

In [33]:
# https://www.python-course.eu/levenshtein_distance.php
def compute_levenshtein_distance(s1, s2):
    rows = len(s1)+1
    cols = len(s2)+1
    dist = [[0 for x in range(cols)] for x in range(rows)]
    # source prefixes can be transformed into empty strings 
    # by deletions:
    for i in range(1, rows):
        dist[i][0] = i
    # target prefixes can be created from an empty source string
    # by inserting the characters
    for i in range(1, cols):
        dist[0][i] = i
    
    # From there, we can compute iteratively how many steps
    # are needed to transform the source prefix (at col) into
    # the target prefix (at row):
    for i in range(1, rows):
        for j in range(1, cols):
            if s1[i-1] == s2[j-1]:
                cost = 0
            else:
                cost = 1
            dist[i][j] = min(dist[i-1][j] + 1,      # deletion
                                 dist[i][j-1] + 1,      # insertion
                                 dist[i-1][j-1] + cost) # substitution
    return float(dist[-1][-1])

In [34]:
compute_levenshtein_distance([0,1,2,3],[0,2,3,4])

2.0

In [35]:
print(agents[0]['sentences_widx'][4][1])
print(agents[0]['sentences_widx'][100][1])
compute_levenshtein_distance(agents[0]['sentences_widx'][4][1],agents[0]['sentences_widx'][100][1])

[[4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]]
[[4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]
 [4.]]


0.0

## Cosine Similarity:

In [36]:
import numpy as np
from numpy import linalg as LA

In [37]:
def compute_cosine_sim(v1, v2):
    v1_norm = LA.norm(v1)
    v2_norm = LA.norm(v2)
    cos_sim = np.matmul(v1/v1_norm,(v2/v2_norm).transpose())
    return cos_sim

In [38]:
print(agents[0]['temporal_features'][4][1].shape)
print(agents[0]['temporal_features'][100][1].shape)
compute_cosine_sim(agents[0]['temporal_features'][4][1],agents[0]['temporal_features'][100][1])

(64,)
(64,)


0.9801021

##  Measure of Compositionality as Topographic Similarity (Negative Spearman Correlation):

In [39]:
from scipy.stats import spearmanr

In [40]:
def compute_topographic_similarity(sentences,features,comprange=100):
    levs = []
    for idx1 in tqdm(range(len(sentences))):
        s1 = sentences[idx1]
        tillidx = min(len(sentences)-1,idx1+1+comprange)
        for idx2, s2 in enumerate(sentences[idx1+1:tillidx]): 
            levs.append( compute_levenshtein_distance(s1,s2))
    cossims = []
    for idx1 in tqdm(range(len(features))):
        f1 = features[idx1]
        tillidx = min(len(sentences)-1,idx1+1+comprange)
        for idx2, f2 in enumerate(features[idx1+1:tillidx]): 
            cossims.append( compute_cosine_sim(f1,f2))
    
    rho, p = spearmanr(levs, cossims)
    return -rho, p, levs, cossims  

In [41]:
#%pdb on
agent_idx = 0
nbr_samples = 10000
it_sentences = [ (s[0],s[1]) for s in agents[agent_idx]['sentences_widx'][:nbr_samples] if s[1] is not None]
sentences = [s[1] for s in it_sentences]
features = [] 

sentence_idx = 0 
for idx, (it,tf) in enumerate(agents[agent_idx]['temporal_features']):
    if len(features) == len(sentences): 
            break
    
    sentence_it = it_sentences[sentence_idx][0]
    if sentence_it == it:
        features.append(tf)
        sentence_idx += 1

comprange = 10000
rho, p, levs, cossims = compute_topographic_similarity(sentences=sentences, features=features, comprange=comprange)

100%|██████████| 2703/2703 [13:19<00:00,  3.38it/s]
100%|██████████| 2703/2703 [01:16<00:00, 35.24it/s] 


In [43]:
print("Topographic similarity between the language space and the visual feature space: {} (with p={}).".format(rho,p))

Topographic similarity between the language space and the visual feature space: 0.7979867572986369 (with p=0.0).


In [44]:
zero_lev_dist_card = len([l for l in levs if l==0])
lev_dist_card = len(levs)
print("Number of 0-Levenshtein-distance pairs: {} / {} :: {}%.".format(zero_lev_dist_card, lev_dist_card, float(zero_lev_dist_card)/lev_dist_card*100.0))

threshold = 0.1
thresholded_cossim_dist_card = len([c for c in cossims if abs(c)< threshold])
cossims_dist_card = len(cossims)
print("Number of less-than-{}-cosine-distance pairs: {} / {} :: {}%.".format(threshold, thresholded_cossim_dist_card, cossims_dist_card, float(thresholded_cossim_dist_card)/cossims_dist_card*100.0))


Number of 0-Levenshtein-distance pairs: 661782 / 3649051 :: 18.13572898816706%.
Number of less-than-0.1-cosine-distance pairs: 0 / 3649051 :: 0.0%.


## Ambiguity:

In [45]:
def cardinality(data):
    if isinstance(data[0], np.ndarray):
        data_array = np.concatenate([np.expand_dims(d, 0) for d in data], axis=0)
        data_set = np.unique(data_array, axis=0)
    else:
        data_set = set(data)
    return len(data_set)

In [46]:
vocab_size = np.max(np.concatenate([ np.array([symbol for symbol in sentence]) for sentence in sentences]))
print("The size of the vocabulary: {} symbols.".format(vocab_size))

str_s = [ ''.join([chr(97+int(symbol)) for symbol in sentence if symbol<vocab_size]) for sentence in sentences]
nbr_unique_sentences = cardinality(str_s)
nbr_unique_stimulus = cardinality(features)

print("There are {} unique sentences out of {} different stimuli :: {} %.".format(nbr_unique_sentences, nbr_unique_stimulus, float(nbr_unique_sentences)/nbr_unique_stimulus*100.0))
print("Ambiguity: {} %.".format(float(nbr_unique_stimulus-nbr_unique_sentences)/nbr_unique_stimulus*100.0))

sentence_lengths = [ len([symbol for symbol in sentence if symbol<vocab_size]) for sentence in sentences]

'''
fullstr_s = [ ''.join([chr(97+int(symbol)) for symbol in sentence]) for sentence in sentences]
fullsentence_lengths = [ (len(sentence.replace('f','')), sentence) for sentence in fullstr_s]
for i in range(1000):
    print(fullsentence_lengths[i:i+1])
'''

min_s_length = min(sentence_lengths)
mean_s_length = np.mean(np.array(sentence_lengths))
print("The minimum sentence length is: {}. The average sentence length is: {}.".format(min_s_length, mean_s_length))

The size of the vocabulary: 4.0 symbols.
There are 386 unique sentences out of 2602 different stimuli :: 14.834742505764797 %.
Ambiguity: 85.1652574942352 %.
The minimum sentence length is: 0. The average sentence length is: 6.256751757306696.


## Ambiguity-regularized Compositionality:

In [47]:
amb = float(nbr_unique_stimulus-nbr_unique_sentences)/nbr_unique_stimulus
print("Ambiguity-regularized topographic similarity: {} (with p={}).".format(rho*(1-amb),p))

Ambiguity-regularized topographic similarity: 0.11837928067535505 (with p=0.0).


## Compositionality of unique sentences:

In [56]:
#%pdb on
agent_idx = 0
nbr_samples = 20000

it_sentences = [ (s[0],s[1]) for s in agents[agent_idx]['sentences_widx'][:nbr_samples] if s[1] is not None]

sentences = [s[1] for s in it_sentences]
np_sentences = np.concatenate( sentences, axis=1).transpose(1,0)

_, idx_unique_sentences = np.unique(np_sentences, axis=0, return_index=True)
idx_unique_sentences = sorted(idx_unique_sentences)

unique_it_sentences = [ it_sentences[idx] for idx in idx_unique_sentences]
unique_sentences = [s[1] for s in unique_it_sentences]

features = [ tf for idx, (it, tf) in enumerate(agents[agent_idx]['temporal_features'][:nbr_samples]) if idx in idx_unique_sentences]

print("There are {} unique sentences out of the {} sampled sentences.".format(len(unique_sentences), len(it_sentences)))

comprange = 20000
rho, p, levs, cossims = compute_topographic_similarity(sentences=unique_sentences, features=features, comprange=comprange)

  0%|          | 0/632 [00:00<?, ?it/s]

There are 632 unique sentences out of the 2703 sampled sentences.


100%|██████████| 632/632 [00:45<00:00, 13.92it/s]
100%|██████████| 632/632 [00:03<00:00, 183.74it/s]


In [57]:
print("Ambiguity-agnostic Topographic similarity: {} (with p={}).".format(rho,p))

Ambiguity-agnostic Topographic similarity: 0.8186521349080202 (with p=0.0).
