In [1]:
from tqdm.notebook import tqdm
import datasets
from scipy.sparse import csr_matrix
from collections import Counter, defaultdict
import numpy as np
from sklearn.cluster import KMeans

from scipy.spatial import distance
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets
from functools import partial
import seaborn as sns

import semiolog as slg

In [2]:
sns.set(
    rc = {'figure.figsize':(25,20)},
    font="Courier"
    )

def plot(vector,
    xticklabels = None,
    yticklabels = None,
    vmin = None,
    vmax = None,
    labelbottom = False,
    labelright = False,
    save = None,

    ):
    if xticklabels == None:
        xticklabels = list(range(vector.shape[0]))
    if yticklabels == None:
        yticklabels = list(range(vector.shape[1]))
    hm = sns.heatmap(
        vector,
        xticklabels=xticklabels,
        yticklabels=yticklabels,
        linewidths=.5,
        cmap="coolwarm",
        center = 0,
        vmin = vmin,
        vmax = vmax,
        square=True,
        ).tick_params(
            axis='both',
            which='major',
            labelsize=11,
            labelbottom = labelbottom,
            labelright = labelright, 
            bottom=False, 
            top = False, 
            labeltop=True)
    
    plt.yticks(rotation=0) 
    if save != None:
        plt.savefig(save)
    return hm

In [3]:
semiotic = slg.Cenematic("abacus")

SLG [I]: Checking config correctness... Config correct!
SLG [I]: Dataset loaded from disk (dataset file)
SLG [I]: Vocabulary loaded from disk


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at models/abacus/paradigms/tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


SLG [I]: Paradigmatizer loaded from disk


Normalize Corpus

In [4]:
normalizer = slg.syntagmatic.tokenizer.Normalize_menoSLG.normalize_str

In [5]:
norm_vocab_encode = defaultdict(int)
i = 0
for key, val in semiotic.vocab.encode.items():
    key = normalizer(key)
    if key not in norm_vocab_encode:
        norm_vocab_encode[key] = i
        i+=1

Build Term-Context Matrix

In [36]:
context_size = 5 # n° of words to each side (L&R)
freq_counter = defaultdict(int)

for passage in tqdm(semiotic.corpus.train["text"]):
    passage = normalizer(passage)
    word_list = [""]*context_size + passage.split() + [""]*context_size
    for context in zip(*[word_list[i:] for i in range(context_size*2+1)]):
        row_i = norm_vocab_encode[context[context_size]]
        for c in context[:context_size]+context[context_size+1:]:
            if c != "":
                col_i = norm_vocab_encode[c]
                freq_counter[(row_i,col_i)] += 1


  0%|          | 0/21780 [00:00<?, ?it/s]

In [37]:
row_indxs = []
col_indxs = []
dat_vals = []
for (row,col),value in freq_counter.items():
    row_indxs.append(row)
    col_indxs.append(col)
    dat_vals.append(value)

M_freq = csr_matrix((dat_vals, (row_indxs, col_indxs)),shape=(semiotic.vocab.len,semiotic.vocab.len))
M_pmi = slg.util.pmi(M_freq, type_pmi="npmi")

Done


Build focus term corpus and matrix

In [38]:
focus_term = "meno"

focus_corpus = semiotic.corpus.train.filter(lambda paragraph: focus_term in paragraph["text"])
source_encode = {s:i for i,s in enumerate(sorted(list(set(focus_corpus["source"]))))}
source_decode = {i:s for i,s in enumerate(sorted(list(set(focus_corpus["source"]))))}

focus_counter = defaultdict(int)
focus_collocations = []
focus_sources = []
i=0
for passage in tqdm(focus_corpus):
    word_list = [""]*context_size + normalizer(passage["text"]).split() + [""]*context_size
    for context in zip(*[word_list[i:] for i in range(context_size*2+1)]):
        if context[context_size] == focus_term:
            focus_collocations.append(" ".join(context))
            focus_sources.append(source_encode[passage["source"]])
            row_i = i
            for c in context[:context_size]+context[context_size+1:]:
                if c != "":
                    col_i = norm_vocab_encode[c]
                    focus_counter[(row_i,col_i)] += 1
            i += 1
row_indxs = []
col_indxs = []
dat_vals = []
for (row,col),value in focus_counter.items():
    row_indxs.append(row)
    col_indxs.append(col)
    dat_vals.append(value)

M_focus = csr_matrix((dat_vals, (row_indxs, col_indxs)),shape=(i,semiotic.vocab.len))
M_focus_pmi = slg.util.pmi(M_focus, type_pmi="npmi")

  0%|          | 0/1960 [00:00<?, ?it/s]

Done


In [39]:
# slg.util.save_file(focus_collocations, semiotic.paths.corpus / "meno_collocations.txt")

Cluster focus matrix

In [40]:
n_cluster = 5
kmeans = KMeans(n_clusters=n_cluster, random_state=0, verbose = 0).fit(M_focus)

In [41]:
kmeans_pmi = slg.util.pmi(kmeans.cluster_centers_, type_pmi="npmi")

Done


In [42]:
methods = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
method = "ward"

row_n = 1400
col_n = 1400
spec_token_len = len(semiotic.vocab.config.special_tokens)

matrix = M_pmi[spec_token_len:row_n,spec_token_len:col_n].toarray()
elements = list(norm_vocab_encode.keys())[spec_token_len:col_n]

meno_vector = M_freq[norm_vocab_encode["meno"]]
# elements_col = [list(semiotic.vocab.freq.keys())[i] for i in meno_vector.indices]
elements_col = elements
elements_row = elements
elements_row_dict = {e:i for i,e in enumerate(elements_row)}
# matrix = M_pmi[:,meno_vector.indices][spec_token_len:row_n].toarray()


row_linkage = hierarchy.linkage(
    distance.pdist(matrix), method=method)
col_linkage = hierarchy.linkage(
    distance.pdist(matrix.T), method=method)

linkage  = col_linkage

In [43]:
truncate_label = 200
def llf(R, id):
    n = len(elements)
    if id < n:
        return elements[id]
    else:
        label = [id]
        while max(label)>=n:
            new_label = []
            for l in label:
                if l<n:
                    new_label.append(l)
                else:
                    new_label.append(int(R[l-n,0]))
                    new_label.append(int(R[l-n,1]))
            label = new_label

        label = "{ " + " ".join([elements[l] if (i+1)%3>0 else "\n"+elements[l] for i,l in enumerate(label[:truncate_label])]) + ("..." if len(label)>truncate_label else "") + " }"
        return label

fig = plt.figure(figsize=(25, 5))
plt.rcParams.update({'axes.facecolor':'white'})
# plt.rcdefaults()
@interact(p=(1,min(20,len(elements)),1), truncate_mode = ["lastp", "level"], save = False)
def h_clus_cols(p=min(20,len(elements)),truncate_mode = "lastp", save = False):
    plt.figure(figsize=(25, 5))
    hierarchy.dendrogram(
        linkage, #col_linkage,
        labels=elements,
        p = p,
        color_threshold = 3,
        truncate_mode = truncate_mode, #"lastp", #'lastp', None, "level"
        leaf_label_func=partial(llf, linkage),
        leaf_rotation=0,
        get_leaves = True
        );
    plt.grid(b=True, which='major', axis='y',color='lightgrey', linestyle='-', linewidth=.1)
    # plt.xlabel('Clusters',fontsize=16)
    plt.ylabel(f'distance ({method})',fontsize=16)
    plt.xticks(fontfamily="monospace",fontsize=10),
    if save:
        plt.savefig(path_media + "dendrogram.pdf");

<Figure size 1800x360 with 0 Axes>

interactive(children=(IntSlider(value=20, description='p', max=20, min=1), Dropdown(description='truncate_mode…

In [44]:
def format_labels(labels, truncate=None, width = 3):
    if isinstance(truncate,int) and len(labels)>truncate:
        labels = (labels[:truncate])+["..."]
    labels = "{" + "|".join([l if (i+1)%width>0 else "\n"+l for i,l in enumerate(labels)]) + "}"
    return labels

In [45]:
def type_matrix(row_thres=5, col_thres=5, concepts="meno",separate = False, focus_cluster = False):

    cluster_labels_rows = hierarchy.fcluster(row_linkage, row_thres, criterion='maxclust')
    cluster_labels_cols = hierarchy.fcluster(col_linkage, col_thres, criterion='maxclust')

    max_cluster_rows = max(cluster_labels_rows)
    max_cluster_cols = max(cluster_labels_cols)

    # This should be simpler without dict, calling elements of lists
    row_dict = {row:np.argwhere(cluster_labels_rows==row) for row in range(1,max_cluster_rows+1)}
    col_dict = {col:np.argwhere(cluster_labels_cols==col).flatten() for col in range(1,max_cluster_cols+1)}
    
    if concepts!=None:
        concepts = concepts.split()
        if separate:
            i = 1
            for c in concepts:
                if c in elements_row:
                    row_dict[row_thres+i]= np.array([[elements_row_dict.get(c)]]).T
                    i += 1
        
        else:
            row_dict[row_thres+1] = np.array([[elements_row_dict.get(c) for c in concepts if c in elements_row]]).T




    label_row = [np.array(elements_row)[row_dict[row].flatten()].tolist() for row in range(1,len(row_dict)+1)]
    label_row = [format_labels(row,10,4) for row in label_row]

    label_col = [np.array(elements_col)[col_dict[col]].tolist() for col in range(1,max_cluster_cols+1)]
    label_col = [format_labels(row,10,1) for row in label_col]


    type_m = np.zeros(shape=(len(row_dict), col_thres))
    for r in range(len(row_dict)):
        for c in range(col_thres):
            type_m[r,c] = M_pmi[row_dict[r+1],col_dict[c+1]].mean()

    if focus_cluster:
        type_m = np.vstack((type_m,kmeans_pmi[:,:type_m.shape[1]].toarray()))
        label_row = label_row + [f"{focus_term}_{i+1}" for i in range(kmeans_pmi.shape[0])]

            
    sns.set(
        rc = {'figure.figsize':(40,20)},
        font="Courier"
        )
    matrix_plot = plot(type_m,label_col,label_row)
    return matrix_plot

interact(type_matrix, row_thres=(2, 100, 1), col_thres=(2,100,1), concepts=widgets.Text("meno",description="Extra Cluster", continuous_update=False));

interactive(children=(IntSlider(value=5, description='row_thres', min=2), IntSlider(value=5, description='col_…

In [32]:
focus_corpus[100]["text"]


'Quando li cubi, li censi, le cose sono equali al numero, se vole partire le cose per li censi e quello che ne vene recare a ra-dici cuba c ponare sopra il numero; et radici cuba di quello meno il partimento che venne de le cose nelli censi, vale la cosa.'

In [33]:
normalizer(focus_corpus[1000]["text"])

'famme quista ragione uno me dèie dare d em 5 pagamenta divisate luno da laltro sì cho si chontiene qui da pièie la quale ragione vuole redure a uno termene o a doie termene o a doie pagamenta sì che sen stan luno laltro a dericto e a ponto devemo recevere da uno dì 9 esfiento octovre lib 1317 anche devemo recevere a dì 9 esfiente octovre lib 628 anche devemo avere a mego novenbre lib 293 anche devemo entrante decenbre lib 979 anche devemo avere em kl de genaio lib 2594 somma lib 5811 prima dèie avere quillo che remane en eli mego 1 d 217463 quista è la deritta regola chomo se dèie fare quista ragione e tutte le semeglante ragione che se podessero dire de più e de meno che noie devemo fare la somma de tutto el chapetale de la quale deveraie essere partedore la quale somma se dèie escrivere sì chomo sta qui de sopra e da che noie avemo chusì fatto sì devemo multiplichare tutte le lib de la prima ragione chom gle dì del primo termene em fine a laltro termene che sonno mese 7 e dì 1 più g

In [34]:
focus_collocations[:2]

['1698 meno dr', 'tante meno de']

In [35]:
focus_collocations

['1698 meno dr',
 'tante meno de',
 'quante meno de',
 '10 meno uno',
 'censo meno ',
 'io meno 1',
 'cotanto meno 100',
 'numero meno 32',
 'sechondo meno 9',
 '600 meno 20',
 '600 meno 20',
 'cosa meno 20',
 '1200 meno 20',
 'censi meno radice',
 'quante meno parti',
 'è meno che',
 'censo meno una',
 'io meno ix',
 '10 meno ix',
 '2400 meno 40',
 '2400 meno 40',
 '60 meno radici',
 '5400 meno 60',
 '5400 meno 60',
 'go meno radici',
 '2400 meno 40',
 '60 meno radici',
 '5400 meno 60',
 'go meno radici',
 'uno meno non',
 'somma meno laltra',
 '6 meno t',
 'io meno 2',
 'censi meno una',
 'cosa meno 1',
 'qualè meno e',
 'sono meno di',
 'chosa meno ed',
 'chosa meno ed',
 'cosa meno il',
 '20 meno la',
 '20 meno la',
 '84 meno una',
 'chosa meno ',
 '32 meno radice',
 'balle meno 3',
 '14 meno una',
 '19 meno 1',
 '19 meno 1',
 '95 meno 5',
 '95 meno 2',
 '95 meno 2',
 'chosa meno in',
 'faccino meno 4',
 'è meno 12',
 'è meno 12',
 'chosa meno fanno',
 '7 meno 12',
 'chosa meno et'