In [1]:
from tqdm.notebook import tqdm
import datasets

import semiolog as slg

In [2]:
semiotic = slg.Cenematic("abacus")

SLG [I]: Checking config correctness... Config correct!
SLG [I]: Dataset loaded from disk (dataset file)
SLG [I]: Vocabulary loaded from disk


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at models/abacus/paradigms/tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


SLG [I]: Paradigmatizer loaded from disk


In [3]:
meno_corpus = semiotic.corpus.train.filter(lambda paragraph: "rotto" in paragraph["text"] or "rotti" in paragraph["text"])

100%|██████████| 22/22 [00:00<00:00, 72.28ba/s]


In [53]:
meno_paradigms = []
meno_sources = []
meno_text = []
meno_label = []
for passage in tqdm(meno_corpus):
    source = passage["source"]
    text = semiotic(passage["text"], paradigms=False)
    for i,token in enumerate(text.chain.tokens[:512]):
        if token.label == "rotto" or token.label == "rotti":
            meno_parad = semiotic.paradigmatic.paradigmatizer(text.chain, i)
            meno_paradigms.append(meno_parad)
            meno_sources.append(source)
            meno_text.append(text.chain)
            meno_label.append(token.label)

  0%|          | 0/307 [00:00<?, ?it/s]

In [59]:
from datasets import Dataset

meno_parad_dataset = Dataset.from_dict(
    {
        "ids": [parad.ids for parad in meno_paradigms],
        "keys": [parad.keys for parad in meno_paradigms],
        "probs": [parad.probs for parad in meno_paradigms],
        "values": [parad.values for parad in meno_paradigms],
        "sources": meno_sources,
        "text": [i.input for i in meno_text],
        "label": meno_label,
    }
)

In [60]:
meno_parad_dataset.save_to_disk(semiotic.paths.semiotic / "analysis/rotto_parads")

In [61]:
meno_parad_dataset = datasets.load_from_disk(semiotic.paths.semiotic / "analysis/rotto_parads")

In [62]:
from scipy.sparse import lil_matrix
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from collections import Counter

In [63]:
X = lil_matrix((meno_parad_dataset.num_rows,semiotic.vocab.len))
for row, (columns,values) in tqdm(enumerate(zip(meno_parad_dataset["ids"],meno_parad_dataset["values"]))):
    indexes_i = [int(i) for i in columns]
    values_i = [float(i) for i in values]
    for i,v in zip(indexes_i,values_i):
        X[row,i]=v     

0it [00:00, ?it/s]

In [64]:
X

<473x69257 sparse matrix of type '<class 'numpy.float64'>'
	with 291191 stored elements in List of Lists format>

In [65]:
X = X[:row+1,:semiotic.vocab.len+1].tocsr()

#Trim non-zeroes rows and columns from X
trim_X = False
if trim_X:
    for n in range(1,M.shape[0]):
        if X[-n].nnz!=0:
            max_row = M.shape[0]-n
            break
    for n in range(1,M.shape[1]):
        if X[:,-n].nnz!=0:
            max_col = M.shape[1]-n
            break
        
    print(f"Trimmed Size: {max_row,max_col}")

    X = X[:max_row+1,:max_col+1]

In [66]:
n_cluster = 2
kmeans = KMeans(n_clusters=n_cluster, random_state=0, verbose = 1).fit(X)

Initialization complete
Iteration 0, inertia 1.6040814745861.
Iteration 1, inertia 1.020836137475369.
Iteration 2, inertia 1.0137977255979655.
Iteration 3, inertia 1.0074158807560503.
Iteration 4, inertia 0.9971323840960161.
Iteration 5, inertia 0.9879104874459728.
Iteration 6, inertia 0.9842695658471884.
Iteration 7, inertia 0.9814137464171574.
Iteration 8, inertia 0.9772371502930655.
Iteration 9, inertia 0.9724049546911449.
Iteration 10, inertia 0.9668273476189755.
Iteration 11, inertia 0.9650884659293724.
Iteration 12, inertia 0.964696878949511.
Iteration 13, inertia 0.9646048327222075.
Iteration 14, inertia 0.9644732910362881.
Iteration 15, inertia 0.9644084673234162.
Iteration 16, inertia 0.9643674328694589.
Iteration 17, inertia 0.9643430686508505.
Converged at iteration 17: strict convergence.
Initialization complete
Iteration 0, inertia 1.7548884006824936.
Iteration 1, inertia 1.0092553720222952.
Iteration 2, inertia 1.0004615180314007.
Iteration 3, inertia 0.9902713708224482.


In [67]:
cluster_labels = []
cluster_values = []
cluster_ids = []

for cluster in kmeans.cluster_centers_:
    c_indeces = np.nonzero(cluster)[0]
    c_values = cluster[c_indeces]
    c_order = list(zip(c_values,c_indeces))
    cluster_ids.append([i for v,i in sorted(c_order,reverse=True)])
    cluster_labels.append([semiotic.vocab.decode[i] for v,i in sorted(c_order,reverse=True)[:40]])
    cluster_values.append(sorted(c_values, reverse=True))

In [None]:
rotto_parad_dataset =rotto_parad_dataset

In [27]:
cluster_labels = [[semiotic.vocab.decode[id] for id in cluster[:40]] for cluster in cluster_ids]

In [81]:
rotto_parad_dataset[0]["text"]

'1014\tAbiamo dicto del giongimento et del sobtraemento de numeri rotti. Ora diremo che parte è l’uno numero dell’altro. Et primamente diremo cosi, quanto è 2 de 2. Fa cosi, corno tu vidi, 2 se scrive, et ponse uno de sopre et tre de sotto, et una verga (fol. 16v) (inV mezzo, et 2 se scrive 5 de sopre et el 7 de sotto, et una vergha in mezzo. Ora multiplica le parti de sopre, cioè l’uno et el cinque, l’uno contra all’al-tro, che fa 1 via 5, fa 5. Et similmente multiplica le parti de sotto, cioè el 3 contra el 7, che fa 21. Et poi poni 5 de sopre a 21, et fa una vergha in mezzo, et serà 2.. Et 2. dirremo che serà el terzo de 2, et sta 21\t7'

In [28]:
slg.util.df(cluster_labels)

Unnamed: 0,0,1
0,rotto,rotti
1,sano,","
2,numero,che
3,rotti,e
4,partire,2
5,rocto,3
6,chente,numero
7,",",1
8,e,.
9,partitore,5


In [29]:
set_0 = set(cluster_labels[0])
set_1 = set(cluster_labels[1])

In [30]:
set_0.intersection(set_1)

{',',
 '2',
 'che',
 'e',
 'in',
 'meno',
 'modo',
 'numeri',
 'numero',
 'parte',
 'partire',
 'per',
 'più',
 'rotti',
 'rotto',
 'sopra',
 'uno'}

In [31]:
set_1-set_0

{"'",
 '.',
 '/',
 '1',
 '10',
 '11',
 '12',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 'a',
 'come',
 'di',
 'et',
 'fare',
 'insieme',
 'si',
 'sono',
 'è',
 '’'}

In [33]:
set_0-set_1

{'chente',
 'contiene',
 'contra',
 'contro',
 'dinanzi',
 'dividere',
 'intero',
 'multiplicare',
 'multiplicato',
 'multipricare',
 'numerj',
 'partito',
 'partitore',
 'quanto',
 'rechare',
 'rimanente',
 'rocto',
 'rottj',
 'sano',
 'ssano',
 'tal',
 'tale',
 'vergha'}

In [39]:
X

<473x69257 sparse matrix of type '<class 'numpy.float64'>'
	with 291191 stored elements in Compressed Sparse Row format>

In [68]:
kmeans.labels_

array([1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,

In [78]:
meno_parad_dataset[0]

{'ids': [1128,
  231,
  524,
  176,
  201,
  156,
  723,
  16,
  935,
  18,
  587,
  22,
  236,
  25,
  272,
  308,
  1860,
  4562,
  2050,
  332,
  44,
  849,
  561,
  1957,
  3129,
  4437,
  1437,
  888,
  197,
  42,
  4692,
  2702,
  6,
  993,
  7503,
  14,
  2085,
  988,
  3756,
  4957,
  60,
  3874,
  94,
  3308,
  10777,
  1045,
  948,
  112,
  125,
  1148,
  1110,
  13,
  916,
  1117,
  341,
  698,
  261,
  4724,
  77,
  4741,
  889,
  1082,
  2477,
  11,
  173,
  1362,
  644,
  5074,
  11153,
  4142,
  1839,
  3160,
  12935,
  5257,
  1585,
  1370,
  502,
  1222,
  169,
  1345,
  1041,
  10505,
  5694,
  2786,
  206,
  7709,
  144,
  954,
  356,
  5,
  46,
  10843,
  3229,
  3660,
  213,
  97,
  9204,
  837,
  7032,
  10363,
  1197,
  2913,
  228,
  3850,
  182,
  7829,
  3219,
  957,
  1632,
  9385,
  5977,
  6356,
  422,
  82,
  470,
  2623,
  120,
  792,
  832,
  1084,
  2303,
  215,
  797,
  323,
  3033,
  3303,
  59,
  1400,
  1636,
  3661,
  4272,
  2955,
  2233,
  8363,


In [34]:
cluster_0 = Counter([source for source, cluster in zip(meno_parad_dataset["sources"], kmeans.labels_) if cluster == 0])
cluster_1 = Counter([source for source, cluster in zip(meno_parad_dataset["sources"], kmeans.labels_) if cluster == 1])

In [35]:
cluster_0.most_common()

[('Arrighi_G_1987{f}_Gherardi+Liber habaci_S', 40),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài ragioni - Liber habaci-Maria Pacini Pazzi (1987)',
  38),
 ('Gregori_S_&Grugnetti_1998__Anonimo_Libro di conti e mercatanzie_S', 33),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 27),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Culture (Science Networks. Historical Studies) (2007)",
  22),
 ('Simi_A_1995_Jacopo-Riccardiana 2236_S', 14),
 ("Arrighi_G_1966{b}_Paolo dell'Abbaco_Regoluzze_S", 7),
 ('Arrighi_G_1964_(ps)-Paolo_S', 5),
 ('Simi_A_1994_Alcibra amuchabile_S', 2),
 ("Franci_R_2015_Un trattato d'abaco pisano della fine del XIII secolo_S_OK",
  1),
 ("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 1)]

In [36]:
cluster_1.most_common()

[("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 43),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Culture (Science Networks. Historical Studies) (2007)",
  37),
 ('Gregori_S_&Grugnetti_1998__Anonimo_Libro di conti e mercatanzie_S', 32),
 ("Franci_R_1988{c}_&Pancanti_Trattato d'algibra_S", 26),
 ('Simi_A_1995_Jacopo-Riccardiana 2236_S', 17),
 ('Arrighi_G_1967{a}_Mazzingi-fioretti_S', 15),
 ("Franci_R_2015_Un trattato d'abaco pisano della fine del XIII secolo_S_OK",
  15),
 ("Franci_R_2015_Un trattato d'abaco pisano della fine del XIII secolo (MS)_Working copy",
  14),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 13),
 ("Pieraccini_L_1983_Maestro Biagio_Chasi exemplari alla regola dell'argibra_S",
  13),
 ('Arrighi_G_1964_(ps)-Paolo_S', 11),
 ('Arrighi_G_1969_Filippo Calandri_Trattato di Aritmetica_S', 8),
 ("Arrighi_G_1966{b}_Paolo dell'Abbaco_Regoluzze_S", 7),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài r

In [37]:
Counter(kmeans.labels_)

Counter({1: 283, 0: 190})

In [52]:
X_c1 = X[kmeans.labels_==1,:]
X_c1

In [55]:
n_cluster = 2
kmeans_c1 = KMeans(n_clusters=n_cluster, random_state=0, verbose = 1).fit(X_c1)

Initialization complete
Iteration 0, inertia 11.7614729529003.
Iteration 1, inertia 6.730553890138759.
Iteration 2, inertia 6.696655170902777.
Iteration 3, inertia 6.687819613592723.
Iteration 4, inertia 6.680380246857858.
Iteration 5, inertia 6.6773699175908625.
Iteration 6, inertia 6.677224856688383.
Iteration 7, inertia 6.677220337667631.
Converged at iteration 7: strict convergence.
Initialization complete
Iteration 0, inertia 13.365438548975982.
Iteration 1, inertia 6.370856283732304.
Iteration 2, inertia 6.225094318274216.
Iteration 3, inertia 6.187839608214192.
Iteration 4, inertia 6.180719308667806.
Iteration 5, inertia 6.177077484021982.
Iteration 6, inertia 6.175396600123566.
Iteration 7, inertia 6.17477998379796.
Iteration 8, inertia 6.174326817193588.
Iteration 9, inertia 6.174082301350686.
Iteration 10, inertia 6.1738636888004885.
Iteration 11, inertia 6.173835618522888.
Iteration 12, inertia 6.173831979576689.
Converged at iteration 12: strict convergence.
Initialization 

In [59]:
c_1_labels = []
c_1_values = []
c_1_ids = []

for cluster in kmeans_c1.cluster_centers_:
    c1_indeces = np.nonzero(cluster)[0]
    c1_values = cluster[c1_indeces]
    c1_order = list(zip(c1_values,c1_indeces))
    c_1_ids.append([i for v,i in sorted(c1_order,reverse=True)])
    c_1_labels.append([semiotic.vocab.decode[i] for v,i in sorted(c1_order,reverse=True)[:40]])
    c_1_values.append(sorted(c1_values, reverse=True))

In [60]:
slg.util.df(c_1_labels)

Unnamed: 0,0,1
0,meno,meno
1,e,e
2,più,più
3,",",fa
4,via,via
5,et,et
6,per,","
7,di,resta
8,è,è
9,a,per


In [61]:
set_10 = set(c_1_labels[0])
set_11 = set(c_1_labels[1])

In [62]:
set_10.intersection(set_11)

{',',
 '.',
 '/',
 ':',
 ';',
 'a',
 'che',
 'cioè',
 'con',
 'de',
 'di',
 'e',
 'et',
 'fa',
 'farà',
 'in',
 'meno',
 'per',
 'più',
 'resta',
 'sia',
 'sono',
 'vale',
 'via',
 'è'}

In [63]:
set_11-set_10

{'ad',
 'c',
 'cosa',
 'cose',
 'dà',
 'equale',
 'montiplica',
 'numero',
 'remane',
 'sirano',
 'sirà',
 'vaglano',
 'vene',
 '’',
 '□'}

In [64]:
set_10-set_11

{"'",
 'censi',
 'chon',
 'chosa',
 'chose',
 'da',
 'dj',
 'fanno',
 'la',
 'overo',
 'parte',
 'rimane',
 'vie',
 'viene',
 'vienne'}

In [84]:

cluster_10_dataset = [meno_parad_dataset["sources"][i] for i in np.where(kmeans.labels_==0)[0]]
cluster_10 = Counter([source for source, cluster in zip(cluster_10_dataset, kmeans_c1.labels_) if cluster == 0])


cluster_11_dataset = [meno_parad_dataset["sources"][i] for i in np.where(kmeans.labels_==1)[0]]
cluster_11 = Counter([source for source, cluster in zip(cluster_11_dataset, kmeans_c1.labels_) if cluster == 1])

In [86]:
cluster_10.most_common()

[('Arrighi_G_1967{a}_Mazzingi-fioretti_S', 701),
 ('Franci_R_2001_Dardi-Aliabraa argibra_S', 570),
 ("Franci_R_1988{c}_&Pancanti_Trattato d'algibra_S", 417),
 ("Pieraccini_L_1983_Maestro Biagio_Chasi exemplari alla regola dell'argibra_S",
  233),
 ("Franci_R_1983{a}_Gilio_Questioni d'algebra_S", 118),
 ('Arrighi_G_1967{c}_Trascelta di Giovanni di Bartolo_S', 98),
 ('Simi_A_1994_Alcibra amuchabile_S', 97),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 83),
 ('Gregori_S_&Grugnetti_1998__Anonimo_Libro di conti e mercatanzie_S', 37),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài ragioni - Liber habaci-Maria Pacini Pazzi (1987)',
  34),
 ('Arrighi_G_1987{f}_Gherardi+Liber habaci_S', 32),
 ("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 29),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Culture (Science Networks. Historical Studies) (2007)",
  21),
 ("Franci_R_2015_Un trattato d'abaco pisano della fine del XIII seco

In [87]:
cluster_11.most_common()

[("Franci_R_1988{c}_&Pancanti_Trattato d'algibra_S", 134),
 ('Franci_R_2001_Dardi-Aliabraa argibra_S', 130),
 ("Arrighi_G_1970_Piero-Trattato d'abbaco_S", 129),
 ('Arrighi_G_1967{a}_Mazzingi-fioretti_S', 107),
 ("Pieraccini_L_1983_Maestro Biagio_Chasi exemplari alla regola dell'argibra_S",
  55),
 ('Simi_A_1994_Alcibra amuchabile_S', 34),
 ('Arrighi_G_1967{c}_Trascelta di Giovanni di Bartolo_S', 30),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 28),
 ("Franci_R_1983{a}_Gilio_Questioni d'algebra_S", 27),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài ragioni - Liber habaci-Maria Pacini Pazzi (1987)',
  25),
 ('Arrighi_G_1987{f}_Gherardi+Liber habaci_S', 17),
 ("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 15),
 ('Gino Arrighi - Maestro Umbro (sec. XIII), LIVERO DE L’ABBECHO (1987)', 13),
 ('Gregori_S_&Grugnetti_1998__Anonimo_Libro di conti e mercatanzie_S', 13),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Cu

In [51]:
meno_corpus

Dataset({
    features: ['text', 'source'],
    num_rows: 307
})