In [1]:
from tqdm.notebook import tqdm
import datasets

import semiolog as slg

In [2]:
# Open the "abacus" semiolog project; the log printed below reports the dataset,
# vocabulary and paradigmatizer being loaded from disk.
semiotic = slg.Cenematic("abacus")

SLG [I]: Checking config correctness... Config correct!
SLG [I]: Dataset loaded from disk (dataset file)
SLG [I]: Vocabulary loaded from disk


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at models/abacus/paradigms/tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


SLG [I]: Paradigmatizer loaded from disk


In [3]:
def _mentions_meno(paragraph):
    """Return True when the paragraph's text contains the substring "meno"."""
    return "meno" in paragraph["text"]


# Substring pre-filter on the training split; it also matches longer words that
# merely contain "meno".
meno_corpus = semiotic.corpus.train.filter(_mentions_meno)

In [None]:
# For every token labelled "meno" in the filtered corpus, store its paradigm and
# the source of the passage it came from. The two lists stay index-aligned.
meno_paradigms = []
meno_sources = []
for passage in tqdm(meno_corpus):
    passage_source = passage["source"]
    analysed = semiotic(passage["text"], paradigms=False)
    # Only the first 512 tokens are inspected -- presumably the model's maximum
    # input length; TODO confirm.
    for position, token in enumerate(analysed.chain.tokens[:512]):
        if token.label != "meno":
            continue
        meno_paradigms.append(semiotic.paradigmatic.paradigmatizer(analysed.chain, position))
        meno_sources.append(passage_source)

In [20]:
from datasets import Dataset

# One column per paradigm attribute, plus the source each paradigm was taken from.
paradigm_columns = {
    field: [getattr(parad, field) for parad in meno_paradigms]
    for field in ("ids", "keys", "probs", "values")
}
paradigm_columns["sources"] = meno_sources

meno_parad_dataset = Dataset.from_dict(paradigm_columns)

In [24]:
# Persist the paradigm dataset below the project's `paths.semiotic` directory.
meno_parads_dir = semiotic.paths.semiotic / "analysis/meno_parads"
meno_parad_dataset.save_to_disk(meno_parads_dir)

In [8]:
# Reload the paradigm dataset from the location it was saved to, so the cells
# below can work from the stored copy.
meno_parad_dataset = datasets.load_from_disk(semiotic.paths.semiotic / "analysis/meno_parads")

In [9]:
from scipy.sparse import lil_matrix
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from collections import Counter

In [10]:
# Build a sparse (paradigm x vocabulary) matrix: row r holds the "values" of
# paradigm r at the column positions listed in its "ids".
X = lil_matrix((meno_parad_dataset.num_rows, semiotic.vocab.len))

# Fetch each column once and pair them up row by row.
paradigm_pairs = zip(meno_parad_dataset["ids"], meno_parad_dataset["values"])

# tqdm cannot infer a length from zip()/enumerate(), so the total is passed
# explicitly to get a real progress bar. `row` stays the loop variable because
# a later cell reads it after the loop.
for row, (columns, values) in enumerate(tqdm(paradigm_pairs, total=meno_parad_dataset.num_rows)):
    for i, v in zip(columns, values):
        X[row, int(i)] = float(v)

0it [00:00, ?it/s]

In [17]:
# Show the matrix summary (shape, dtype, number of stored elements).
X

<5415x69257 sparse matrix of type '<class 'numpy.float64'>'
	with 2010926 stored elements in List of Lists format>

In [23]:
# Convert to CSR for the clustering cells. `row` is the last index left over
# from the matrix-filling loop, so the row slice keeps every filled row; the
# column bound lies past the matrix width and is expected to be clipped by the
# slice.
X = X[:row+1,:semiotic.vocab.len+1].tocsr()

# Optionally trim trailing all-zero rows and columns from X.
trim_X = False
if trim_X:
    # Coordinates of the stored non-zero entries give the last non-empty row and
    # column directly, including the case where only row/column 0 is populated.
    nz_rows, nz_cols = X.nonzero()
    if nz_rows.size:
        max_row = int(nz_rows.max())
        max_col = int(nz_cols.max())

        print(f"Trimmed Size: {max_row,max_col}")

        X = X[:max_row+1,:max_col+1]
    # An entirely empty matrix has nothing to trim and is left unchanged.

In [49]:
# Two-way k-means over the paradigm vectors; the seed is fixed so reruns agree.
n_cluster = 2
kmeans_options = {"n_clusters": n_cluster, "random_state": 0, "verbose": 1}
kmeans = KMeans(**kmeans_options).fit(X)

Initialization complete
Iteration 0, inertia 24.74780194541405.
Iteration 1, inertia 13.095099484662768.
Iteration 2, inertia 12.946315323470362.
Iteration 3, inertia 12.926670429423371.
Iteration 4, inertia 12.92268211722507.
Iteration 5, inertia 12.921148246925613.
Iteration 6, inertia 12.92070682132545.
Iteration 7, inertia 12.920592342988591.
Iteration 8, inertia 12.920526793888033.
Iteration 9, inertia 12.920503719829943.
Iteration 10, inertia 12.920489980542849.
Iteration 11, inertia 12.920450456811292.
Iteration 12, inertia 12.920441684636383.
Converged at iteration 12: strict convergence.
Initialization complete
Iteration 0, inertia 22.791632837484787.
Iteration 1, inertia 13.04738777435022.
Iteration 2, inertia 12.933247522774543.
Iteration 3, inertia 12.922367917390249.
Iteration 4, inertia 12.920961702600192.
Iteration 5, inertia 12.920641175180924.
Iteration 6, inertia 12.92051994290873.
Iteration 7, inertia 12.920476747416034.
Iteration 8, inertia 12.92046672783536.
Iterat

In [27]:
# For each centroid, rank its non-zero vocabulary ids by weight (descending) and
# keep the full id ranking, the labels of the top 40 and the sorted weights.
cluster_labels = []
cluster_values = []
cluster_ids = []

for center in kmeans.cluster_centers_:
    nonzero_ids = np.nonzero(center)[0]
    nonzero_weights = center[nonzero_ids]
    # Sort the (weight, id) pairs once; equal weights fall back to the larger id.
    ranked = sorted(zip(nonzero_weights, nonzero_ids), reverse=True)
    cluster_ids.append([vocab_id for weight, vocab_id in ranked])
    cluster_labels.append([semiotic.vocab.decode[vocab_id] for weight, vocab_id in ranked[:40]])
    cluster_values.append(sorted(nonzero_weights, reverse=True))

In [28]:
# Decode the 40 top-ranked ids of every cluster into their vocabulary labels.
cluster_labels = [
    [semiotic.vocab.decode[token_id] for token_id in ranked_ids[:40]]
    for ranked_ids in cluster_ids
]

In [29]:
# Render the per-cluster label lists as a table.
slg.util.df(cluster_labels)

Unnamed: 0,0,1
0,meno,meno
1,più,e
2,e,più
3,",",","
4,che,via
5,per,et
6,di,per
7,a,fa
8,è,è
9,.,di


In [31]:
# Label sets of the two clusters, for the set comparisons that follow.
set_0, set_1 = set(cluster_labels[0]), set(cluster_labels[1])

In [32]:
# Labels present in both clusters.
set_0 & set_1

{"'",
 ',',
 '.',
 '/',
 ':',
 ';',
 'a',
 'che',
 'cioè',
 'da',
 'de',
 'di',
 'dj',
 'e',
 'et',
 'fa',
 'in',
 'la',
 'meno',
 'numero',
 'per',
 'più',
 'resta',
 'sono',
 'vale',
 'via',
 'è',
 '’'}

In [33]:
# Labels found in cluster 1 but not in cluster 0.
set_1.difference(set_0)

{'c',
 'censi',
 'chose',
 'con',
 'fanno',
 'farà',
 'fu',
 'overo',
 'parte',
 'rimane',
 'sia',
 'vie'}

In [34]:
# Labels found in cluster 0 but not in cluster 1.
set_0.difference(set_1)

{'1', '10', '12', '2', '20', '3', '4', '5', '6', '7', 'del', 'sopra'}

In [35]:
# Count, per cluster, how many "meno" occurrences come from each source.
source_label_pairs = list(zip(meno_parad_dataset["sources"], kmeans.labels_))
cluster_0 = Counter(source for source, label in source_label_pairs if label == 0)
cluster_1 = Counter(source for source, label in source_label_pairs if label == 1)

In [38]:
# Sources contributing to cluster 0, most frequent first.
cluster_0.most_common()

[('Franci_R_2001_Dardi-Aliabraa argibra_S', 449),
 ("Franci_R_1988{c}_&Pancanti_Trattato d'algibra_S", 312),
 ("Arrighi_G_1970_Piero-Trattato d'abbaco_S", 225),
 ('Simi_A_1994_Alcibra amuchabile_S', 183),
 ("Pieraccini_L_1983_Maestro Biagio_Chasi exemplari alla regola dell'argibra_S",
  100),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 82),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Culture (Science Networks. Historical Studies) (2007)",
  79),
 ('Arrighi_G_1967{a}_Mazzingi-fioretti_S', 73),
 ("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 71),
 ('Simi_A_1992_Anonimo fiorentino_Regole di geometria e della cosa_S', 69),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài ragioni - Liber habaci-Maria Pacini Pazzi (1987)',
  66),
 ('Gino Arrighi - Maestro Umbro (sec. XIII), LIVERO DE L’ABBECHO (1987)', 65),
 ('Arrighi_G_1987{f}_Gherardi+Liber habaci_S', 63),
 ("Arrighi_G_1989{a}_Livero de l'abbecho_S", 54),
 ('Gre

In [39]:
# Sources contributing to cluster 1, most frequent first.
cluster_1.most_common()

[("Arrighi_G_1970_Piero-Trattato d'abbaco_S", 742),
 ('Arrighi_G_1967{a}_Mazzingi-fioretti_S', 701),
 ('Franci_R_2001_Dardi-Aliabraa argibra_S', 570),
 ("Franci_R_1988{c}_&Pancanti_Trattato d'algibra_S", 417),
 ("Pieraccini_L_1983_Maestro Biagio_Chasi exemplari alla regola dell'argibra_S",
  233),
 ("Franci_R_1983{a}_Gilio_Questioni d'algebra_S", 133),
 ('Arrighi_G_1967{c}_Trascelta di Giovanni di Bartolo_S', 98),
 ('Simi_A_1994_Alcibra amuchabile_S', 97),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 83),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Culture (Science Networks. Historical Studies) (2007)",
  54),
 ('Gregori_S_&Grugnetti_1998__Anonimo_Libro di conti e mercatanzie_S', 39),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài ragioni - Liber habaci-Maria Pacini Pazzi (1987)',
  36),
 ('Arrighi_G_1987{f}_Gherardi+Liber habaci_S', 34),
 ("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 30),
 ("Arrighi_G_198

In [40]:
# Number of rows assigned to each cluster.
Counter(kmeans.labels_)

Counter({0: 2100, 1: 3315})

In [52]:
# Keep only the rows that the first k-means assigned to cluster 1.
in_cluster_1 = kmeans.labels_ == 1
X_c1 = X[in_cluster_1, :]
X_c1

In [55]:
# Second-level two-way k-means, restricted to the cluster-1 rows in X_c1.
n_cluster = 2
kmeans_c1_options = {"n_clusters": n_cluster, "random_state": 0, "verbose": 1}
kmeans_c1 = KMeans(**kmeans_c1_options).fit(X_c1)

Initialization complete
Iteration 0, inertia 11.7614729529003.
Iteration 1, inertia 6.730553890138759.
Iteration 2, inertia 6.696655170902777.
Iteration 3, inertia 6.687819613592723.
Iteration 4, inertia 6.680380246857858.
Iteration 5, inertia 6.6773699175908625.
Iteration 6, inertia 6.677224856688383.
Iteration 7, inertia 6.677220337667631.
Converged at iteration 7: strict convergence.
Initialization complete
Iteration 0, inertia 13.365438548975982.
Iteration 1, inertia 6.370856283732304.
Iteration 2, inertia 6.225094318274216.
Iteration 3, inertia 6.187839608214192.
Iteration 4, inertia 6.180719308667806.
Iteration 5, inertia 6.177077484021982.
Iteration 6, inertia 6.175396600123566.
Iteration 7, inertia 6.17477998379796.
Iteration 8, inertia 6.174326817193588.
Iteration 9, inertia 6.174082301350686.
Iteration 10, inertia 6.1738636888004885.
Iteration 11, inertia 6.173835618522888.
Iteration 12, inertia 6.173831979576689.
Converged at iteration 12: strict convergence.
Initialization 

In [59]:
# For each sub-cluster centroid, rank its non-zero vocabulary ids by weight
# (descending) and keep the id ranking, the top-40 labels and the sorted weights.
c_1_labels = []
c_1_values = []
c_1_ids = []

for sub_center in kmeans_c1.cluster_centers_:
    sub_nonzero_ids = np.nonzero(sub_center)[0]
    sub_nonzero_weights = sub_center[sub_nonzero_ids]
    # Sort the (weight, id) pairs once; equal weights fall back to the larger id.
    sub_ranked = sorted(zip(sub_nonzero_weights, sub_nonzero_ids), reverse=True)
    c_1_ids.append([vocab_id for weight, vocab_id in sub_ranked])
    c_1_labels.append([semiotic.vocab.decode[vocab_id] for weight, vocab_id in sub_ranked[:40]])
    c_1_values.append(sorted(sub_nonzero_weights, reverse=True))

In [60]:
# Render the per-sub-cluster label lists as a table.
slg.util.df(c_1_labels)

Unnamed: 0,0,1
0,meno,meno
1,e,e
2,più,più
3,",",fa
4,via,via
5,et,et
6,per,","
7,di,resta
8,è,è
9,a,per


In [61]:
# Label sets of the two sub-clusters, for the set comparisons that follow.
set_10, set_11 = set(c_1_labels[0]), set(c_1_labels[1])

In [62]:
# Labels present in both sub-clusters.
set_10 & set_11

{',',
 '.',
 '/',
 ':',
 ';',
 'a',
 'che',
 'cioè',
 'con',
 'de',
 'di',
 'e',
 'et',
 'fa',
 'farà',
 'in',
 'meno',
 'per',
 'più',
 'resta',
 'sia',
 'sono',
 'vale',
 'via',
 'è'}

In [63]:
# Labels found in sub-cluster 1 but not in sub-cluster 0.
set_11.difference(set_10)

{'ad',
 'c',
 'cosa',
 'cose',
 'dà',
 'equale',
 'montiplica',
 'numero',
 'remane',
 'sirano',
 'sirà',
 'vaglano',
 'vene',
 '’',
 '□'}

In [64]:
# Labels found in sub-cluster 0 but not in sub-cluster 1.
set_10.difference(set_11)

{"'",
 'censi',
 'chon',
 'chosa',
 'chose',
 'da',
 'dj',
 'fanno',
 'la',
 'overo',
 'parte',
 'rimane',
 'vie',
 'viene',
 'vienne'}

In [84]:

# kmeans_c1 was fitted on X[kmeans.labels_ == 1], so its labels_ line up only
# with the sources of the rows that the first k-means put in cluster 1. Both
# sub-cluster counters must therefore draw from that same row selection.
all_sources = meno_parad_dataset["sources"]  # read the column once, not per index
cluster_1_sources = [all_sources[i] for i in np.where(kmeans.labels_ == 1)[0]]

# zip() would silently truncate on a length mismatch, so check alignment first.
assert len(cluster_1_sources) == len(kmeans_c1.labels_), "sources and sub-cluster labels are misaligned"

cluster_10_dataset = cluster_1_sources
cluster_10 = Counter(source for source, cluster in zip(cluster_10_dataset, kmeans_c1.labels_) if cluster == 0)


cluster_11_dataset = cluster_1_sources
cluster_11 = Counter(source for source, cluster in zip(cluster_11_dataset, kmeans_c1.labels_) if cluster == 1)

In [86]:
# Source counts for sub-cluster 0 of the kmeans_c1 split, most frequent first.
cluster_10.most_common()

[('Arrighi_G_1967{a}_Mazzingi-fioretti_S', 701),
 ('Franci_R_2001_Dardi-Aliabraa argibra_S', 570),
 ("Franci_R_1988{c}_&Pancanti_Trattato d'algibra_S", 417),
 ("Pieraccini_L_1983_Maestro Biagio_Chasi exemplari alla regola dell'argibra_S",
  233),
 ("Franci_R_1983{a}_Gilio_Questioni d'algebra_S", 118),
 ('Arrighi_G_1967{c}_Trascelta di Giovanni di Bartolo_S', 98),
 ('Simi_A_1994_Alcibra amuchabile_S', 97),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 83),
 ('Gregori_S_&Grugnetti_1998__Anonimo_Libro di conti e mercatanzie_S', 37),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài ragioni - Liber habaci-Maria Pacini Pazzi (1987)',
  34),
 ('Arrighi_G_1987{f}_Gherardi+Liber habaci_S', 32),
 ("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 29),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Culture (Science Networks. Historical Studies) (2007)",
  21),
 ("Franci_R_2015_Un trattato d'abaco pisano della fine del XIII seco

In [87]:
# Source counts for sub-cluster 1 of the kmeans_c1 split, most frequent first.
cluster_11.most_common()

[("Franci_R_1988{c}_&Pancanti_Trattato d'algibra_S", 134),
 ('Franci_R_2001_Dardi-Aliabraa argibra_S', 130),
 ("Arrighi_G_1970_Piero-Trattato d'abbaco_S", 129),
 ('Arrighi_G_1967{a}_Mazzingi-fioretti_S', 107),
 ("Pieraccini_L_1983_Maestro Biagio_Chasi exemplari alla regola dell'argibra_S",
  55),
 ('Simi_A_1994_Alcibra amuchabile_S', 34),
 ('Arrighi_G_1967{c}_Trascelta di Giovanni di Bartolo_S', 30),
 ("Arrighi_G_1973_Scuola lucchese--LIbro d'abaco_S", 28),
 ("Franci_R_1983{a}_Gilio_Questioni d'algebra_S", 27),
 ('Paolo Gherardi, Gino Arrighi - OPERA MATEMATICA - Libro ài ragioni - Liber habaci-Maria Pacini Pazzi (1987)',
  25),
 ('Arrighi_G_1987{f}_Gherardi+Liber habaci_S', 17),
 ("Arrighi_G_1974_P-M-Calandri[Benedetto]_Tractato-d'abbacho_S", 15),
 ('Gino Arrighi - Maestro Umbro (sec. XIII), LIVERO DE L’ABBECHO (1987)', 13),
 ('Gregori_S_&Grugnetti_1998__Anonimo_Libro di conti e mercatanzie_S', 13),
 ("Jens Hoyrup - Jacopo da Firenze's Tractatus Algorismi and Early Italian Abbacus Cu