### Basic TSDAE

In [1]:
!pip install -Uqq sentence_transformers

from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

model_name = "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_sentences = [
    "Your set of sentences",
    "Model will automatically add the noise",
    "And re-construct it",
    "You should provide at least 1k sentences",
]

train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
train_loss = losses.DenoisingAutoEncoderLoss(
    model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    weight_decay=0,
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
)

  from tqdm.autonotebook import tqdm, trange
When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', '

Step,Training Loss


In [2]:
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "Why is it raining?",
]
embeddings = model.encode(sentences)
print(embeddings.shape)
model.similarity(embeddings, embeddings)

(3, 768)


tensor([[1.0000, 0.9208, 0.8642],
        [0.9208, 1.0000, 0.9015],
        [0.8642, 0.9015, 1.0000]])

### Load NYUAD sentences

In [3]:
!pip3 install -Uqq conllu tabulate
import conllu
from collections import defaultdict
import pandas as pd
with open('dev.conllu', 'r') as file:
    corpus = conllu.parse(file.read())
dbg_sent = corpus[2]
print(dbg_sent.serialize())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# sent_id = 20000715_AFP_ARB.0001:3
# text = وبدا ستيفن كنت نحيلا جدا ، الا انه اغتسل وحلق ذقنه للمرة الاولى منذ فترة لا بد ان تكون طويلة . وبما ان المناسبة تستحق العناء اشترى سروالا ازرق وحذاء جديدين ومعطفا خفيفا ليبدا بها حياته الجديدة .
# text_bw = wbdA styfn knt nHylA jdA , AlA Anh Agtsl wHlq *qnh llmrp AlAwlY mn* ftrp lA bd An tkwn Twylp . wbmA An AlmnAsbp tstHq AlEnA' A$trY srwAlA Azrq wH*A' jdydyn wmETfA xfyfA lybdA bhA HyAth Aljdydp .
1-2	وبدا	_	_	_	_	_	_	_	_
1	و	w	CCONJ	CONJ	_	2	cc	_	bw=wa
2	بدا	badA-u_1	VERB	PV+PVSUFF_SUBJ:3MS	Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Person=3|Voice=Act	0	root	_	bw=badA
3	ستيفن	stiyfin_1	PROPN	NOUN_PROP	Definite=Ind|Gender=Masc|Number=Sing	2	nsubj	_	bw=stiyfin
4	كنت	kinot_1	PROPN	NOUN_PROP	Definite=Ind|Gender=Masc|Number=Sing	3	flat	_	bw=kinot
5	نحيلا	naHiyl_1	ADJ	ADJ+CASE_INDEF_ACC	Case=Acc|Definite=Ind|Gender=Masc|Number=Sing	2	amod	_	bw=naHiylAF
6	جدا	jid~_1	NOUN	NOUN+CASE_INDEF_ACC	Case=Acc|Definite=Ind|Gender=Masc|Number=Sing	5	nmod	_

In [11]:
def query_subtree(node, callback_fn=None, depth=0, parent=None):
    total_size = 1
    total_ids = set([node.token['id']])
    for ch in node.children:
        size, ids = query_subtree(ch, callback_fn, depth+1, node)
        total_size += size
        total_ids = total_ids.union(ids)
    is_conseq = total_size == len(total_ids) and total_size == (max(total_ids) - min(total_ids) + 1)
    if is_conseq and callback_fn is not None:
        callback_fn(total_ids)
    return total_size, total_ids

def sent_text(sent, ids):
    ans = []
    watermark = -1
    for t in sent:
        id = t['id']
        if type(id) is int:
            if id not in ids or id <= watermark:
                continue
            ans.append(t['form'])
        else:
            id0, _, id1 = id
            if id0 not in ids and id1 not in ids:
                continue
            conc = []
            for lil_t in sent.filter(id=lambda x: type(x) is int and x >= id0 and x <= id1 and x in ids):
                conc.append(lil_t['form'])
                watermark = lil_t['id']
            ans.append(''.join(conc))
    return ' '.join(ans)

def extract_sentences_of_len(sent, len_range, output):
    def extract(ids):
        if len(ids) < len_range[0] or len(ids) >= len_range[1]:
            return
        res = sent.filter(id=lambda x: x in ids)
        res.metadata['text'] = sent_text(sent, ids)
        output.append(res)
    return extract

def sent_to_pos(sent):
    return ' '.join([t['upos'] for t in sent])

DBG_SENT_COUNT = 9
DBG_SENT_LEN = (4, 5)

dbg_output = []
_, _ = query_subtree(
    dbg_sent.to_tree(),
    extract_sentences_of_len(dbg_sent, DBG_SENT_LEN, dbg_output))
[(s.metadata['text'], sent_to_pos(s)) for s in dbg_output]

[]

In [12]:
short_sentences = []
for sent in corpus:
    query_subtree(
        sent.to_tree(),
        extract_sentences_of_len(sent, DBG_SENT_LEN, short_sentences))
print(len(short_sentences))
[(s.metadata['text'], sent_to_pos(s)) for s in short_sentences[:DBG_SENT_COUNT]]

2942


[('( الولايات المتحدة )', 'PUNCT NOUN ADJ PUNCT'),
 ('( اف ب )', 'PUNCT PROPN PROPN PUNCT'),
 ('ورث 300 الف دولار', 'VERB NUM NUM NOUN'),
 ('( 45 عاما )', 'PUNCT NUM NOUN PUNCT'),
 ('( شمال شرق )', 'PUNCT NOUN NOUN PUNCT'),
 ('تجوب كل الولايات الاميركية', 'VERB NOUN NOUN ADJ'),
 ('وهو يصعد الباص', 'CCONJ PRON VERB NOUN'),
 ('زجاجات النبيذ والبيرة', 'NOUN NOUN CCONJ NOUN'),
 ('في كونتية لوس انجليس', 'ADP NOUN PROPN PROPN')]

In [13]:
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

model_name = "bert-base-uncased"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_dataset = datasets.DenoisingAutoEncoderDataset([sent_to_pos(s) for s in short_sentences])
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
train_loss = losses.DenoisingAutoEncoderLoss(
    model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    weight_decay=0,
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
)

When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.de

Step,Training Loss


In [14]:
short_sentences_embedded = model.encode(list(map(sent_to_pos, short_sentences)))
print(short_sentences_embedded.shape)

print('\n'.join([s.metadata['text'] for s in short_sentences[:DBG_SENT_COUNT]]))
model.similarity(embeddings[:DBG_SENT_COUNT,:], embeddings[:DBG_SENT_COUNT,:])

(2942, 768)
( الولايات المتحدة )
( اف ب )
ورث 300 الف دولار
( 45 عاما )
( شمال شرق )
تجوب كل الولايات الاميركية
وهو يصعد الباص
زجاجات النبيذ والبيرة
في كونتية لوس انجليس


tensor([[1.0000, 0.9208, 0.8642],
        [0.9208, 1.0000, 0.9015],
        [0.8642, 0.9015, 1.0000]])

### Cluster Embeddings

In [15]:
from sklearn.cluster import HDBSCAN

hdb = HDBSCAN()
hdb.fit(short_sentences_embedded)

In [16]:
from collections import defaultdict
clusters = defaultdict(lambda: [])
for sent, prob, label in zip(short_sentences, hdb.probabilities_, hdb.labels_):
    clusters[label].append((sent, prob))
clusters = {k: v for _, k, v in sorted([(len(v), k, v) for k, v in clusters.items()])}

with open('tsdae.csv', 'w') as f:
    f.write(f"CLUSTER_COUNT={len(clusters)}\n")
    for k, v in clusters.items():
        f.write(f"\nCluster {k:03d}:\n")
        for s, p in v:
            f.write(s.metadata['text'])
            f.write('\n')