# Synthesize

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_path = "../../results/checkpoint-2000"
translator = pipeline(
    "translation_ar_to_en",
    model=AutoModelForSeq2SeqLM.from_pretrained(model_path),
    tokenizer=AutoTokenizer.from_pretrained(model_path))

translator(['يقول ذلك جمال عبد الناصر'] * 3)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'translation_text': 'Rr0 Do1 Ps1 Pf2 Pf2'},
 {'translation_text': 'Rr0 Do1 Ps1 Pf2 Pf2'},
 {'translation_text': 'Rr0 Do1 Ps1 Pf2 Pf2'}]

In [2]:
corpus = []
with open('../ghani.txt', 'r', encoding='utf-8') as feat:
    for i, line in enumerate(feat):
        line = line.strip()[1:-1]
        corpus.append(line)
corpus[910:915]

['أعْلَنَتِ الشَّرِكَةُ عَنْ بِضَاعَتِهَا بِوَاسِطَةِ إِعْلاَناتٍ تِجَارِيَّةٍ',
 'أُنَاشِدُكُمْ أَنْ نَعْمَلَ كَأُمَّةٍ جُمِعَتْ كَلِمَتُهَا وَوُحِّدَتْ غَايَتُهَا',
 'إِنَّ اللَّهَ لاَ يُغَيِّرُ مَا بِقَوْمٍ حَتَّى يُغَيِّرُوا مَا بِأَنْفُسِهِمْ',
 'إِنَّ النّبِيَّ تَوَضَّأَ فَضَاقَ عَنْ يَدَيْهِ كُمَّا جِمَازَةٍ كَانَتْ عَلَيْه',
 'إِنْ كَانَ لَسِناً سُمِّيَ مِهْذَاراً وإِنْ كَانَ صَمُوتاً سُمِّيَ عَيِيّاً']

In [3]:
corpus_txfm = {corpus[i]: v['translation_text'] for i, v in enumerate(translator(corpus))}
list(corpus_txfm.items())[910:915]

[('أعْلَنَتِ الشَّرِكَةُ عَنْ بِضَاعَتِهَا بِوَاسِطَةِ إِعْلاَناتٍ تِجَارِيَّةٍ',
  "Rr0 No1 Lq3! No2 O'3 Lq2! Ni1 N'2 Ja3"),
 ('أُنَاشِدُكُمْ أَنْ نَعْمَلَ كَأُمَّةٍ جُمِعَتْ كَلِمَتُهَا وَوُحِّدَتْ غَايَتُهَا',
  "Rr0 Oo1 Sy2! Rx1 Lq3! No2 N'3 Rc4 No5 O'6 Ce6! Rc5 No6 O'7"),
 ('إِنَّ اللَّهَ لاَ يُغَيِّرُ مَا بِقَوْمٍ حَتَّى يُغَيِّرُوا مَا بِأَنْفُسِهِمْ',
  "Xw1! Nr0 Vv2! Rx1 Oo2 Lq4! No3 Lq3! Rc2 Oo3 Lq5! No4 O'5"),
 ('إِنَّ النّبِيَّ تَوَضَّأَ فَضَاقَ عَنْ يَدَيْهِ كُمَّا جِمَازَةٍ كَانَتْ عَلَيْه',
  'Xw1! Nr0 Rc1 Rx1 Lq3! Oo2 Rx3 Oo4 Lq5! Oi4 Nm5 Xw6! Lq6! Oo5'),
 ('إِنْ كَانَ لَسِناً سُمِّيَ مِهْذَاراً وإِنْ كَانَ صَمُوتاً سُمِّيَ عَيِيّاً',
  'Sy1! Xw1! Nr0 Rc1 No2 Ce2! Sy2! Xw2! No1 Rx2 Nm3')]

In [4]:
import csv
with open(f'final_corpus_mine.csv', 'w', encoding='utf-8') as feat:
    writer = csv.writer(feat, quoting=csv.QUOTE_NONNUMERIC)
    for sent, txfm in corpus_txfm.items():
        writer.writerow([sent, txfm])

# TSDAE

In [5]:
!pip install -Uqq sentence_transformers

from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader
import random

train_sentences = list(corpus_txfm.values())
train_sentences = random.sample(train_sentences, 1000)
print('\n'.join(train_sentences[:3]))

model_name = "bert-base-uncased"
# model_name = "google-bert/bert-base-multilingual-cased"
# model_name = "distilbert/distilroberta-base"
# model_name = "CAMeL-Lab/bert-base-arabic-camelbert-ca"
# model_name = "CAMeL-Lab/bert-base-arabic-camelbert-msa-sixteenth"

word_embedding_model = models.Transformer(model_name)
embedding_len = word_embedding_model.get_word_embedding_dimension()
print(embedding_len)

pooling_model = models.Pooling(embedding_len, "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
train_loss = losses.DenoisingAutoEncoderLoss(
    model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    weight_decay=0,
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
    show_progress_bar=True,
    # output_path=f'ar_nyuad-ud/final_{SYNTH_METHOD}',
    save_best_model=True,
)
# 1m30s

Rr0 Ns1
Nr0 N'1
Rr0 Lq2! No1 Lq3! No2
Rr0 Ps1 Pf2 Lq2! No1 ?'2
Rr0 Lq2! No1 Nm1
Nr0 Ja1
Rr0 Rx1 Lq3! Oo2 Lq3! Ni2
Rr0 No1
Nr0 Lq2! Po1 Lq2! Oi1
Nr0 Ja1
768


When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.de

Step,Training Loss


In [6]:
embeddings = model.encode(list(corpus_txfm.values())[:10_000])
print(embeddings.shape)
print('\n'.join(list(corpus_txfm.keys())[:10]))
model.similarity(embeddings[:10,:], embeddings[:10,:])

(10000, 768)
لِكُلِّ إِنْسانٍ الحَقُّ في الوُجودِ والحُرِّيَّةِ والأمْنِ، والحَقُّ في حُرِّيَّةِ الرَّأْيِ والتَّعْبيرِ بِدونِ تَمْيِيزٍ بِسَبَبِ العُنْصُرِ أَوِ اللَّوْنِ أَوِ الجِنْسِ أَوِ الثَّقافَةِ أَوِ الدِّينِ أَوِ الرَّأْيِ
طَائِرٌ مِنْ فَصيلَةِالوَزِّيَّاتِ، مِنْ رُتْبةِ الكَفِّيَّاتِ، لَهُ مِنْقارٌ عَريضٌ مُلَوَّنٌ، قَصيرُ العُنُقِ والرِّجْلَيْنِ، طَويلُ الأجْنِحَةِ، يَخْتَلِفُ عَنِ الإِوَزِّ، وَإِنْ كانَ يُشْبِهُهُ وَهُوَ طائِرٌ مائِيٌّ لَهُ قُدْرَةٌ على الطَّيَران
يَبْلُغُ عَدَدُ حُروفِ هِجاءِ اللُّغَةِ العَرَبِيَّةِ ثَمانِيَةً وَعِشْرينَ حَرْفاً هي: ا ب ت ث ج ح خ د ذ ر ز س ش ص ض ط ظ ع غ ف ق ك ل م ن هـ و ي. وَهُوَ ما يُسَمَّى بِحُروفِ الْمَبانِ
اِنْقَشَعَ السّحابُ» لا مَحَلَّ لها مِنَ الإِعْرابِ لأَنَّها جُمْلَةٌ اِبْتِدائِيَّةٌ، والجُمَلُ التي لها مَحَلٌّ مِنَ الإِعْرابِ، هي التي تَحَلُّ مَحَلَّ مُفْرَدٍ، أي ما لَيْس جُمْلَةً ولا شِبْهَ جُمْلَة
جَمَاعَةٌ مِنَ النَّاسِ تَجْمَعُهُمْ رَوَابِطُ تَارِيخِيَّةٌ مُشْتَرَكَةٌ، قَدْ يَكُونُ فِيهَا مَا هُوَ لُغَوِيٌّ أوْ دِينِيٌّ أوِ

tensor([[1.0000, 0.9422, 0.9248, 0.9445, 0.9305, 0.9709, 0.9314, 0.9592, 0.9727,
         0.9328],
        [0.9422, 1.0000, 0.9316, 0.9654, 0.9668, 0.9529, 0.9735, 0.9668, 0.9835,
         0.9805],
        [0.9248, 0.9316, 1.0000, 0.9453, 0.9062, 0.9276, 0.8865, 0.9322, 0.9332,
         0.9078],
        [0.9445, 0.9654, 0.9453, 1.0000, 0.9601, 0.9428, 0.9244, 0.9472, 0.9611,
         0.9571],
        [0.9305, 0.9668, 0.9062, 0.9601, 1.0000, 0.9481, 0.9578, 0.9613, 0.9589,
         0.9616],
        [0.9709, 0.9529, 0.9276, 0.9428, 0.9481, 1.0000, 0.9457, 0.9690, 0.9774,
         0.9321],
        [0.9314, 0.9735, 0.8865, 0.9244, 0.9578, 0.9457, 1.0000, 0.9638, 0.9731,
         0.9708],
        [0.9592, 0.9668, 0.9322, 0.9472, 0.9613, 0.9690, 0.9638, 1.0000, 0.9760,
         0.9535],
        [0.9727, 0.9835, 0.9332, 0.9611, 0.9589, 0.9774, 0.9731, 0.9760, 1.0000,
         0.9678],
        [0.9328, 0.9805, 0.9078, 0.9571, 0.9616, 0.9321, 0.9708, 0.9535, 0.9678,
         1.0000]])

In [7]:
from sklearn.cluster import AffinityPropagation
affinity = AffinityPropagation(random_state=5).fit(embeddings)



In [8]:
from sklearn.cluster import Birch
birch = Birch(n_clusters=None).fit(embeddings)

In [9]:
from sklearn.cluster import HDBSCAN
hdb = HDBSCAN(min_cluster_size=3, max_cluster_size=50, n_jobs=8).fit(embeddings)

In [10]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=int(len(embeddings)**0.5) * 8).fit(embeddings)

In [11]:
from sklearn.cluster import MeanShift
meanshift = MeanShift(bandwidth=2).fit(embeddings)

In [12]:
from sklearn.cluster import OPTICS
optics = OPTICS(min_samples=3).fit(embeddings)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


In [13]:
from collections import defaultdict

def save_clustering(algorithm, labels):
    clusters = defaultdict(lambda: [])
    for sent, label in zip(corpus_txfm.keys(), labels):
        clusters[label].append(sent)
    
    if -1 not in clusters:
        clusters[-1] = []
    for ss in clusters.values():
        if len(ss) == 1:
            clusters[-1].extend(ss)

    clusters = {k: v for k, v in clusters.items() if len(v) > 1 or k == -1}
    clusters = {k: v for _, v, k in sorted([(len(v), sorted(v), k) for k, v in clusters.items()])}
    n_unclustered = len(clusters[-1])
    
    path = f'final_cluster_mine_{algorithm}.txt'
    print(path)
    with open(path, 'w') as file:
        file.write(f"CLUSTER_COUNT = {len(clusters)}\n")
        file.write(f"UNCLUSTERED = {n_unclustered:,} / {sum(map(len, clusters.values())):,}\n")
        for label, sents in clusters.items():
            file.write(f"\nCLUSTER {label} ({len(sents):,} sentences)\n")
            for sent in sents:
                if label == -1:
                    file.write(f" xx {sent}\n")
                else:
                    file.write(f"    {sent}\n")
    return clusters


save_clustering("affinity", affinity.labels_)
save_clustering("birch", birch.predict(embeddings))
save_clustering("hdbscan", hdb.labels_)
save_clustering("kmeans", kmeans.labels_)
save_clustering("meanshift", meanshift.labels_)
save_clustering("optics", optics.labels_)

final_cluster_mine_affinity.txt
final_cluster_mine_birch.txt
final_cluster_mine_hdbscan.txt
final_cluster_mine_kmeans.txt
final_cluster_mine_meanshift.txt
final_cluster_mine_optics.txt


{136: ['آثارُ الحُزْنِ بادِيَةٌ على وُجوهِهِمْ',
  'حَيَاتُهُ مُكَرَّسَةٌ لأَعْمَالِ الخَيْر',
  'دَلائِلُ الاسْتِغْرابِ بادِيَةٌ على وُجوهِهِمْ'],
 577: ['أبِيَ اللَّحْمَ البَائِتَ أَوْ مِنْهُ',
  'بَوَّأهُ الْمَكَانَةَ اللاَّئِقَةَ بِهِ',
  'عَقِيدَتُهُ عَقِيدَةٌ رَاسِخَةٌ لاَ يَحِيدُ عَنْهَا'],
 679: ['أبْدَعَتِ الطَّبِيعَةُ مَنَاظِرَ خَلابَةً سُبْحَانَ خَالِقِهَا',
  'تَتَكَوَّنُ مَوْجَاتُ عَجَاجٍ تَقْذِفُهَا الرِّيحُ',
  'تَوَاصَفَ السَّائِحُونَ آثَارَ الْمُدُنِ الَّتِي زَارُوها'],
 508: ['أتْلَفَتِ النَّارُ كُلَّ أثَاثِ البَيْتِ',
  'تَزَاحَفَ الجُنْدُ صَوْبَ سَاحَةِ الْمَعْرَكَةِ',
  'حَضَرَ الاجْتِمَاعَ كُلُّ أَعْضَاءِ الْجَمْعِيَّةِ'],
 304: ['أتْمَمْتُ السَّبْعِينَ الأولَى مِنْ عُمْرِي',
  'اِنْتَهَتِ الجَوْلَةُ الأُولَى بِفَوْزِهِ',
  'نَشَأَتْ مُمَانَعَةٌ حَقِيقِيَّةٌ بَيْنَهُمَا'],
 574: ['أثْبَتَ الوَقَاِئعَ كَمَا شَاهَدَهَا',
  'أعْتَقِدُ أنِّي أحْسَنْتُ الاخْتِيَارَ',
  'اِعْتَرَكَتِ الجَماعَةُ فيما بَيْنَها'],
 180: ['أجْرَى التَّاجِرُ حَسْماً لِمُشْترَيَاتِ زَبُونِهِ'