In [1]:
from importlib.metadata import PackageNotFoundError, version

# Packages required by the code in this notebook
pkgs = ['datasets', 'sentence_transformers', 'nltk']

# Print the installed version of every dependency. A missing package is
# reported inline instead of aborting the whole setup cell with a traceback,
# so the environment variables configured below are still applied.
for pkg in pkgs:
    try:
        print(f"{pkg}:", version(pkg))
    except PackageNotFoundError:
        print(f"{pkg}:", "not installed")


import os

# Hugging Face configuration, applied in this first cell before the
# Hugging Face libraries are imported further down the notebook:
#   HF_ENDPOINT        - hub mirror to download from
#   TRANSFORMERS_CACHE - cache directory (NOTE(review): recent transformers
#                        releases reportedly deprecate this in favour of
#                        HF_HOME - confirm before removing)
#   HF_HOME            - Hugging Face home/cache directory
_HF_CACHE_DIR = "/root/autodl-tmp/LLMs/.cache/huggingface"
os.environ.update({
    'HF_ENDPOINT': "https://hf-mirror.com",
    'TRANSFORMERS_CACHE': _HF_CACHE_DIR,
    'HF_HOME': _HF_CACHE_DIR,
})

import subprocess

# Source /etc/network_turbo in a bash subshell and list the resulting
# environment entries whose line contains "proxy" (presumably the hosting
# platform's network-acceleration script - confirm). If the script is missing
# the command simply produces no output and nothing is changed.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Mirror every NAME=value line into this process's environment; only the
# first '=' separates the name from the value.
os.environ.update(
    dict(line.split('=', 1) for line in output.splitlines() if '=' in line)
)

datasets: 4.0.0
sentence_transformers: 5.1.0
nltk: 3.9.1


# Transformer-Based Sequential Denoising Auto-Encoder (TSDAE)

In [5]:
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Make sure the additional NLTK tokenizer data (punkt_tab) is available
import nltk

# Only hit the network when the resource is not already installed locally.
# If the download fails (e.g. with an SSL certificate error), download
# https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip
# manually and unzip it into the `tokenizers/` sub-directory of one of the
# NLTK data directories (see `nltk.data.path`).
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

# Create a flat list of sentences from the first 25,000 MNLI training pairs
mnli = load_dataset('glue', 'mnli', split='train').select(range(25000))
flat_sentences = list(mnli['premise']) + list(mnli['hypothesis'])

# De-duplicate while keeping first-seen order. `set()` would also remove the
# duplicates, but its iteration order for strings changes between Python
# processes (hash randomization), making the dataset order irreproducible.
unique_sentences = list(dict.fromkeys(flat_sentences))

# Add noise to the input data: each item exposes `texts`, read below as
# texts[0] = damaged sentence and texts[1] = original sentence.
damaged_data = DenoisingAutoEncoderDataset(unique_sentences)

# Materialize the (damaged, original) pairs once, with a progress bar
sentence_pairs = [(data.texts[0], data.texts[1]) for data in tqdm(damaged_data)]

# Build the column-oriented training dataset
train_dataset = Dataset.from_dict({
    'damaged_sentence': [damaged for damaged, _ in sentence_pairs],
    'original_sentence': [original for _, original in sentence_pairs],
})

# Embedding-similarity evaluator built from the STS-B validation split
val_sts = load_dataset('glue', 'stsb', split='validation')

# Labels are divided by 5 before being handed to the evaluator
# (presumably rescaling the 0-5 STS-B scores to 0-1 - confirm).
normalized_scores = [label / 5 for label in val_sts['label']]

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=normalized_scores,
    main_similarity="cosine",
)

# Assemble the embedding model from two modules:
#   1. a 'bert-base-uncased' transformer producing word embeddings
#   2. a pooling layer configured with 'cls', sized to the transformer's
#      word-embedding dimension
word_embedding_model = models.Transformer('bert-base-uncased')
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, 'cls')
embedding_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)

# Use the denoising auto-encoder loss, with the tie_encoder_decoder option enabled
train_loss = losses.DenoisingAutoEncoderLoss(
    embedding_model, tie_encoder_decoder=True
)

# Move the decoder onto whatever device the embedding model lives on rather
# than a hard-coded 'cuda', so both halves always end up on the same device
# (and the cell does not crash on a machine without a CUDA GPU).
train_loss.decoder = train_loss.decoder.to(embedding_model.device)

# Training hyper-parameters, collected in one place so they are easy to tweak.
# NOTE(review): no evaluation strategy is configured here, so `eval_steps`
# presumably has no effect unless one is set - confirm.
training_config = {
    'output_dir': 'tsdae_embedding_model',
    'num_train_epochs': 1,
    'per_device_train_batch_size': 32,
    'per_device_eval_batch_size': 32,
    'warmup_steps': 100,
    'fp16': True,
    'eval_steps': 100,
    'logging_steps': 100,
}
args = SentenceTransformerTrainingArguments(**training_config)

# Train the model
import os
import re

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

# Resume only when a previous run left a `checkpoint-<step>` directory in the
# output directory. Passing `resume_from_checkpoint=True` unconditionally
# fails on a fresh run because there is no checkpoint to load.
# Note: if the latest checkpoint already belongs to a finished run, resuming
# performs no further training steps; delete the output directory to retrain
# from scratch.
has_checkpoint = os.path.isdir(args.output_dir) and any(
    re.fullmatch(r'checkpoint-\d+', entry)
    and os.path.isdir(os.path.join(args.output_dir, entry))
    for entry in os.listdir(args.output_dir)
)

trainer.train(resume_from_checkpoint=has_checkpoint)


[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self-signed certificate in certificate chain
[nltk_data]     (_ssl.c:1000)>
100%|██████████| 48353/48353 [00:06<00:00, 7513.89it/s]
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.cr

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


TrainOutput(global_step=1512, training_loss=0.0, metrics={'train_runtime': 0.0093, 'train_samples_per_second': 5171937.401, 'train_steps_per_second': 161726.663, 'total_flos': 0.0, 'train_loss': 0.0, 'epoch': 1.0})

In [6]:
# Score the embedding model with the STS-B validation evaluator; the call
# returns a dict with 'pearson_cosine' and 'spearman_cosine' entries.
evaluator(embedding_model)

{'pearson_cosine': 0.7356149412334347, 'spearman_cosine': 0.743302479153145}

In [3]:
import gc
import torch

# Drop this notebook's reference to the model. Guarded so the cell can be
# re-run (or run out of order) without raising NameError.
# NOTE(review): `trainer` and `train_loss` were constructed with this model
# and may still keep it alive - delete them as well if memory is not released.
try:
    del embedding_model
except NameError:
    pass

# Flush memory: run the garbage collector, then release cached CUDA memory
gc.collect()
torch.cuda.empty_cache()