In [1]:
# Mount Google Drive at /content/drive/ so the project folder used by the next cell is reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Work from the project folder: the relative 'models/' and 'logs/' paths used below resolve against it.
%cd /content/drive/MyDrive/ReverseEmbedding/GEIA

/content/drive/MyDrive/ReverseEmbedding/GEIA


### Code setup

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.spatial.distance import cosine

import json
import numpy as np
import pandas as pd
import argparse
import sys

from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from attacker_models import SequenceCrossEntropyLoss
from sentence_transformers import SentenceTransformer
from simcse_persona import get_persona_dict
from attacker_evaluation_gpt import eval_on_batch
from datasets import load_dataset
from data_process import get_sent_list

class linear_projection(nn.Module):
    """Single linear layer mapping sentence embeddings into the attacker's token-embedding space.

    Args:
        in_num: Size of the incoming embedding (last dimension of the input).
        out_num: Size of the produced representation.
    """

    def __init__(self, in_num, out_num=1024):
        super(linear_projection, self).__init__()
        self.fc1 = nn.Linear(in_num, out_num)

    def forward(self, x, use_final_hidden_only = True):
        """Project ``x`` along its last dimension.

        Args:
            x: Tensor whose last dimension equals ``in_num``; any number of
                leading dimensions is accepted.
            use_final_hidden_only: Unused; kept so existing callers keep working.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``out_num``.

        Raises:
            ValueError: If the last dimension of ``x`` is not ``in_num``.
        """
        # Validate against the layer's real input width. Comparing x.size()[1]
        # with x.size()[-1] is a tautology for 2-D input and rejects valid
        # (batch, seq, dim) input.
        expected = self.fc1.in_features
        if x.size(-1) != expected:
            raise ValueError(
                f"linear_projection expected last dimension {expected}, got {x.size(-1)}"
            )
        return self.fc1(x)


class personachat(Dataset):
    """Thin ``Dataset`` wrapper around an indexable collection of samples."""

    def __init__(self, data):
        # Keep a reference to the caller's collection; nothing is copied.
        self.data = data

    def __len__(self):
        """Number of samples in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at ``index`` unchanged."""
        return self.data[index]

    def collate(self, unpacked_data):
        """Identity collate function: hand the batch back exactly as received."""
        return unpacked_data

def process_data(data,batch_size,device,config,need_porj=True):
    """Train the attacker language model to reconstruct sentences from their embeddings.

    Each batch of sentences is encoded with the sentence-embedding model named
    by ``config['embed_model_path']``, optionally passed through a trainable
    ``linear_projection`` (768 -> 1280), and handed to ``train_on_batch``
    together with the original text. After every epoch the projection, the
    attacker and the optimizer/scheduler state are written under ``models/``.

    Args:
        data: Indexable collection of training sentences.
        batch_size: Batch size used by the ``DataLoader``.
        device: Device that hosts the attacker and the projection.
        config: Dict read for ``embed_model_path``, ``model_dir``, ``dataset``,
            ``embed_model``, ``num_epochs`` and the optional ``resume`` flag.
        need_porj: Whether to train a projection in front of the attacker.
    """
    #model = SentenceTransformer('all-roberta-large-v1',device=device)   # dim 1024
    # The embedding model is pinned to cuda:0 independently of ``device``.
    device_1 = torch.device("cuda:0")
    model = SentenceTransformer(config['embed_model_path'],device=device_1)   # dim 768
    dataset = personachat(data)
    dataloader = DataLoader(dataset=dataset,
                              shuffle=True,
                              batch_size=batch_size,
                              collate_fn=dataset.collate)

    print('load data done')

    resume = config.get('resume', False)
    proj_path = f"models/projection_diablogpt_large_{config['dataset']}_{config['embed_model']}"
    attacker_path = f"models/attacker_diablogpt_large_{config['dataset']}_{config['embed_model']}"
    checkpoint_path = "models/checkpoint_epoch.pt"

    ### extra projection
    if need_porj:
        projection = linear_projection(in_num=768, out_num=1280).to(device)
        if resume and os.path.exists(proj_path):
            # map_location lets a checkpoint saved on another device be resumed here.
            projection.load_state_dict(torch.load(proj_path, map_location=device))
            print(f"✅ Loaded projection from {proj_path}")

    ### for attackers
    # The default attacker model is DialoGPT-large. When resuming, load the
    # saved attacker directly instead of loading the base model and discarding it.
    if resume and os.path.exists(attacker_path):
        model_attacker = AutoModelForCausalLM.from_pretrained(attacker_path)
        print(f"✅ Loaded attacker model from {attacker_path}")
    else:
        model_attacker = AutoModelForCausalLM.from_pretrained(config['model_dir'])
    tokenizer_attacker = AutoTokenizer.from_pretrained(config['model_dir'])
    criterion = SequenceCrossEntropyLoss()
    model_attacker.to(device)
    # NOTE(review): from_pretrained() hands the model back in eval mode and
    # .train() is never called here, so dropout stays off during training —
    # confirm that is intended.
    param_optimizer = list(model_attacker.named_parameters())

    # Exclude bias and layernorm parameters from weight decay.
    no_decay = ['bias', 'ln', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # How many batches to accumulate before performing an optimizer step.
    num_gradients_accumulation = 1
    num_epochs = config['num_epochs']

    # Each batch counts as one step: total steps = num_epochs * number of batches.
    num_train_optimization_steps  = len(dataloader) * num_epochs // num_gradients_accumulation

    optimizer = AdamW(optimizer_grouped_parameters,
                  lr=3e-5,
                  eps=1e-06)
    if need_porj:
        # Added last: the saved optimizer state relies on this group order.
        optimizer.add_param_group({'params': projection.parameters()})

    # Linear schedule with warmup: the learning rate rises linearly from 0 to
    # the optimizer's lr over the first 100 steps, then decays linearly to 0.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=num_train_optimization_steps)

    start_epoch = 0

    if resume and os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location=device)
        optimizer.load_state_dict(checkpoint['optimizer_state'])
        scheduler.load_state_dict(checkpoint['scheduler_state'])
        start_epoch = checkpoint['epoch'] + 1
        print("✅ Loaded optimizer and scheduler state")

    ### process to obtain the embeddings
    print("Starting from epoch", start_epoch)

    # The per-epoch saves below need the target directory to exist.
    os.makedirs('models', exist_ok=True)

    for i in range(start_epoch, num_epochs):
        epoch_bar = tqdm(enumerate(dataloader),
                      total=len(dataloader),
                      desc=f"Epoch {i+1}/{num_epochs}",
                      unit="batch")
        model.eval()
        for idx,batch_text in epoch_bar:
            # The embedding model is frozen: no gradients flow into it.
            with torch.no_grad():
                embeddings = model.encode(batch_text,convert_to_tensor = True).to(device)

            ### attacker part, needs training
            if need_porj:
               embeddings = embeddings.clone().detach()
               embeddings = projection(embeddings)

            record_loss, perplexity = train_on_batch(batch_X=embeddings,batch_D=batch_text,model=model_attacker,tokenizer=tokenizer_attacker,criterion=criterion,device=device,train=True)
            optimizer.step()
            scheduler.step()
            # make sure no grad for GPT optimizer
            optimizer.zero_grad()
            # tqdm.write keeps the progress bar intact, unlike a bare print().
            tqdm.write(f'Training: epoch {i} batch {idx} with loss: {record_loss} and PPL {perplexity} with size {embeddings.size()}')

        # Save after each epoch so an interrupted run can be resumed.
        if need_porj:
            torch.save(projection.state_dict(), proj_path)
        save_path = attacker_path
        print(save_path)
        model_attacker.save_pretrained(save_path)
        torch.save({
            'epoch': i,
            'optimizer_state': optimizer.state_dict(),
            'scheduler_state': scheduler.state_dict(),
        }, checkpoint_path)


### used for testing only
def process_data_test(data,batch_size,device,config,need_proj=True):
    """Load the embedding model, projection and attacker for evaluation.

    The evaluation loop itself is currently commented out at the end of the
    function, so calling it only loads the models, stores the attacker and its
    tokenizer in ``config['model']`` / ``config['tokenizer']`` and returns
    ``None``.

    Args:
        data: Indexable collection of test sentences.
        batch_size: Batch size used by the ``DataLoader``.
        device: Device that hosts the attacker and the projection.
        config: Dict read for ``embed_model_path``, ``decode``, ``dataset`` and
            ``embed_model``; ``model`` and ``tokenizer`` are written into it.
        need_proj: Whether to load the trained projection.
    """
    def _first_existing(*candidates):
        # Return the first path that exists; fall back to the first candidate
        # so the loader reports the name training is expected to produce.
        return next((p for p in candidates if os.path.exists(p)), candidates[0])

    #model = SentenceTransformer('all-roberta-large-v1',device=device)   # dim 1024
    #model = SentenceTransformer(config['embed_model_path'],device=device)   #  dim 768
    device_1 = torch.device("cuda:0")
    print(device_1)

    # This loads the embedding model.
    model = SentenceTransformer(config['embed_model_path'],device=device_1)   # dim 768
    model_tag = config['dataset'] + '_' + config['embed_model']
    if(config['decode'] == 'beam'):
        save_path = 'models/' + 'attacker_gpt2_large_' + model_tag + '_beam' + '.log'
    else:
        save_path = 'models/' + 'attacker_gpt2_large_' + model_tag + '.log'
    dataset = personachat(data)
    # no shuffle for testing data
    dataloader = DataLoader(dataset=dataset,
                              shuffle=False,
                              batch_size=batch_size,
                              collate_fn=dataset.collate)

    print('load data done')

    # Training writes '*_diablogpt_large_*' checkpoints; the older
    # '*_gpt2_large_*' names are still accepted as a fallback.
    if need_proj:
        proj_path = _first_existing('models/' + 'projection_diablogpt_large_' + model_tag,
                                    'models/' + 'projection_gpt2_large_' + model_tag)
        projection = linear_projection(in_num=768, out_num=1280)
        projection.load_state_dict(torch.load(proj_path, map_location=device))
        projection.to(device)
        print('load projection done')
    else:
        print('no projection loaded')
    # setup on config for sentence generation   AutoModelForCausalLM
    attacker_path = _first_existing('models/' + 'attacker_diablogpt_large_' + model_tag,
                                    'models/' + 'attacker_gpt2_large_' + model_tag)
    config['model'] = AutoModelForCausalLM.from_pretrained(attacker_path).to(device)
    config['tokenizer'] = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')

    # sent_dict = {}
    # sent_dict['gt'] = []
    # sent_dict['pred'] = []
    # with torch.no_grad():
    #     for idx,batch_text in enumerate(dataloader):

    #         embeddings = model.encode(batch_text,convert_to_tensor = True).to(device)

    #         if need_proj:
    #             embeddings = projection(embeddings)

    #         sent_list, gt_list = eval_on_batch(batch_X=embeddings,batch_D=batch_text,model=config['model'],tokenizer=config['tokenizer'],device=device,config=config)
    #         print(f'testing {idx} batch done with {idx*batch_size} samples')
    #         sent_dict['pred'].extend(sent_list)
    #         sent_dict['gt'].extend(gt_list)

    #     with open(save_path, 'w') as f:
    #         json.dump(sent_dict, f,indent=4)

    # return 0


### used for testing only
def process_data_test_simcse(data,batch_size,device,config,proj_dir=None,need_proj=False):
    """Evaluate the attacker on embeddings taken from a SimCSE encoder.

    Sentences are embedded with the ``pooler_output`` of the model at
    ``config['embed_model_path']``, optionally projected, and decoded with
    ``eval_on_batch``. Predictions and ground truth are dumped as JSON under
    ``logs/``.

    Args:
        data: Indexable collection of test sentences.
        batch_size: Batch size used by the ``DataLoader``.
        device: Device that hosts every model used here.
        config: Dict read for ``embed_model_path``, ``decode``, ``dataset`` and
            ``embed_model``; ``model`` and ``tokenizer`` are written into it.
        proj_dir: Path of the projection state dict; required when
            ``need_proj`` is true.
        need_proj: Whether to apply a projection before decoding.

    Returns:
        ``0`` once the log file has been written.

    Raises:
        ValueError: If ``need_proj`` is true but ``proj_dir`` is not given.
    """
    def _first_existing(*candidates):
        # Return the first path that exists; fall back to the first candidate
        # so the loader reports the name training is expected to produce.
        return next((p for p in candidates if os.path.exists(p)), candidates[0])

    tokenizer = AutoTokenizer.from_pretrained(config['embed_model_path'])  # dim 1024
    model = AutoModel.from_pretrained(config['embed_model_path']).to(device)
    model_tag = config['dataset'] + '_' + config['embed_model']
    #save_path = 'logs/attacker_gpt2_qnli_simcse_bert_large.log'
    if(config['decode'] == 'beam'):
        print('Using beam search decoding')
        save_path = 'logs/' + 'attacker_gpt2_large_' + model_tag + '_beam' + '.log'
    else:
        save_path = 'logs/' + 'attacker_gpt2_large_' + model_tag + '.log'
    dataset = personachat(data)
    # no shuffle for testing data
    dataloader = DataLoader(dataset=dataset,
                              shuffle=False,
                              batch_size=batch_size,
                              collate_fn=dataset.collate)

    print('load data done')
    if need_proj:
        if proj_dir is None:
            raise ValueError("need_proj=True requires proj_dir to point at a saved projection state dict")
        # NOTE(review): in_num=768 with the default out_num=1024 differs from the
        # 768 -> 1280 projection that process_data trains — confirm the sizes
        # before enabling need_proj.
        projection = linear_projection(in_num=768)
        projection.load_state_dict(torch.load(proj_dir, map_location=device))
        projection.to(device)
        print('load projection done')
    else:
        print('no projection loaded')
    # setup on config for sentence generation   AutoModelForCausalLM
    # Training writes 'attacker_diablogpt_large_*'; the older
    # 'attacker_gpt2_large_*' name is still accepted as a fallback.
    attacker_path = _first_existing('models/' + 'attacker_diablogpt_large_' + model_tag,
                                    'models/' + 'attacker_gpt2_large_' + model_tag)
    config['model'] = AutoModelForCausalLM.from_pretrained(attacker_path).to(device)
    config['tokenizer'] = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')

    sent_dict = {}
    sent_dict['gt'] = []
    sent_dict['pred'] = []
    with torch.no_grad():
        for idx,batch_text in enumerate(dataloader):
            inputs = tokenizer(batch_text, padding=True, truncation=True, return_tensors="pt").to(device)
            embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
            if need_proj:
                embeddings = projection(embeddings)
            sent_list, gt_list = eval_on_batch(batch_X=embeddings,batch_D=batch_text,model=config['model'],tokenizer=config['tokenizer'],device=device,config=config)
            print(f'testing {idx} batch done with {idx*batch_size} samples')
            sent_dict['pred'].extend(sent_list)
            sent_dict['gt'].extend(gt_list)
        # Make sure the log directory exists before writing the results.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'w') as f:
            json.dump(sent_dict, f,indent=4)

    return 0


def train_on_batch(batch_X,batch_D,model,tokenizer,criterion,device,train=True):
    """Run one attacker forward pass (and backward pass when training) on a batch.

    The sentence embedding is prepended as the first input position, followed
    by the token embeddings of the target text; the model is then scored on
    predicting every target token.

    Args:
        batch_X: Embeddings, one row per sentence, already in the attacker's
            embedding width.
        batch_D: List of target sentences matching ``batch_X`` row by row.
        model: Causal LM exposing ``transformer.wte`` and accepting
            ``inputs_embeds``.
        tokenizer: Tokenizer of ``model``; its ``pad_token`` is set to its
            ``eos_token`` as a side effect.
        criterion: Loss callable invoked as
            ``criterion(logits, target, mask, label_smoothing=..., reduce=...)``.
        device: Device on which the batch is processed.
        train: When true, gradients are accumulated via ``loss.backward()``;
            when false, the pass runs without building an autograd graph.

    Returns:
        Tuple ``(loss, perplexity)`` where ``perplexity = exp(loss)``.
    """
    tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer(batch_D, return_tensors='pt', padding='max_length', truncation=True, max_length=40)
    input_ids = inputs['input_ids'].to(device) # tensors of input ids
    labels = input_ids.clone()

    # Only track gradients when they will be used.
    with torch.set_grad_enabled(train):
        # embed the input ids using GPT-2 embedding
        input_emb = model.transformer.wte(input_ids)
        # add extra dim to cat together: the embedding becomes position 0
        prefix = torch.unsqueeze(batch_X.to(device), 1)
        inputs_embeds = torch.cat((prefix, input_emb), dim=1)   #[batch,max_length+1,emb_dim]

        # Index the tuple rather than unpack it, so a model that returns only
        # logits (no cache) still works.
        logits = model(inputs_embeds=inputs_embeds, past_key_values=None, return_dict=False)[0]
        # Drop the last position so logits[t] is scored against token t.
        logits = logits[:, :-1].contiguous()
        target = labels.contiguous()
        # The mask is all ones, so padded positions (pad == eos) count toward the loss.
        target_mask = torch.ones_like(target).float()
        loss = criterion(logits, target, target_mask, label_smoothing=0.02, reduce="batch")

    record_loss = loss.item()
    perplexity = np.exp(record_loss)
    if train:
        loss.backward()

    return record_loss, perplexity


### Training

In [4]:
times = 10

# Short embedding-model name -> model id stored in config['embed_model_path'].
model_cards = {
    'sent_t5_large': 'sentence-t5-large',
    'sent_t5_base': 'sentence-t5-base',
    'sent_t5_xl': 'sentence-t5-xl',
    'sent_t5_xxl': 'sentence-t5-xxl',
    'mpnet': 'all-mpnet-base-v1',
    'sent_roberta': 'all-roberta-large-v1',
    'simcse_bert': 'princeton-nlp/sup-simcse-bert-large-uncased',
    'simcse_roberta': 'princeton-nlp/sup-simcse-roberta-large',
}
parser = argparse.ArgumentParser(description='Training external NN as baselines')
parser.add_argument('--model_dir', type=str, default='microsoft/DialoGPT-large', help='Dir of your model')
# type=bool would turn every non-empty string (including "False") into True,
# so the flag text is parsed explicitly.
parser.add_argument('--resume', type=lambda text: str(text).strip().lower() in ('1', 'true', 't', 'yes', 'y'),
                    default=True, help='Resume training from checkpoint')
parser.add_argument('--resume_epoch', type=int, default=0, help='Epoch number to resume from')
parser.add_argument('--num_epochs', type=int, default=10, help='Training epoches.')
parser.add_argument('--batch_size', type=int, default=16, help='Batch_size #.')
parser.add_argument('--dataset', type=str, default='personachat', help='Name of dataset: personachat or qnli')
#parser.add_argument('--dataset', type=str, default='qnli', help='Name of dataset: personachat or qnli')
parser.add_argument('--data_type', type=str, default='train', help='train/test')
parser.add_argument('--embed_model', type=str, default='sent_t5_base', help='Name of embedding model: mpnet/sent_roberta/simcse_bert/simcse_roberta/sent_t5')
parser.add_argument('--decode', type=str, default='beam', help='Name of decoding methods: beam/sampling')
#parser.add_argument('--embed_model', type=str, default='simcse_roberta', help='Name of embedding model: mpnet/sent_roberta/simcse_bert/simcse_roberta/sent_t5')

# parse_known_args ignores the extra arguments the notebook kernel is started with.
args, unknown = parser.parse_known_args()

config = {}
config['model_dir'] = args.model_dir
config['num_epochs'] = args.num_epochs
config['batch_size'] = args.batch_size
config['dataset'] = args.dataset
config['data_type'] = args.data_type
config['embed_model'] = args.embed_model
config['decode'] = args.decode
config['embed_model_path'] = model_cards[config['embed_model']]
config['device'] = torch.device("cuda")
config['tokenizer'] = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')
config['eos_token'] = config['tokenizer'].eos_token
config['use_opt'] = False
# Honour the parsed flag (default True) instead of hard-coding it.
config['resume'] = args.resume
# config['resume_epoch'] =  3

device = torch.device("cuda:0")
#device = torch.device("cpu")
batch_size = config['batch_size']

# Using persona chat dataset
sent_list = get_sent_list(config)

# print(len(sent_list))
# print(sent_list[0:32])

# We will try limit sent list to 2 batches for debugging
# sent_list = sent_list[0:batch_size]

##### for training
# if(config['data_type'] == 'train'):
#     """Train the model with pretrained weights (probably will need to look at the code later)"""
#     process_data(sent_list,batch_size,device,config)
# elif(config['data_type'] == 'test'):
#     if('simcse' in config['embed_model']):
#         process_data_test_simcse(sent_list,batch_size,device,config,proj_dir=None,need_proj=False)
#     else:
#         """We haven't trained so we don't have a projection yet"""
#         process_data_test(sent_list,batch_size,device,config,need_proj=False)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

'We will try limit sent list to 2 batches for debugging'

In [5]:
#Load the model
def load_model(config):
    """Load the trained attacker, its tokenizer and the projection for inference.

    Args:
        config: Dict read for ``dataset``, ``embed_model`` (both used to build
            the checkpoint paths under ``models/``) and ``model_dir`` (tokenizer).

    Returns:
        Tuple ``(model_attacker, tokenizer_attacker, projection)``; both
        modules are on the notebook-level ``device`` and in eval mode.

    Raises:
        FileNotFoundError: If the attacker checkpoint directory does not exist.
    """
    attacker_path = f"models/attacker_diablogpt_large_{config['dataset']}_{config['embed_model']}"
    proj_path = f"models/projection_diablogpt_large_{config['dataset']}_{config['embed_model']}"

    # Fail with a clear message instead of hitting an unbound model variable.
    if not os.path.exists(attacker_path):
        raise FileNotFoundError(f"Attacker checkpoint not found at {attacker_path}; train the attacker first")

    model_attacker = AutoModelForCausalLM.from_pretrained(attacker_path).to(device)
    print(f"✅ Loaded attacker model from {attacker_path}")
    tokenizer_attacker = AutoTokenizer.from_pretrained(config['model_dir'])
    model_attacker.eval()

    projection = linear_projection(in_num=768, out_num=1280).to(device)
    if os.path.exists(proj_path):
        projection.load_state_dict(torch.load(proj_path, map_location=device))
        print(f"✅ Loaded projection from {proj_path}")
    else:
        # Still returned (best effort), but make the missing weights visible.
        print(f"⚠️ No projection checkpoint at {proj_path}; returning a randomly initialised projection")
    projection.eval()

    return model_attacker, tokenizer_attacker, projection

In [7]:
# The inference and evaluation cells below read attacker_model, tokenizer and
# projection from the notebook namespace; this is the only place they are created.
attacker_model, tokenizer, projection = load_model(config)

### Inference testing
Scenario 1: Noise in a separate sentence

RAW data: what grade are you in?

Noised RAW data: what grade are you in? HCMUT is a member of Vietnam National University, Ho Chi Minh City. It focuses on high technology and engineering.

- Step 1: Train the attacker model with the RAW data.
- Step 2: Feed the embedding vector of the Noised RAW data to the attacker model.
- Step 3: Check whether the RAW data can be recovered from the output.


In [None]:
# Sentence-embedding model for the inference cells, placed on the first GPU.
device_1 = torch.device("cuda", 0)
model = SentenceTransformer(
    config['embed_model_path'],
    device=device_1,
)

### Caveats
This model was trained only on salted embeddings, not on original + salt embeddings.

This might lead to bad results, but I'm not sure whether it will stay that way when I retrain the model, since this addition seems much easier to reverse. Using just 1 salt is too trivial, so we will try to use at least 5 different salts to retrain the model.

But is that a good idea? It will introduce mixed training signals,
where different embeddings map to the same sentence.

In [None]:
sample_sentence = ["What grade are you in? HCMUT is a member of Vietnam National University, Ho Chi Minh City. It focuses on high technology and engineering."]


def inference(sample_sentence, model, projection, attacker_model, use_sampling=False, top_k=0, noise=None, x2 = False, concat = False,
              max_new_tokens=50, temperature=1.0, tokenizer=None):
    """Reconstruct text from sentence embeddings with the trained attacker.

    The sentence(s) are embedded with ``model``, optionally shifted by
    ``noise`` and doubled, projected, and used as the first input position of
    ``attacker_model``. Tokens are then generated one at a time until every
    row has produced EOS or ``max_new_tokens`` is reached.

    Args:
        sample_sentence: A single sentence or a list of sentences.
        model: Embedding model exposing ``encode(list, convert_to_tensor=True)``.
        projection: Module mapping embeddings to the attacker's input width.
        attacker_model: Causal LM exposing ``transformer.wte`` and accepting
            ``inputs_embeds`` / ``past_key_values``.
        use_sampling: Sample from the distribution instead of greedy argmax.
        top_k: With sampling, keep only the ``top_k`` most likely tokens (0 disables).
        noise: Optional tensor subtracted from the embeddings before projection.
        x2: Multiply the embeddings by 2 before projection.
        concat: Repeat each sentence twice before embedding it.
        max_new_tokens: Upper bound on generated tokens per sentence.
        temperature: Softmax temperature used when sampling.
        tokenizer: Tokenizer used for EOS detection and decoding; defaults to
            the notebook-level ``tokenizer`` variable.

    Returns:
        List with one decoded string per input sentence.

    Raises:
        NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
    """
    if tokenizer is None:
        # Backward compatible with the notebook, where the tokenizer is a global.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("inference() needs a tokenizer: pass tokenizer=... or define a global `tokenizer`")

    model.eval()
    projection.eval()
    attacker_model.eval()

    def _duplicate(text):
        # Same repetition rule as before, now applied per sentence.
        if text.strip().endswith('.'):
            return text * 2
        return text + ". " + text + "."

    if not isinstance(sample_sentence, list):
        sample_sentence = [sample_sentence]
    if concat:
        sample_sentence = [_duplicate(text) for text in sample_sentence]

    # Run on whatever device the projection lives on instead of a global.
    target_device = next(projection.parameters()).device

    with torch.no_grad():
        #  Encode and project initial embedding
        print(f"Processing sample sentence: {sample_sentence}")
        embeddings = model.encode(sample_sentence, convert_to_tensor=True).to(target_device)
        if noise is not None:
            embeddings = embeddings - noise
        if x2:
            embeddings = embeddings * 2
        step_input = projection(embeddings).unsqueeze(1)  # [batch, seq_len=1, hidden_dim]

        batch = step_input.size(0)
        eos_id = tokenizer.eos_token_id
        finished = torch.zeros(batch, dtype=torch.bool, device=step_input.device)
        past_key_values = None
        generated = []

        #  Iteratively generate
        for _step in range(max_new_tokens):
            outputs = attacker_model(
                inputs_embeds=step_input,
                past_key_values=past_key_values,
                use_cache=True,
                return_dict=True
            )
            past_key_values = outputs.past_key_values

            #  Get next-token prediction (last position)
            next_token_logits = outputs.logits[:, -1, :]

            if use_sampling:
                if temperature is not None and temperature > 0:
                    next_token_logits = next_token_logits / temperature
                # Optional top-k filtering
                if top_k is not None and top_k > 0:
                    top_logits, _ = torch.topk(next_token_logits, top_k)
                    min_logits = top_logits[:, -1].unsqueeze(-1)
                    next_token_logits = torch.where(
                        next_token_logits < min_logits, torch.full_like(next_token_logits, -float('Inf')), next_token_logits
                    )

                # Sample from probability distribution
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1)
            else:
                # Greedy
                next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            if eos_id is not None:
                # Rows that already ended keep emitting EOS, so decoding with
                # skip_special_tokens drops everything after their first EOS.
                next_token_id = next_token_id.masked_fill(finished.unsqueeze(-1), eos_id)
                finished = finished | (next_token_id.squeeze(-1) == eos_id)

            generated.append(next_token_id)

            if bool(finished.all()):
                break

            # Only feed in the *new* token embedding next time; earlier
            # positions are covered by past_key_values.
            step_input = attacker_model.transformer.wte(next_token_id)

        if not generated:
            return ["" for _ in range(batch)]

        #  Decode generated tokens
        output_ids = torch.cat(generated, dim=1)
        return [tokenizer.decode(row, skip_special_tokens=True) for row in output_ids]

In [None]:
## Test with data outside training vs data similar to training
# total = 0
# for i in range(1000):
#   res = inference(sent_list[i], model, projection, attacker_model)
#   if sent_list[i] == res[0]: total += 1
#   print(sent_list[i],"\n", res)

# total
print(model)
# After 10 epochs the model has overfit the training data, generalization to external data is affected
# NOTE(review): `noised` is the embedding of the very sentence passed to inference(),
# and inference() subtracts `noise` from the embedding — confirm this is the intended probe.
noised = model.encode(
    "i'm in 4th grade. HCMUT is a school",
    convert_to_tensor=True,
).to(device)
inference(
    "i'm in 4th grade. HCMUT is a school",
    model,
    projection,
    attacker_model,
    noise=noised,
)

### Scoring based on semantic similarity
Using cosine similarity on SBERT embeddings

The model seems to have overfit the training data. The data is also conversational, so it naturally doesn't generalize well to information or data outside the scope it was trained on. Entity-name reconstruction seems to be a good noise factor.

---
Results after 10 epochs of training; the evaluation metrics are SBERT cosine, BERTScore and ROUGE-L. In every case the noise `"HCMUT is a member of Vietnam National University, Ho Chi Minh City. It focuses on high technology and engineering."` was added within the sentence:
- Data taken from the training dataset
  - Results on samples from the training set show that the model has `overfit` the training data; when noise is applied, the similarity drops significantly
- Conversational data similar to the training dataset (the provided PersonaChat)
  - Base results are worse than on training data, but the reconstruction is still partly reasonable; the overfitting problem may have affected performance a little
  - For noised reconstruction, the result is the same as in case 1: significantly worse than the base reconstruction
- External general data
  - Base reconstruction is much worse than on conversational data
  - Noised reconstruction is not much better either
---
Adding noise either inside the sentence or as separate sentences seems to be pretty effective at preventing the true information from being recovered. There is some trade-off between the two, with each achieving better results on certain samples, but in general `in-sentence noise yields better results for stopping reverse-embedding attacks`.

However, the same effectiveness cannot be claimed for larger models.


Suspected reason:
- The training data is limited to a specific domain `(conversational only)`, making it hard to adapt to external forms of data `(more formal -- structured)`
- The embeddings capture general knowledge from the sentence, so information about named entities like `HCMUT` is usually lost and the decoder is unable to recover it.
---
Next step: train on a more general, bigger dataset and try again.

In [8]:
# %pip installs into the running kernel's environment; versions are pinned to the ones this notebook was run with.
%pip install -q bert_score==0.3.13 rouge_score==0.1.2

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=359476a261166e501b9be834a75d699f931b28a9eed8686a922a83518fa678d7
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score, bert_score
Successfully installed bert_score-0.3.13 rouge_score-0.1.2


In [9]:
sbert = SentenceTransformer('all-MiniLM-L6-v2')  # small+fast; swap for larger if you want
def sbert_cosine(a, b):
    """Cosine similarity between the SBERT embeddings of ``a`` and ``b``, as a Python float."""
    # Encode both texts in order (a first, then b) as NumPy vectors.
    vec_a, vec_b = (sbert.encode(text, convert_to_tensor=False) for text in (a, b))
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / norm_product)


# 2) BERTScore
from bert_score import score
from bert_score import BERTScorer

# model_type -> BERTScorer. bert_score.score() rebuilds the model on every
# call, which is costly when scoring sentence pairs one at a time in a loop.
# The cached scorer keeps its model loaded for the lifetime of the kernel.
_BERT_SCORERS = {}

def bertscore_f1(hyps, refs, model_type='microsoft/deberta-xlarge-mnli'):
    """Baseline-rescaled BERTScore F1 for each (hypothesis, reference) pair.

    Args:
        hyps: List of candidate strings.
        refs: List of reference strings, aligned with ``hyps``.
        model_type: Model name handed to bert_score.

    Returns:
        List of floats, one F1 value per pair.
    """
    bert_scorer = _BERT_SCORERS.get(model_type)
    if bert_scorer is None:
        bert_scorer = BERTScorer(model_type=model_type, lang='en', rescale_with_baseline=True)
        _BERT_SCORERS[model_type] = bert_scorer
    _, _, F1 = bert_scorer.score(hyps, refs)
    return F1.numpy().tolist()  # list of floats


# 3) ROUGE-L
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
def rouge_l(a, b):
    """ROUGE-L F-measure from ``scorer.score(a, b)`` (arguments passed in that order)."""
    rouge_scores = scorer.score(a, b)
    return rouge_scores['rougeL'].fmeasure

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Noise in separate sentences

In [None]:
import random

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)  # inference() can sample tokens with torch.multinomial



noise = "HCMUT is a member of Vietnam National University, Ho Chi Minh City. It focuses on high technology and engineering."
# Embedding of the noise sentence; the evaluation on conversational data passes
# it to inference(noise=...), so it has to be defined here.
noise_embedding = model.encode(noise, convert_to_tensor=True).to(device)
# sampled = np.random.choice(sent_list, size=10, replace=False)  # sampled is an array of 10 items
lines = [
    "but i do take fridays off , when the weather gets warm . aka summer .",
    "wow that is great , i use to be in the military .",
    "cool ! i am more of a sports fan , love watching tennis or golf on tv",
    "ha , not for a long time ! i am what they call a baby boomer",
    "i am great , i just miss my dad today . how are you ?",
    "he is a sweet black lab",
    "hey there , how are you ?",
    "cool , so tell me more about you ?",
    "and spaghetti and meatballs right ? 1d and that would make the best night",
    "i heard that most accountants play the piano like i do"
]
# case1 = list(lines)


# # noisy_case1 = [
# #     s * times if s.strip().endswith('.') else (s + '.') * times for s in case1
# # ]

# noisy_case1 = [
#     s + ' ' + noise if s.strip().endswith('.') else (s + '. ') + noise for s in case1
# ]

# print(noisy_case1)

# result = []
# for i in range(10):
#     reconstruct = inference(noisy_case1[i], model, projection, attacker_model, noise = noise_embedding)[0]  # only 1 sample at a time
#     result.append({
#       "original": noisy_case1[i],
#       "reconstruct": reconstruct,
#       "sbert_cosine": sbert_cosine(case1[i], reconstruct),
#       "bertscore_f1": bertscore_f1([case1[i]], [reconstruct]),
#       "rouge_l": rouge_l(case1[i], reconstruct)
#     })


# result_df = pd.DataFrame(result)

# result_base = []

# for i in range(10):
#     reconstruct = inference(case1[i], model, projection, attacker_model)[0]  # only 1 sample at a time
#     result_base.append({
#       "original": case1[i],
#       "reconstruct": reconstruct,
#       "sbert_cosine": sbert_cosine(case1[i], reconstruct),
#       "bertscore_f1": bertscore_f1([case1[i]], [reconstruct]),
#       "rouge_l": rouge_l(case1[i], reconstruct)
#     })

# result_base_df = pd.DataFrame(result_base)


In [None]:
# Show the per-sentence reconstruction metrics for the noised and the clean runs.
# NOTE(review): in the cell above, the code that assigns result_df and
# result_base_df is commented out — on a fresh kernel this cell needs them
# defined first; confirm the intended execution order.
print("Sentences with noise")
display(result_df)
print("\n\nSentences without noise")
display(result_base_df)

In [None]:
# %pip installs into the running kernel's environment (needed by the Excel export below).
%pip install -q xlsxwriter

In [None]:

import os

# Directory the workbook is written into (must already exist).
path = 'models/result-desalted'

# One workbook with two sheets: salted-input results and the unsalted baseline.
with pd.ExcelWriter(os.path.join(path, "training.xlsx"), engine="xlsxwriter", mode='w') as writer:
    for _sheet, _frame in (("With Noise", result_df), ("Without Noise", result_base_df)):
        _frame.to_excel(writer, sheet_name=_sheet, index=False)

# with pd.ExcelWriter(os.path.join(path,"swithoutnoise-training.xlsx"), engine="xlsxwriter") as writer:
#     result_base_df.to_excel(writer, sheet_name="Without Noise", index=False)

In [None]:
### Conversational data similar to the training data
# `samples` is bound to the same list object as `sampled_convo` (kept as in the original cell).
sampled_convo = samples = [
 "i love painting and hiking , spring is my favorite season honestly .",
 "what would you do if you didn t have to worry about money for a week ?",
 "i wrapped my foot after the hike , it was nothing serious though",
 "yo you dont know who youre messing with , dont call me any names man",
 "that s great ! you should send her a short poem or something",
 "i rarely travel but my parents still take me to family reunions sometimes",
 "how did it go ? sounds like it must have been a lot of fun",
 "i m doing well thanks ! i teach art at a local middle school",
 "i m okay . how have you been lately ?",
 "that sounds lovely ! i m a big fan of sunny days and ice cream"
]

case2 = list(sampled_convo)

# Salted inputs: the salt sentence `noise` is appended after the original text;
# a '.' is inserted first when the sentence does not already end with one.
noisy_case2 = [
    s + ' ' + noise if s.strip().endswith('.') else (s + '. ') + noise for s in case2
]

# Reconstruct from the salted text. Every metric is scored against the CLEAN
# sentence, because the question is how much of the original text leaks.
result = []
for clean, salted in zip(case2, noisy_case2):
    reconstruct = inference(salted, model, projection, attacker_model, noise = noise_embedding)[0]  # only 1 sample at a time
    result.append({
      "original": salted,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_df = pd.DataFrame(result)

# Baseline: reconstruct from the unsalted text.
result_base = []
for clean in case2:
    reconstruct = inference(clean, model, projection, attacker_model)[0]  # only 1 sample at a time
    result_base.append({
      "original": clean,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      # Fixed: the baseline previously scored ROUGE-L against the salted sentence,
      # which was never given to the model in this loop and made this column
      # inconsistent with the other two metrics.
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_base_df = pd.DataFrame(result_base)


In [None]:
import os

# Directory the workbook is written into (must already exist).
path = 'models/result-desalted'

# One workbook with two sheets: salted-input results and the unsalted baseline.
with pd.ExcelWriter(os.path.join(path, "similar.xlsx"), engine="xlsxwriter") as writer:
    for _sheet, _frame in (("With Noise", result_df), ("Without Noise", result_base_df)):
        _frame.to_excel(writer, sheet_name=_sheet, index=False)

In [None]:
# Show the table for the salted inputs first, then the unsalted baseline table.
for _title, _frame in (
    ("Sentences with noise", result_df),
    ("\n\nSentences without noise", result_base_df),
):
    print(_title)
    display(_frame)

In [None]:
### Non-conversational general data

synthetic_samples = [
    "The tallest building in the city casts a shadow over the entire park at noon.",
    "I enjoy experimenting with new recipes in the kitchen on weekends.",
    "Quantum computing has the potential to revolutionize encryption and data processing.",
    "The rainforest is home to thousands of unique plant and animal species.",
    "Astronomers discovered a new exoplanet orbiting a star 300 light-years away.",
    "I started painting landscapes to capture the beauty of the mountains in spring.",
    "Electric cars are becoming more popular as battery technology improves.",
    "Meditation can reduce stress and improve focus when practiced regularly.",
    "The software update fixed several bugs but introduced a new glitch in the UI.",
    "During winter, the lake freezes over and attracts many ice-skating enthusiasts."
]

case3 = list(synthetic_samples)

# Salted inputs: the salt sentence `noise` is appended after the original text;
# a '.' is inserted first when the sentence does not already end with one.
noisy_case3 = [
    s + ' ' + noise if s.strip().endswith('.') else (s + '. ') + noise for s in case3
]

# Reconstruct from the salted text. Every metric is scored against the CLEAN
# sentence, because the question is how much of the original text leaks.
result = []
for clean, salted in zip(case3, noisy_case3):
    reconstruct = inference(salted, model, projection, attacker_model, noise = noise_embedding)[0]  # only 1 sample at a time
    result.append({
      "original": salted,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_df = pd.DataFrame(result)

# Baseline: reconstruct from the unsalted text.
result_base = []
for clean in case3:
    reconstruct = inference(clean, model, projection, attacker_model)[0]  # only 1 sample at a time
    result_base.append({
      "original": clean,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      # Fixed: the baseline previously scored ROUGE-L against the salted sentence,
      # which was never given to the model in this loop and made this column
      # inconsistent with the other two metrics.
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_base_df = pd.DataFrame(result_base)

In [None]:
import os

# Directory the workbook is written into (must already exist).
path = 'models/result-desalted'

# One workbook with two sheets: salted-input results and the unsalted baseline.
with pd.ExcelWriter(os.path.join(path, "outside.xlsx"), engine="xlsxwriter") as writer:
    for _sheet, _frame in (("With Noise", result_df), ("Without Noise", result_base_df)):
        _frame.to_excel(writer, sheet_name=_sheet, index=False)

In [None]:
# Show the table for the salted inputs first, then the unsalted baseline table.
for _title, _frame in (
    ("Sentences with noise", result_df),
    ("\n\nSentences without noise", result_base_df),
):
    print(_title)
    display(_frame)

### Noise inside the sentence

In [None]:
# Inline salt: this text is placed directly in front of each sentence.
noise = "HCMUT is a member of Vietnam National University, Ho Chi Minh City and some one said: "

### Data from the training sets -- sample 10 random ones
# Seed both RNGs so the same ten sentences are drawn on every run.
np.random.seed(42)
random.seed(42)

sampled = np.random.choice(sent_list, size=10, replace=False)  # array of 10 distinct sentences
case1 = list(sampled)

noisy_case1 = [noise + sentence for sentence in case1]

# Reconstruct from the salted text; all metrics compare against the clean sentence.
result = []
for clean, salted in zip(case1, noisy_case1):
    reconstruct = inference(salted, model, projection, attacker_model)[0]  # only 1 sample at a time
    result.append({
      "original": salted,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_df = pd.DataFrame(result)

# Baseline: reconstruct from the unsalted text.
result_base = []
for clean in case1:
    reconstruct = inference(clean, model, projection, attacker_model)[0]  # only 1 sample at a time
    result_base.append({
      "original": clean,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_base_df = pd.DataFrame(result_base)


In [None]:
# Show the table for the salted inputs first, then the unsalted baseline table.
for _title, _frame in (
    ("Sentences with noise", result_df),
    ("\n\nSentences without noise", result_base_df),
):
    print(_title)
    display(_frame)

In [None]:
# Write both result tables as two sheets of one workbook in the working directory.
with pd.ExcelWriter("sentences.xlsx", engine="xlsxwriter") as writer:
    for _sheet, _frame in (("With Noise", result_df), ("Without Noise", result_base_df)):
        _frame.to_excel(writer, sheet_name=_sheet, index=False)

print("Saved to sentences.xlsx")

In [None]:
### Conversational data similar to the training data
# `samples` is bound to the same list object as `sampled_convo` (kept as in the original cell).
sampled_convo = samples = [
 "i love painting and hiking , spring is my favorite season honestly .",
 "what would you do if you didn t have to worry about money for a week ?",
 "i wrapped my foot after the hike , it was nothing serious though",
 "yo you dont know who youre messing with , dont call me any names man",
 "that s great ! you should send her a short poem or something",
 "i rarely travel but my parents still take me to family reunions sometimes",
 "how did it go ? sounds like it must have been a lot of fun",
 "i m doing well thanks ! i teach art at a local middle school",
 "i m okay . how have you been lately ?",
 "that sounds lovely ! i m a big fan of sunny days and ice cream"
]

case2 = list(sampled_convo)

# Inline salt: the `noise` text is placed directly in front of each sentence.
noisy_case2 = [noise + s for s in case2]

# Reconstruct from the salted text. Every metric is scored against the CLEAN
# sentence, because the question is how much of the original text leaks.
result = []
for clean, salted in zip(case2, noisy_case2):
    reconstruct = inference(salted, model, projection, attacker_model)[0]  # only 1 sample at a time
    result.append({
      "original": salted,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_df = pd.DataFrame(result)

# Baseline: reconstruct from the unsalted text.
result_base = []
for clean in case2:
    reconstruct = inference(clean, model, projection, attacker_model)[0]  # only 1 sample at a time
    result_base.append({
      "original": clean,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      # Fixed: the baseline previously scored ROUGE-L against the salted sentence,
      # which was never given to the model in this loop and made this column
      # inconsistent with the other two metrics.
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_base_df = pd.DataFrame(result_base)


In [None]:
# Show the table for the salted inputs first, then the unsalted baseline table.
for _title, _frame in (
    ("Sentences with noise", result_df),
    ("\n\nSentences without noise", result_base_df),
):
    print(_title)
    display(_frame)

In [None]:
# Fixed: this cell used to write "sentences.xlsx", the same file the export cell for
# the training-set samples writes, so running the notebook top to bottom silently
# overwrote those results. Use a dedicated workbook for the conversational
# ("similar") samples, mirroring the training/similar/outside naming used for the
# reports of the separate-sentence experiment.
similar_report = "sentences_similar.xlsx"

with pd.ExcelWriter(similar_report, engine="xlsxwriter") as writer:
    result_df.to_excel(writer, sheet_name="With Noise", index=False)
    result_base_df.to_excel(writer, sheet_name="Without Noise", index=False)

print(f"Saved to {similar_report}")

In [None]:
### Non-conversational general data

synthetic_samples = [
    "The tallest building in the city casts a shadow over the entire park at noon.",
    "I enjoy experimenting with new recipes in the kitchen on weekends.",
    "Quantum computing has the potential to revolutionize encryption and data processing.",
    "The rainforest is home to thousands of unique plant and animal species.",
    "Astronomers discovered a new exoplanet orbiting a star 300 light-years away.",
    "I started painting landscapes to capture the beauty of the mountains in spring.",
    "Electric cars are becoming more popular as battery technology improves.",
    "Meditation can reduce stress and improve focus when practiced regularly.",
    "The software update fixed several bugs but introduced a new glitch in the UI.",
    "During winter, the lake freezes over and attracts many ice-skating enthusiasts."
]

case3 = list(synthetic_samples)

# Inline salt: the `noise` text is placed directly in front of each sentence.
noisy_case3 = [noise + s for s in case3]

# Reconstruct from the salted text. Every metric is scored against the CLEAN
# sentence, because the question is how much of the original text leaks.
result = []
for clean, salted in zip(case3, noisy_case3):
    reconstruct = inference(salted, model, projection, attacker_model)[0]  # only 1 sample at a time
    result.append({
      "original": salted,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_df = pd.DataFrame(result)

# Baseline: reconstruct from the unsalted text.
result_base = []
for clean in case3:
    reconstruct = inference(clean, model, projection, attacker_model)[0]  # only 1 sample at a time
    result_base.append({
      "original": clean,
      "reconstruct": reconstruct,
      "sbert_cosine": sbert_cosine(clean, reconstruct),
      "bertscore_f1": bertscore_f1([clean], [reconstruct]),
      # Fixed: the baseline previously scored ROUGE-L against the salted sentence,
      # which was never given to the model in this loop and made this column
      # inconsistent with the other two metrics.
      "rouge_l": rouge_l(clean, reconstruct)
    })

result_base_df = pd.DataFrame(result_base)

In [None]:
# Show the table for the salted inputs first, then the unsalted baseline table.
for _title, _frame in (
    ("Sentences with noise", result_df),
    ("\n\nSentences without noise", result_base_df),
):
    print(_title)
    display(_frame)

In [None]:
# Fixed: this cell used to write "sentences.xlsx", the same file the export cell for
# the training-set samples writes, so running the notebook top to bottom silently
# overwrote those results. Use a dedicated workbook for the non-conversational
# ("outside") samples, mirroring the training/similar/outside naming used for the
# reports of the separate-sentence experiment.
outside_report = "sentences_outside.xlsx"

with pd.ExcelWriter(outside_report, engine="xlsxwriter") as writer:
    result_df.to_excel(writer, sheet_name="With Noise", index=False)
    result_base_df.to_excel(writer, sheet_name="Without Noise", index=False)

print(f"Saved to {outside_report}")

### Differential Privacy & FGSM

In [117]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def Differential(embedding, epsilon=10.0, sensitivity=1.0, renormalize=False, tensor=True):
    """
    Applies Differential Privacy to embedding vectors using the Laplace Mechanism.

    Args:
        embedding (torch.Tensor | np.ndarray): The original embedding(s); a single
            vector of shape (dim,) or a batch of shape (n, dim). Tensors are
            detached and copied to the CPU first, so CUDA and grad-tracking
            tensors are accepted.
        epsilon (float): The privacy budget. Lower value = More privacy (more noise).
                         Higher value = Less privacy (less noise). Must be > 0.
        sensitivity (float): The maximum amount the embedding can change.
                             Usually 1.0 or 2.0 for normalized embeddings.
        renormalize (bool): If True, rescale every noisy vector to unit L2 norm.
        tensor (bool): If True, return a float32 torch.Tensor on the module-level
            `device`; otherwise return the NumPy array.

    Returns:
        torch.Tensor | np.ndarray: The noisy, private embedding(s), same shape as the input.

    Raises:
        ValueError: If `epsilon` is not strictly positive.
    """
    if epsilon <= 0:
        raise ValueError("epsilon must be strictly positive")

    if torch.is_tensor(embedding):
        # detach() so tensors that track gradients can be converted as well
        embedding = embedding.detach().cpu().numpy()
    else:
        embedding = np.asarray(embedding)

    # Calculate the scale of the noise (b = sensitivity / epsilon)
    scale = sensitivity / epsilon

    # Generate Laplacian noise with the same shape as the embedding
    noise = np.random.laplace(loc=0.0, scale=scale, size=embedding.shape)

    # Add noise to the original embedding
    private_embedding = embedding + noise

    if renormalize:
        # Normalize along the feature axis so that EACH vector of a batch gets unit
        # length. A single global norm would scale a whole (n, dim) batch by one
        # shared factor and leave the individual rows un-normalized.
        private_embedding = private_embedding / np.linalg.norm(
            private_embedding, axis=-1, keepdims=True
        )
    if tensor:
        private_embedding = torch.tensor(private_embedding, dtype=torch.float32).to(device)

    return private_embedding




import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, T5EncoderModel, GPTNeoXForCausalLM

# ==========================================
# 1. SETUP: The "Victim" Model (Sentence-T5)
# ==========================================
# from sentence_transformers import SentenceTransformer

# victim_model = SentenceTransformer('sentence-transformers/sentence-t5-base')
# victim_model.eval() # Good practice, though .encode() handles this internally

def get_clean_embedding(text_list, victim_model):
    """
    Embed `text_list` with `victim_model` and return the result as tensor(s).

    The model is switched to evaluation mode before its `encode` method is
    called with `convert_to_tensor=True`; whatever `encode` returns is passed
    straight back to the caller.
    """
    victim_model.eval()
    return victim_model.encode(text_list, convert_to_tensor=True)


# ==========================================
# 2. THE ATTACKER: Pythia-160m Inverter
# ==========================================
class PythiaAttacker(nn.Module):
    """Embedding-inversion attacker built on the Pythia-160m causal language model.

    A linear projector maps a sentence embedding to the language model's hidden
    size. The projected vector is prepended to the embedded target tokens as one
    "virtual token" and the whole sequence is run through the language model.
    """

    def __init__(self, input_dim=768, device='cpu'):
        """Load the Pythia-160m tokenizer and model and create the projector.

        Args:
            input_dim: Dimensionality of the incoming sentence embeddings.
            device: Device the language model and the projector are moved to; it is
                also stored on the instance and reused in `forward`.
        """
        super().__init__()
        self.device = device
        print("Loading Pythia-160m...")
        self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
        # Ensure padding token exists for Pythia (GPT-NeoX): fall back to the EOS token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-160m").to(device)

        # Projector: maps the sentence-embedding dim (input_dim) to Pythia's hidden size
        # Even if dims are same, this layer learns to align the latent spaces
        self.pythia_dim = self.model.config.hidden_size
        self.projector = nn.Linear(input_dim, self.pythia_dim).to(device)

    def forward(self, sentence_embedding, target_text_list=None):
        """
        Run the language model on [projected embedding, target tokens].

        Args:
            sentence_embedding: (batch, input_dim) sentence embeddings
            target_text_list: List of strings (the original text)

        Returns:
            (logits, input_ids) when `target_text_list` is non-empty: the language
            model's logits for the prefixed sequence (one position longer than
            `input_ids`) and the tokenized targets. `(None, None)` when
            `target_text_list` is None or empty.
        """
        # 1. Project the sentence embedding to Pythia's space
        # Shape: (batch, 1, pythia_dim) -> It becomes the first "virtual token"
        prompt_embeds = self.projector(sentence_embedding).unsqueeze(1)

        # 2. Prepare Target Labels (if training/evaluating)
        if target_text_list:
            # Tokenize targets with Pythia's tokenizer (padded per batch, truncated to 64 tokens)
            target_inputs = self.tokenizer(
                target_text_list,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=64
            ).to(self.device)

            input_ids = target_inputs['input_ids']
            attention_mask = target_inputs['attention_mask']

            # Get embeddings of the real text
            # inputs_embeds: (batch, seq_len, pythia_dim)
            text_embeds = self.model.gpt_neox.embed_in(input_ids)

            # 3. Concatenate: [Projected_Emb + Text_Embeds]
            # We prepend the sentence embedding to the sequence
            inputs_embeds = torch.cat([prompt_embeds, text_embeds], dim=1)

            # Extend attention mask (add 1 for the prepended token)
            batch_size = input_ids.shape[0]
            prefix_mask = torch.ones((batch_size, 1), device=self.device)
            attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

            # 4. Forward Pass through Pythia
            outputs = self.model(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
            return outputs.logits, input_ids

        # No target text supplied: nothing to score against
        return None, None

# ==========================================
# 3. THE DEFENSE: FGSM
# ==========================================
def fgsm_protect(sentence_embedding, text_list, attacker_model, epsilon=0.1):
    """
    Perturb an embedding with one FGSM step that increases the attacker's
    reconstruction loss.

    Args:
        sentence_embedding: Embedding tensor passed to `attacker_model`
            (the notebook passes a (1, dim) tensor per call).
        text_list: The original text(s) the attacker is scored against.
        attacker_model: Callable returning `(logits, target_ids)` for
            `(embedding, text_list)`, where `logits` has one more sequence
            position than `target_ids` (the prepended virtual token).
        epsilon: Step size; each coordinate moves by exactly +/- epsilon
            (or stays put where the gradient is exactly zero).

    Returns:
        A detached tensor `sentence_embedding + epsilon * sign(dLoss/dEmbedding)`.

    Raises:
        ValueError: If the attacker returns no logits (e.g. empty `text_list`).

    Note:
        The cross-entropy is averaged over every target position, padded ones
        included, so batches with sentences of different length also push
        towards mis-predicting padding.
    """
    # 1. Setup gradient tracking on a private copy of the embedding
    protected_emb = sentence_embedding.clone().detach().requires_grad_(True)

    # 2. Forward pass through Attacker
    logits, target_ids = attacker_model(protected_emb, text_list)
    if logits is None:
        raise ValueError("attacker_model returned no logits; text_list must be non-empty")

    # 3. Align Logits and Labels
    # Logits cover [VirtualToken, Token1, Token2, ...]; the logit at position k
    # predicts target token k, so only the final position is dropped.
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = target_ids.contiguous()

    # 4. Calculate Loss
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

    # 5. Gradient w.r.t. the embedding only. torch.autograd.grad skips the
    # weight gradients of the attacker (which loss.backward() would compute and
    # keep in memory) and leaves the attacker's .grad buffers untouched.
    (grad,) = torch.autograd.grad(loss, protected_emb)

    # Step in the direction that maximizes the reconstruction loss
    noise = epsilon * grad.sign()
    final_emb = sentence_embedding + noise

    return final_emb.detach()



#===== load the proxy model ======
import torch
import os

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build an attacker with freshly loaded Pythia weights (PythiaAttacker must already be defined).
loaded_attacker = PythiaAttacker(input_dim=768, device=device).to(device)

# Checkpoint location: FGSM/attacker_e5v2_best.pth
save_directory = 'FGSM'
model_path = os.path.join(save_directory, 'attacker_e5v2_best.pth')

# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
attacker_state = torch.load(model_path, map_location=device)
loaded_attacker.load_state_dict(attacker_state)

Loading Pythia-160m...


<All keys matched successfully>

### Phase 2 - Proving that adding salt still works with downstream tasks
Only the methodology of adding salt as a separate sentence is used here.
Because adding salt changes the output embedding vector, we need to show that any downstream task that receives the salted embedding vectors as input still works.

### Clustering
- Choose any popular clustering algorithm, ex: K-means clustering
- Cluster raw data embedding vector (without salt) to some groups (10, 20, 50, 100)
- For each raw data item, verify that the K-means model outputs the same group for both inputs: the raw data embedding vector and the salted data embedding vector.

### Similarity search
- Generate the embedding vectors of all raw data; together they form the embedding vector database.
- For each raw data item, generate a salted version, then run a similarity search for the best-matching item in the embedding vector database.
- Show that the best match is still the raw data item even though the salted version is used as the query.


In [118]:
# Load the sentence-embedding model used for the downstream-task experiments.
# NOTE(review): the device is hard-coded to "cuda" here, unlike the attacker
# setup, which falls back to the CPU when no GPU is available.
device_1 = "cuda"
name = 'intfloat/e5-base-v2'
# Alternative embedding models (disabled):
# name = 'all-distilroberta-v1'
# name = 'sentence-t5-base'
model = SentenceTransformer(name, device=device_1)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
def _repeat_twice(sentence):
    """Return `sentence` written twice, following the rule used for `sent_list_x2`."""
    if sentence.strip().endswith('.'):
        # Already ends with a period: plain string repetition, no separator is added.
        return sentence * 2
    return sentence + '. ' + sentence + '.'

sent_list_x2 = [_repeat_twice(s) for s in sent_list]

# Encode all doubled sentences in batches of 128 and show the resulting tensor size.
embeddings_raw = model.encode(sent_list_x2, convert_to_tensor=True, show_progress_bar=True, batch_size=128)
embeddings_raw.size()

In [119]:
# Sub-directory name under kmeans/ that holds this embedding model's saved
# embeddings, K-Means models and report files.
model_name = 'e5-base-v2'

In [120]:
f'kmeans/{model_name}/embeddings.npy'

'kmeans/e5-base-v2/embeddings.npy'

In [None]:
# embeddings_np = embeddings_raw.detach().cpu().numpy()
# np.save(f'kmeans/{model_name}/embeddings.npy', embeddings_np)

In [121]:
# NOTE(review): the (commented-out) save step for this file writes `embeddings_raw`,
# which is computed from `sent_list_x2` (every sentence repeated twice) — confirm
# that the stored file really holds embeddings of the unmodified sentences.
embeddings_np = np.load(f'kmeans/{model_name}/embeddings.npy')  # embeddings on the raw data

In [None]:
noised_embeddings = np.load(f'kmeans/{model_name}/noised_embeddings.npy')  # embeddings on the salted data

In [None]:
# Element-wise doubling of every stored embedding; this scaled copy is what the
# K-Means training cell fits on.
embeddings_x2 = embeddings_np * 2

In [None]:
# Author's note: embeddings are normalized to length one
# NOTE(review): the array fitted below is `embeddings_x2` (the stored embeddings
# multiplied by 2), although the progress message says "raw data" — confirm intent.

### FOR TRAINING CLUSTERS ONLY

import numpy as np
from sklearn.cluster import KMeans
import joblib


cluster_counts = [10, 20, 50, 100]

for k in cluster_counts:
    print(f"\n=== K-Means with {k} clusters on raw data===")

    # Fixed random_state makes the clustering repeatable; n_init=10 keeps the
    # best of ten initialisations.
    kmeans = KMeans(
        n_clusters=k,
        n_init=10,
        random_state=42,
        verbose=True
    )
    kmeans.fit(embeddings_x2)

    model_path = f"kmeans/{model_name}/result_x2/{k}_clusters.joblib"
    # joblib.dump does not create missing directories; make sure the target
    # folder exists so a long fit is not lost to a FileNotFoundError.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(kmeans, model_path)
    print(f"Saved model: {model_path}")

In [None]:
"""Testing on k-means models"""

In [122]:
import random

# Seed both RNGs so the same sentences are drawn on every run.
np.random.seed(42)
random.seed(42)

# Salt sentence. Earlier variants that were tried:
#   "HCMUT is a member of Vietnam National University, Ho Chi Minh City. It focuses on high technology and engineering."
#   "HCMUT is a university"
noise = "HCMUT is a university specializing in technology and engineering."

# Draw 20 distinct sentences from the corpus and keep them as a plain list.
sampled = np.random.choice(sent_list, size=20, replace=False)
samples = list(sampled)

# Embedding of the salt sentence on its own, as a NumPy vector.
noise_embedding = model.encode(noise, convert_to_numpy=True)

In [123]:
samples

[np.str_('imagine a marching bad stampede . now my knee is almost all metal .'),
 np.str_('ahh i see , do what you love , i want to be a botanist , i love roses .'),
 np.str_('oh okay . are you in college ?'),
 np.str_('you are not reading what i type'),
 np.str_('negative , no siblings . ma and pa died in a plane crash long ago . you ?'),
 np.str_('i am fine thank you how is your day going ?'),
 np.str_('sounds about right ! what tv shows do you like ?'),
 np.str_('hi , want to be friends ?'),
 np.str_('hello . how are you doing today ?'),
 np.str_('i love dogs too ! so much .'),
 np.str_('oh i bet . i live walking distance to my office . lol so not bad at all'),
 np.str_('i love water sports . it is fun to get soaked'),
 np.str_('besides listening to music i also like to knit from time to time'),
 np.str_('ok , just try to cut back a little at the time'),
 np.str_('we met at school . we both drive mustangs .'),
 np.str_('hello ! how are you tonight ?'),
 np.str_('its a beautiful day 

In [124]:
noise_embedding.shape

(768,)

In [125]:
# Sub-directory (under kmeans/{model_name}/) that the report files of this run are written to.
place = 'FGSM'

In [126]:
import joblib

# Load the previously saved 10-cluster K-Means model for the current embedding model.
k10 = joblib.load(f'kmeans/{model_name}/10_clusters.joblib')

In [127]:
import pandas as pd

# 10 clusters model
k10 = joblib.load(f'kmeans/{model_name}/10_clusters.joblib')

# Cluster ids of the 20 unmodified sample embeddings.
embed_samples = model.encode(samples, convert_to_numpy=True)
labels1 = k10.predict(embed_samples)

# No text salt is applied in this run: the "salted" texts are the raw samples and
# the perturbation is applied to their embeddings by fgsm_protect instead.
salted_samples = samples
embed_salted_samples = model.encode(salted_samples, convert_to_tensor=True)

# fgsm_protect is called one sentence at a time; the original note says a batched
# call would use the average batch gradient, which is not wanted here.
embed_salted_samples_mod = [
    fgsm_protect(vec.unsqueeze(0), text, loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
    for vec, text in zip(embed_salted_samples, salted_samples)
]
labels2 = k10.predict(np.array(embed_salted_samples_mod))

# Side-by-side table: cluster of the clean embedding vs. cluster of the protected one.
res_df = pd.DataFrame(columns=['sample', 'clusters_id1', 'salted sample', 'clusters_id2'])
res_df['sample'] = samples
res_df['clusters_id1'] = labels1
res_df['salted sample'] = salted_samples
res_df['clusters_id2'] = labels2
res_df

Unnamed: 0,sample,clusters_id1,salted sample,clusters_id2
0,imagine a marching bad stampede . now my knee ...,1,imagine a marching bad stampede . now my knee ...,1
1,"ahh i see , do what you love , i want to be a ...",3,"ahh i see , do what you love , i want to be a ...",3
2,oh okay . are you in college ?,9,oh okay . are you in college ?,9
3,you are not reading what i type,1,you are not reading what i type,2
4,"negative , no siblings . ma and pa died in a p...",1,"negative , no siblings . ma and pa died in a p...",1
5,i am fine thank you how is your day going ?,2,i am fine thank you how is your day going ?,4
6,sounds about right ! what tv shows do you like ?,9,sounds about right ! what tv shows do you like ?,8
7,"hi , want to be friends ?",9,"hi , want to be friends ?",4
8,hello . how are you doing today ?,4,hello . how are you doing today ?,4
9,i love dogs too ! so much .,7,i love dogs too ! so much .,7


In [44]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.9


In [128]:
os.path.exists(f"kmeans/{model_name}/{place}")

True

In [129]:
# Store the per-sample cluster comparison for the 10-cluster model as a one-sheet workbook.
_k10_report = f"kmeans/{model_name}/{place}/samples_20_k10.xlsx"
with pd.ExcelWriter(_k10_report, engine="xlsxwriter") as writer:
    res_df.to_excel(writer, sheet_name="Res", index=False)

print("Saved to samples_20_k10.xlsx")

Saved to samples_20_k10.xlsx


In [130]:
# Re-seed from OS entropy: the batches below are intentionally not reproducible.
np.random.seed(None)
random.seed(None)

# For each of 10 random batches of 1000 sentences, record the fraction whose
# cluster id is unchanged after the FGSM perturbation of the embedding.
accuracy = {}

for i in range(10):
    ls = np.random.choice(sent_list, size=1000, replace=False)
    embed_samples = model.encode(ls, convert_to_numpy=True)
    labels1 = k10.predict(embed_samples)

    # No text salt in this run: the same sentences are embedded and perturbed.
    salted_ls = ls
    salted_embed = model.encode(salted_ls, convert_to_tensor=True)
    salted_embed_mod = [
        fgsm_protect(vec.unsqueeze(0), text, loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
        for vec, text in zip(salted_embed, salted_ls)
    ]
    labels2 = k10.predict(np.array(salted_embed_mod))

    # Number of sentences that stay in the same cluster.
    count = sum(1 for before, after in zip(labels1, labels2) if before == after)

    accuracy[f'batch {i}'] = count / 1000

accuracy


{'batch 0': 0.386,
 'batch 1': 0.436,
 'batch 2': 0.415,
 'batch 3': 0.448,
 'batch 4': 0.425,
 'batch 5': 0.426,
 'batch 6': 0.437,
 'batch 7': 0.404,
 'batch 8': 0.445,
 'batch 9': 0.426}

In [131]:
import pickle

# Two-column table (batch label, accuracy) built from the dict's items.
_accuracy_table = pd.DataFrame(list(accuracy.items()))

# Keep the raw dict as a pickle next to the spreadsheet version.
with open(f'kmeans/{model_name}/{place}/10_clusters.pkl', 'wb') as f:
    pickle.dump(accuracy, f)

with pd.ExcelWriter(f"kmeans/{model_name}/{place}/10_clusters.xlsx", engine="xlsxwriter") as writer:
    _accuracy_table.to_excel(writer, sheet_name="Res", index=False)


In [132]:
# 20 clusters model
k20 = joblib.load(f'kmeans/{model_name}/20_clusters.joblib')

res_df = pd.DataFrame(columns=['sample', 'clusters_id1', 'salted sample', 'clusters_id2'])


"""predict 10 raw samples"""
embed_samples = model.encode(samples, convert_to_numpy=True)
labels1 = k20.predict(embed_samples)
res_df['sample'] = samples
res_df['clusters_id1'] = labels1
"""predict 10 salted samples"""
# salted_samples = [
#     s.strip() + ' ' + noise if s.strip().endswith('.') else (s.strip() + '. ' + noise) for s in samples
# ]
salted_samples = samples
embed_salted_samples = model.encode(salted_samples, convert_to_tensor=True)
# embed_salted_samples_mod = embed_salted_samples - noise_embedding
# embed_salted_samples_mod = Differential(embed_salted_samples, tensor=False)
# labels2 = k20.predict(embed_salted_samples_mod.astype(np.float32))
embed_salted_samples_mod = [
    fgsm_protect(embed_salted_samples[i].unsqueeze(0), salted_samples[i], loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
    for i in range(20)]
"""Due to floating point precision there will be some error and some operations may not be exactly 0"""
labels2 = k20.predict(np.array(embed_salted_samples_mod))
res_df['salted sample'] = salted_samples
res_df['clusters_id2'] = labels2
res_df

Unnamed: 0,sample,clusters_id1,salted sample,clusters_id2
0,imagine a marching bad stampede . now my knee ...,13,imagine a marching bad stampede . now my knee ...,13
1,"ahh i see , do what you love , i want to be a ...",19,"ahh i see , do what you love , i want to be a ...",19
2,oh okay . are you in college ?,18,oh okay . are you in college ?,4
3,you are not reading what i type,13,you are not reading what i type,2
4,"negative , no siblings . ma and pa died in a p...",16,"negative , no siblings . ma and pa died in a p...",4
5,i am fine thank you how is your day going ?,2,i am fine thank you how is your day going ?,3
6,sounds about right ! what tv shows do you like ?,5,sounds about right ! what tv shows do you like ?,6
7,"hi , want to be friends ?",8,"hi , want to be friends ?",8
8,hello . how are you doing today ?,3,hello . how are you doing today ?,3
9,i love dogs too ! so much .,15,i love dogs too ! so much .,15


In [133]:
# Store the per-sample cluster comparison for the 20-cluster model as a one-sheet workbook.
_k20_report = f"kmeans/{model_name}/{place}/samples_20_k20.xlsx"
with pd.ExcelWriter(_k20_report, engine="xlsxwriter") as writer:
    res_df.to_excel(writer, sheet_name="Res", index=False)

print("Saved to samples_20_k20.xlsx")

Saved to samples_20_k20.xlsx


In [134]:
# For each of 10 random batches of `num` sentences, record the fraction whose
# cluster id (20-cluster model) is unchanged after the FGSM perturbation.
accuracy = {}

num = 1000  # sentences per batch

for i in range(10):
  ls = np.random.choice(sent_list, size=num, replace=False)
  embed_samples = model.encode(ls, convert_to_numpy=True)
  labels1 = k20.predict(embed_samples)

  # No text salt in this run: the same sentences are embedded and perturbed.
  salted_ls = ls
  salted_embed = model.encode(salted_ls, convert_to_tensor=True)
  # Fixed: the per-sentence loop used a hard-coded range(1000) while every other
  # size in this cell follows `num`; changing `num` raised an IndexError (larger
  # values are not reached) or mis-sized labels2 (smaller values). Iterating over
  # the pairs keeps all sizes in sync.
  salted_embed_mod = [
      fgsm_protect(vec.unsqueeze(0), text, loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
      for vec, text in zip(salted_embed, salted_ls)
  ]
  labels2 = k20.predict(np.array(salted_embed_mod))

  # Number of sentences that stay in the same cluster.
  count = 0
  for j in range(num):
    if labels1[j] == labels2[j]: count += 1

  accuracy[f'batch {i}'] = count / num

display(accuracy)

{'batch 0': 0.349,
 'batch 1': 0.373,
 'batch 2': 0.382,
 'batch 3': 0.395,
 'batch 4': 0.336,
 'batch 5': 0.368,
 'batch 6': 0.373,
 'batch 7': 0.334,
 'batch 8': 0.335,
 'batch 9': 0.349}

In [135]:
import pickle

# Store the per-batch accuracy dict as a pickle ...
with open(f'kmeans/{model_name}/{place}/20_clusters.pkl', 'wb') as f:
    pickle.dump(accuracy, f)

# ... and as a spreadsheet with one (key, value) row per dict entry.
with pd.ExcelWriter(f"kmeans/{model_name}/{place}/20_clusters.xlsx", engine="xlsxwriter") as writer:
    pd.DataFrame(list(accuracy.items())).to_excel(writer, sheet_name="Res", index=False)

In [136]:
# 50 clusters model
k50 = joblib.load(f'kmeans/{model_name}/50_clusters.joblib')

# Cluster ids of the raw sample embeddings.
embed_samples = model.encode(samples, convert_to_numpy=True)
labels1 = k50.predict(embed_samples)

# Cluster ids after FGSM protection. The text itself is left unsalted in this variant,
# so `salted_samples` is the same list as `samples`.
salted_samples = samples
embed_salted_samples = model.encode(salted_samples, convert_to_tensor=True)
# Size the loop from the data instead of a literal 20 so it follows whatever `samples` holds.
embed_salted_samples_mod = [
    fgsm_protect(embed_salted_samples[i].unsqueeze(0), salted_samples[i], loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
    for i in range(len(salted_samples))]
labels2 = k50.predict(np.array(embed_salted_samples_mod))

# One row per sample: original text / cluster, protected text / cluster.
res_df = pd.DataFrame({
    'sample': samples,
    'clusters_id1': labels1,
    'salted sample': salted_samples,
    'clusters_id2': labels2,
})
res_df

Unnamed: 0,sample,clusters_id1,salted sample,clusters_id2
0,imagine a marching bad stampede . now my knee ...,29,imagine a marching bad stampede . now my knee ...,29
1,"ahh i see , do what you love , i want to be a ...",15,"ahh i see , do what you love , i want to be a ...",15
2,oh okay . are you in college ?,11,oh okay . are you in college ?,18
3,you are not reading what i type,34,you are not reading what i type,23
4,"negative , no siblings . ma and pa died in a p...",42,"negative , no siblings . ma and pa died in a p...",18
5,i am fine thank you how is your day going ?,23,i am fine thank you how is your day going ?,24
6,sounds about right ! what tv shows do you like ?,43,sounds about right ! what tv shows do you like ?,26
7,"hi , want to be friends ?",4,"hi , want to be friends ?",9
8,hello . how are you doing today ?,24,hello . how are you doing today ?,4
9,i love dogs too ! so much .,49,i love dogs too ! so much .,49


In [137]:
# Write the comparison table held in res_df to a single "Res" sheet (row index omitted).
with pd.ExcelWriter(f"kmeans/{model_name}/{place}/samples_20_k50.xlsx", engine="xlsxwriter") as writer:
    res_df.to_excel(writer, sheet_name="Res", index=False)

# Report the file name that is actually written (no "_short" file is produced here).
print("Saved to samples_20_k50.xlsx")

Saved to samples_20_k50_short.xlsx


In [138]:
"""Measures general accuracy of the kmeans averaged over 10 batches of 1000 samples"""
# For each of 10 batches: draw `num` sentences, assign clusters to the raw embeddings and
# to the FGSM-protected embeddings, and record the fraction that land in the same cluster.
accuracy = {}

num = 1000

for i in range(10):
  ls = np.random.choice(sent_list, size=num, replace=False)
  embed_samples = model.encode(ls, convert_to_numpy=True)
  labels1 = k50.predict(embed_samples)

  # No text salt is appended in this variant; protection happens in embedding space.
  # (Earlier alternatives -- appending `noise`, subtracting noise_embedding, Differential --
  # are not used here.)
  salted_ls = ls
  salted_embed = model.encode(salted_ls, convert_to_tensor=True)
  # Iterate over `num` rather than a literal 1000 so the batch size is set in one place,
  # and use an index name that does not reuse the batch counter `i`.
  salted_embed_mod = [
      fgsm_protect(salted_embed[k].unsqueeze(0), salted_ls[k], loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
      for k in range(num)
  ]
  labels2 = k50.predict(np.array(salted_embed_mod))

  # Number of positions where both predictions agree.
  count = int(np.sum(np.asarray(labels1) == np.asarray(labels2)))

  accuracy[f'batch {i}'] = count / num

accuracy


{'batch 0': 0.294,
 'batch 1': 0.298,
 'batch 2': 0.295,
 'batch 3': 0.286,
 'batch 4': 0.272,
 'batch 5': 0.297,
 'batch 6': 0.295,
 'batch 7': 0.332,
 'batch 8': 0.293,
 'batch 9': 0.296}

In [139]:
import pickle

# Store the per-batch accuracy dict as a pickle ...
with open(f'kmeans/{model_name}/{place}/50_clusters.pkl', 'wb') as f:
    pickle.dump(accuracy, f)

# ... and as a spreadsheet with one (key, value) row per dict entry.
with pd.ExcelWriter(f"kmeans/{model_name}/{place}/50_clusters.xlsx", engine="xlsxwriter") as writer:
    pd.DataFrame(list(accuracy.items())).to_excel(writer, sheet_name="Res", index=False)

In [140]:
# 100 clusters model
k100 = joblib.load(f'kmeans/{model_name}/100_clusters.joblib')

# Cluster ids of the raw sample embeddings.
embed_samples = model.encode(samples, convert_to_numpy=True)
labels1 = k100.predict(embed_samples)

# Cluster ids after FGSM protection. The text itself is left unsalted in this variant,
# so `salted_samples` is the same list as `samples`.
salted_samples = samples
embed_salted_samples = model.encode(salted_samples, convert_to_tensor=True)
# Size the loop from the data instead of a literal 20 so it follows whatever `samples` holds.
embed_salted_samples_mod = [
    fgsm_protect(embed_salted_samples[i].unsqueeze(0), salted_samples[i], loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
    for i in range(len(salted_samples))]
labels2 = k100.predict(np.array(embed_salted_samples_mod))

# One row per sample: original text / cluster, protected text / cluster.
res_df = pd.DataFrame({
    'sample': samples,
    'clusters_id1': labels1,
    'salted sample': salted_samples,
    'clusters_id2': labels2,
})
res_df

Unnamed: 0,sample,clusters_id1,salted sample,clusters_id2
0,imagine a marching bad stampede . now my knee ...,22,imagine a marching bad stampede . now my knee ...,42
1,"ahh i see , do what you love , i want to be a ...",65,"ahh i see , do what you love , i want to be a ...",86
2,oh okay . are you in college ?,60,oh okay . are you in college ?,60
3,you are not reading what i type,27,you are not reading what i type,28
4,"negative , no siblings . ma and pa died in a p...",50,"negative , no siblings . ma and pa died in a p...",16
5,i am fine thank you how is your day going ?,69,i am fine thank you how is your day going ?,72
6,sounds about right ! what tv shows do you like ?,6,sounds about right ! what tv shows do you like ?,11
7,"hi , want to be friends ?",20,"hi , want to be friends ?",53
8,hello . how are you doing today ?,48,hello . how are you doing today ?,38
9,i love dogs too ! so much .,31,i love dogs too ! so much .,22


In [141]:
# Write the comparison table held in res_df to a single "Res" sheet (row index omitted).
with pd.ExcelWriter(f"kmeans/{model_name}/{place}/samples_20_k100.xlsx", engine="xlsxwriter") as writer:
    res_df.to_excel(writer, sheet_name="Res", index=False)

# Report the file name that is actually written (no "_short" file is produced here).
print("Saved to samples_20_k100.xlsx")

Saved to samples_20_k100_short.xlsx


In [142]:
"""Measures general accuracy of the kmeans averaged over 10 batches of 1000 samples"""
# For each of 10 batches: draw `num` sentences, assign clusters to the raw embeddings and
# to the FGSM-protected embeddings, and record the fraction that land in the same cluster.
accuracy = {}

num = 1000

for i in range(10):
  ls = np.random.choice(sent_list, size=num, replace=False)
  embed_samples = model.encode(ls, convert_to_numpy=True)
  labels1 = k100.predict(embed_samples)

  # No text salt is appended in this variant; protection happens in embedding space.
  # (Earlier alternatives -- appending `noise`, subtracting noise_embedding, Differential --
  # are not used here.)
  salted_ls = ls
  salted_embed = model.encode(salted_ls, convert_to_tensor=True)
  # Iterate over `num` rather than a literal 1000 so the batch size is set in one place,
  # and use an index name that does not reuse the batch counter `i`.
  salted_embed_mod = [
      fgsm_protect(salted_embed[k].unsqueeze(0), salted_ls[k], loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
      for k in range(num)
  ]
  labels2 = k100.predict(np.array(salted_embed_mod))

  # Number of positions where both predictions agree.
  count = int(np.sum(np.asarray(labels1) == np.asarray(labels2)))

  accuracy[f'batch {i}'] = count / num

accuracy


{'batch 0': 0.248,
 'batch 1': 0.269,
 'batch 2': 0.272,
 'batch 3': 0.271,
 'batch 4': 0.267,
 'batch 5': 0.268,
 'batch 6': 0.244,
 'batch 7': 0.267,
 'batch 8': 0.264,
 'batch 9': 0.222}

In [143]:
import pickle

# Store the per-batch accuracy dict as a pickle ...
with open(f'kmeans/{model_name}/{place}/100_clusters.pkl', 'wb') as f:
    pickle.dump(accuracy, f)

# ... and as a spreadsheet with one (key, value) row per dict entry.
with pd.ExcelWriter(f"kmeans/{model_name}/{place}/100_clusters.xlsx", engine="xlsxwriter") as writer:
    pd.DataFrame(list(accuracy.items())).to_excel(writer, sheet_name="Res", index=False)

Initial impression on clustering: It's not very good

---
After adding some salt to the sentences, cluster-based embedding matching performance dropped remarkably.

1. Sentence-t5-base  
    - K-means with 10 clusters achieves the highest average score (over 1000 samples) of about 70% accuracy, while higher cluster counts like 20, 50 and 100 all achieve somewhere between 50% and 55% accuracy

2. e5-base-v2
    - K-means was applied with the same cluster counts (10, 20, 50, 100). On the same benchmark this encoder achieves its highest score of 55% accuracy with 10 clusters, and degrades gradually as the number of clusters increases: 20 (33%), 50 (30%), 100 (25%)

3. all-distilroberta-v1
    - Achieves its highest accuracy score of 63%, followed by 50 clusters (42%), 20 clusters (41%) and 100 clusters (37%)

Interestingly, the result doesn't improve as embedding quality improves (supposing that e5-base-v2 outputs the highest-quality embeddings); instead it appears to worsen when a newer and potentially better encoder is applied.

---
A `Hypothesis` could be that newer and better models capture the semantics in more detail, therefore the embeddings of the original and the salted versions also get pushed further apart. In other words, the embedding dimensions of modern models are more clearly separated.

We can reinforce this hypothesis with some observations. For a particular salt variant like "HCMUT":
- The salted embeddings for better models like `e5-base-v2` tend to move to a specific cluster -- this is more apparent when the number of clusters is large (this shows that the embedding model is sensitive enough to detect the semantic changes and adapt the embeddings)
- The older and dumber models like `all-distilroberta-v1` output a higher inertia value (because of messy embedding dimensions?). However, the embeddings of the raw and salted versions land in the same cluster more often (leading to higher accuracy). This accuracy is still worse than that of the `Sentence-t5-base` model though.

→ So better embedding models are not always better in our case for matching between raw and salted embeddings. Older embedding models see some performance boost, but not always.



---


### Similarity search
Performed on the same 3 models:
- `e5-base-v2`
- `all-distilroberta-v1`
- `Sentence-t5-base`

In [144]:
### Sentence-t5-base embeddings
from sentence_transformers import SentenceTransformer



In [145]:
!pip install faiss-cpu



In [146]:
import faiss
import numpy as np
from tqdm import tqdm
import random

In [147]:
import random


# Salt sentence appended to inputs; the commented lines are longer variants.
# noise = "HCMUT is a member of Vietnam National University, Ho Chi Minh City. It focuses on high technology and engineering."
noise = "HCMUT is a university"
# noise = "HCMUT is a university specializing in technology and engineering."

# Seed both RNGs so the draw below is repeatable.
np.random.seed(42)
random.seed(42)

# Data from the training sets -- sample 20 random ones
sampled = np.random.choice(sent_list, size=20, replace=False)  # sampled is an array of 20 items
samples = list(sampled)

In [None]:
samples

In [None]:
# Re-seed both RNGs without a fixed value so this preview draw is not tied to a constant seed.
np.random.seed(None)
random.seed(None)

texts = np.random.choice(sent_list, size = 2, replace=False)

# Append the salt to each sentence. Only a trailing "." suppresses the extra ". " separator,
# so a sentence ending in "?" becomes "...?. <salt>" (visible in this cell's output).
salted = [s.rstrip() + " " + noise if s.strip().endswith(".") else s.rstrip() + ". " + noise for s in texts]
salted

['cool . is that your favorite show ?. HCMUT is a university',
 'interesting do you like to travel ?. HCMUT is a university']

In [148]:
def unique(sent_list):
    """Return the items of ``sent_list`` with repeats dropped, keeping first-seen order."""
    # dict keys are unique and remember insertion order, which yields an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(sent_list))

sent_list_unique = unique(sent_list)
print("The list before deduplication:", len(sent_list), "\n\nThe list after deduplication:", len(sent_list_unique))
print("Total rows removed: ", len(sent_list) - len(sent_list_unique))

The list before deduplication: 132694 

The list after deduplication: 126012
Total rows removed:  6682


In [77]:
# Testing the sent_T5 performance
model = SentenceTransformer('sentence-transformers/sentence-t5-base', device='cuda')

# Load the cached corpus embeddings; the commented line below is the re-encode alternative.
embeddings_np = np.load('kmeans/Sent_T5/embeddings.npy')
# embeddings_np = model.encode(sent_list_unique, convert_to_numpy=True)
# noise_embedding = model.encode(noise, convert_to_numpy=True)

dim = embeddings_np.shape[1]
# NOTE(review): inner product equals cosine similarity only for unit-length vectors. No
# normalization is applied in this cell, so this assumes the stored embeddings are already
# L2-normalized -- confirm.
index = faiss.IndexFlatIP(dim)  # IP = inner product (cosine after L2 normalization)
index.add(embeddings_np)
print(f"Indexed {index.ntotal} vectors")

Indexed 126012 vectors


In [None]:
"""I've decided that the embeddings will now only contains unique rows"""
np.save('kmeans/Sent_T5/embeddings.npy', embeddings_np)

In [None]:
sent_noised = [s.rstrip() + " " + noise if s.strip().endswith(".") else s.rstrip() + ". " + noise for s in sent_list_unique]
# noised_embeddings = model.encode(sent_noised, convert_to_numpy=True)
# noised_embeddings = np.load('kmeans/Sent_T5/noised_embeddings.npy')
# noised_index = faiss.IndexFlatIP(noised_embeddings.shape[1])  # IP = inner product (cosine after L2 normalization)
# noised_index.add(noised_embeddings)
# print(f"Indexed {noised_index.ntotal} vectors")

In [None]:
"""I've decided that the embeddings will now only contains unique rows"""
np.save('kmeans/Sent_T5/noised_embeddings.npy', noised_embeddings)


In [150]:
def get_accuracy(index, model, tolerance = 1, test_size = 1000):
  """Fraction of protected embeddings whose source row is found in the top-`tolerance` hits.

  Draws `test_size` distinct positions from the global `sent_list_unique`, encodes those
  sentences with `model`, perturbs every embedding with `fgsm_protect` (using the global
  `loaded_attacker`, epsilon 0.1) and queries `index` for the `tolerance` nearest entries.
  A query counts as a hit when its own position is among the returned ids.

  Args:
    index: search index exposing `search(queries, k)`.
      Assumes row r of the index holds the embedding of sent_list_unique[r] -- TODO confirm.
    model: encoder exposing `encode(sentences, convert_to_tensor=True)`.
    tolerance: number of neighbours retrieved per query.
    test_size: number of sentences sampled without replacement.

  Returns:
    Number of hits divided by `test_size`, as a float.
  """
  N = len(sent_list_unique)
  idx = np.random.choice(N, size = test_size, replace = False)
  # The sentences are used as-is; no text salt is appended in this variant.
  sentences = [sent_list_unique[i] for i in idx]

  embeddings = model.encode(sentences, convert_to_tensor=True)
  embeddings_mod = [
      fgsm_protect(embeddings[i].unsqueeze(0), sentences[i], loaded_attacker, epsilon = 0.1).squeeze(0).cpu().numpy().astype(dtype=np.float32)
      for i in range(test_size)
  ]
  D, I = index.search(np.array(embeddings_mod), k = tolerance)

  # Row r is a hit when idx[r] appears anywhere among its `tolerance` returned ids.
  hits = (np.asarray(I) == idx[:, None]).any(axis=1)

  return int(hits.sum()) / test_size



# """This is for the comparison code, it compares the 2 sets, instead of just checking for the presence of 1 row index"""
# def get_accuracy(index, noised_index, model, sent_list, tolerance=1, test_size=1000, has_embed = False,
#                  OG_embeddings=None, target_embeddings=None):
#     global sent_noised
#     # global noise_embedding  #Ensures thus exists first

#     N = len(sent_list)
#     idx = np.random.choice(N, size=test_size, replace=False)

#     total_match_percentage = 0

#     if has_embed:
#       print("Loading the embeddings directly")
#       clean_embeddings = OG_embeddings[idx]
#       noised_embeddings = target_embeddings[idx]
#     else:
#       print("Encoding the sentences")
#       clean_embeddings = model.encode([sent_list[i] for i in idx])
#       noised_embeddings = model.encode([sent_noised[i] for i in idx])

#     D_clean, I_clean = index.search(clean_embeddings, k=tolerance)
#     D_noised, I_noised = noised_index.search(noised_embeddings, k=tolerance)  #note that the index should be for the embeddings we're using

#     for i in range(test_size):
#         retrieved_clean = set(I_clean[i])
#         retrieved_noised = set(I_noised[i])

#         intersection = len(retrieved_clean & retrieved_noised)
#         match_percentage = intersection / tolerance

#         total_match_percentage += match_percentage

#     # for sample_idx in idx:
#     #     # Get embeddings for both versions
#     #     if not has_embed:
#     #       clean_embedding = model.encode([sent_list[sample_idx]])
#     #       noised_embedding = model.encode([sent_noised[sample_idx]])
#     #     else:
#     #       clean_embedding = OG_embeddings[sample_idx].reshape(1,-1)
#     #       noised_embedding = target_embeddings[sample_idx].reshape(1,-1)
#     #     # Search in respective indices
#     #     D_clean, I_clean = index.search(clean_embedding, k=tolerance)
#     #     D_noised, I_noised = noised_index.search(noised_embedding, k=tolerance)

#     #     # Convert to sets for comparison
#     #     retrieved_clean = set(I_clean[0])
#     #     retrieved_noised = set(I_noised[0])

#     #     # Calculate intersection percentage
#     #     intersection = len(retrieved_clean & retrieved_noised)
#     #     match_percentage = intersection / tolerance

#     #     total_match_percentage += match_percentage

#     # Return average match percentage
#     return total_match_percentage / test_size

In [None]:
"""Check if the noised embeddings we loaded uses the same noise as we're using right now"""
# Compare with the absolute difference: the signed form `a - b < eps` is also True for
# arbitrarily large negative differences, so it could report a match for different vectors.
np.all(np.abs(noised_embeddings[0] - model.encode(sent_noised[0])) < 0.0000001)

In [None]:
# Print every sampled sentence whose top-1 hit in `index` (clean embedding) differs from the
# top-1 hit of its de-salted embedding in `desalted_index`.
# NOTE(review): in the visible part of the notebook, noise_embedding and desalted_index are
# assigned in the cell that follows this one; presumably that cell has to run first -- confirm.
indexs = np.random.choice(len(sent_list_unique), 1000, replace=False)
for idx in indexs:
    clean = sent_list_unique[idx]
    noised = sent_noised[idx]
    clean_embed = model.encode([clean], convert_to_numpy=True)
    # Subtract the salt's embedding, then re-normalize the result in place before searching.
    noised_embed = model.encode([noised], convert_to_numpy=True) - noise_embedding
    faiss.normalize_L2(noised_embed)
    D, I = index.search(clean_embed, k=1)
    D_noised, I_noised = desalted_index.search(noised_embed, k=1)
    if I[0][0] != I_noised[0][0]:
        print(f"Index: {idx}")
        print(f"Clean: {clean} with index {I[0][0]}")
        print(f"Noised: {noised} with index {I_noised[0][0]}")

In [None]:
# Embed the salt on its own and remove it from every salted embedding (row-wise broadcast).
noise_embedding = model.encode(noise)
desalted_embeddings = np.asarray(noised_embeddings) - noise_embedding

# faiss.normalize_L2 rewrites its argument, so index a normalized copy and keep the raw
# differences in desalted_embeddings.
vectors_for_index = desalted_embeddings.copy()
faiss.normalize_L2(vectors_for_index)

desalted_index = faiss.IndexFlatIP(vectors_for_index.shape[1])
desalted_index.add(vectors_for_index)
print(f"Indexed {desalted_index.ntotal} vectors")

In [None]:
desalted_index.search(model.encode([sent_noised[1]]) - noise_embedding, k=1)

In [None]:
"""I've decided that the embeddings will now only contains unique rows"""
np.save('kmeans/Sent_T5/desalted_embeddings.npy', desalted_embeddings)

In [None]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/175.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.9


In [80]:
# Top-k cut-offs to evaluate and the sub-folder the results are written to.
tolerance = [1, 3, 5]
path = 'FGSM'

# One column per cut-off, one row per evaluation round.
df_save = pd.DataFrame()

for i in tolerance:
  print(f"Tolerance: {i}")
  # Ten independent rounds of get_accuracy at this cut-off.
  result = [get_accuracy(index, model, i) for _ in range(10)]
  df_save[i] = result


with pd.ExcelWriter(f"similarity/{path}/sent-t5-base.xlsx", engine="xlsxwriter") as writer:
    df_save.to_excel(writer, sheet_name="Res", index=False)

print("Saved as an .xlsx file")

Tolerance: 1
Tolerance: 3
Tolerance: 5
Saved as an .xlsx file


In [112]:
# Second encoder: load it together with its cached corpus embeddings and build a flat
# inner-product index over them.
model1 = SentenceTransformer('all-distilroberta-v1', device='cuda')
embeddings_np1 = np.load('kmeans/Distilroberta/embeddings.npy')
# embeddings_np1 = model1.encode(sent_list_unique, convert_to_numpy=True)

dim1 = embeddings_np1.shape[1]
index1 = faiss.IndexFlatIP(dim1)  # IP = inner product (cosine after L2 normalization)
index1.add(embeddings_np1)
print(f"Indexed {index1.ntotal} vectors")

# Rebinds the shared name noise_embedding to this encoder's embedding of the salt.
noise_embedding = model1.encode(noise, convert_to_numpy=True)

Indexed 126012 vectors


In [None]:
np.save('kmeans/Distilroberta/embeddings.npy', embeddings_np1)

In [None]:
# noised_embeddings1 = model1.encode(sent_noised, convert_to_numpy=True)
noised_embeddings1 = np.load('kmeans/Distilroberta/noised_embeddings.npy')
noised_index1 = faiss.IndexFlatIP(noised_embeddings1.shape[1])  # IP = inner product (cosine after L2 normalization)
noised_index1.add(noised_embeddings1)
print(f"Indexed {noised_index1.ntotal} vectors")

In [None]:
np.save('kmeans/Distilroberta/noised_embeddings.npy', noised_embeddings1)

In [None]:
# Remove the salt's own embedding from every salted embedding (row-wise broadcast).
desalted_embeddings1 = np.asarray(noised_embeddings1) - noise_embedding

# faiss.normalize_L2 rewrites its argument, so index a normalized copy and keep the raw
# differences in desalted_embeddings1.
vectors_for_index1 = desalted_embeddings1.copy()
faiss.normalize_L2(vectors_for_index1)

desalted_index1 = faiss.IndexFlatIP(vectors_for_index1.shape[1])
desalted_index1.add(vectors_for_index1)
print(f"Indexed {desalted_index1.ntotal} vectors")

In [116]:
# One column per top-k cut-off, one row per evaluation round (10 rounds each).
result1 = []

df_save1 = pd.DataFrame()

for i in tolerance:
  print(f"Tolerance: {i}")
  result1 = []
  for j in range(10):
    res = get_accuracy(index1, model1, i)
    result1.append(res)
  df_save1[i] = result1


with pd.ExcelWriter(f"similarity/{path}/distilroberta.xlsx", engine="xlsxwriter") as writer:
    df_save1.to_excel(writer, sheet_name="Res", index=False)

# Name the file that was actually written (the old message referred to accuracy.xlsx).
print(f"Saved to similarity/{path}/distilroberta.xlsx")

Tolerance: 1
Tolerance: 3
Tolerance: 5
Saved to accuracy.xlsx


In [149]:
# Third encoder: load it together with its cached corpus embeddings and build a flat
# inner-product index over them.
model2 = SentenceTransformer('intfloat/e5-base-v2', device='cuda')
embeddings_np2 = np.load('kmeans/e5-base-v2/embeddings.npy')
# embeddings_np2 = model2.encode(sent_list_unique, convert_to_numpy=True)
dim2 = embeddings_np2.shape[1]
index2 = faiss.IndexFlatIP(dim2)  # IP = inner product (cosine after L2 normalization)
index2.add(embeddings_np2)
print(f"Indexed {index2.ntotal} vectors")

# Rebinds the shared name noise_embedding to this encoder's embedding of the salt.
noise_embedding = model2.encode(noise, convert_to_numpy=True)

Indexed 126012 vectors


In [None]:
np.save('kmeans/e5-base-v2/embeddings.npy', embeddings_np2)

In [None]:
# noised_embeddings2 = model2.encode(sent_noised, convert_to_numpy=True)
noised_embeddings2 = np.load('kmeans/e5-base-v2/noised_embeddings.npy')
noised_index2 = faiss.IndexFlatIP(noised_embeddings2.shape[1])
noised_index2.add(noised_embeddings2)
print(f"Indexed {noised_index2.ntotal} vectors")

In [None]:
np.save('kmeans/e5-base-v2/noised_embeddings.npy', noised_embeddings2)

In [None]:
# Remove the salt's own embedding from every salted embedding (row-wise broadcast).
desalted_embeddings2 = np.asarray(noised_embeddings2) - noise_embedding

# faiss.normalize_L2 rewrites its argument, so index a normalized copy and keep the raw
# differences in desalted_embeddings2.
vectors_for_index2 = desalted_embeddings2.copy()
faiss.normalize_L2(vectors_for_index2)

desalted_index2 = faiss.IndexFlatIP(vectors_for_index2.shape[1])
desalted_index2.add(vectors_for_index2)
print(f"Indexed {desalted_index2.ntotal} vectors")

In [151]:
# One column per top-k cut-off, one row per evaluation round (10 rounds each).
# `result2` keeps the rounds of the last cut-off after the loop finishes.
result2 = []

df_save2 = pd.DataFrame()

for i in tolerance:
  print(f"Tolerance: {i}")
  result2 = []
  for j in range(10):
    res = get_accuracy(index2, model2, i)
    result2.append(res)
  df_save2[i] = result2


with pd.ExcelWriter(f"similarity/{path}/e5-base-v2.xlsx", engine="xlsxwriter") as writer:
    df_save2.to_excel(writer, sheet_name="Res", index=False)

# Name the file that was actually written (the old message referred to accuracy.xlsx).
print(f"Saved to similarity/{path}/e5-base-v2.xlsx")

Tolerance: 1
Tolerance: 3
Tolerance: 5
Saved to accuracy.xlsx


In [None]:
result2

### Similarity Search

Initial impression: Both Good and Bad


We will test the embeddings of the same 3 models and check for the most similar embeddings. We will perform this with 3 tolerance levels: 1, 3, 5 (meaning that if the raw embedding is in the top 1, 3 or 5 results it counts as a correct prediction) and calculate the final accuracy

---
The results show `sentence-t5-base` being the best performer, followed by `e5-base-v2` and `all-distilroberta-v1`.  

1. Sentence-t5-base
    - This is the best performing model, achieving an average accuracy of 88% with tolerance 1, and up to 92.5% with tolerance 5
2. all-distilroberta-v1
    - This is the worst performing model. Achieving only 44% with tolerance 1, and a maximum accuracy of 60% at tolerance 5.
3. e5-base-v2
    - Comes in second place, for each tolerance 1 (69%), 3 (74%) and 5 (77%)

---

This is interesting because the performance order is different from clustering-based embedding matching -- where `e5-base-v2` is the worst performer. Here `all-distilroberta-v1` is the worst. This supports the idea that its embeddings are more spread out -- which leads to a better cluster-matching score and a worse similarity score; for smarter models the embeddings are better organized, with less spread within clusters (this can help explain why close embeddings still get assigned to different clusters in the first experiment)


### Other methods
- Try training on a shorter salt variant "HCMUT is a university"
- Perform clustering/similarity on salted embedding vector - salt vector
- Test the reverse attack on this new short embeddings

### Using the salted embeddings - salt embeddings
- Try to perform and evaluate clustering matching results
- Similarity search results

### SANDBOX

In [None]:
# Sandbox: sample one continuation of a fixed prompt from a pretrained causal LM.
# NOTE(review): this rebinds the notebook-level names `model_name` and `model`, which other
# cells use for output paths and for `model.encode(...)`; re-running those cells after this
# one will pick up these values -- confirm that is intended.
model_name = "microsoft/DialoGPT-large"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model (pretrained)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.eval()

# Some causal LM tokenizers lack a pad token — set it to eos to avoid warnings in generate()
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Single-turn test prompt
prompt = "Hcmut is a university in"

# Encode
inputs = tokenizer(prompt, return_tensors="pt").to(device)  # input_ids has a leading batch dimension; dimension 1 is the prompt length (used below)

# Generate response
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_length=inputs["input_ids"].shape[1] + 50,  # prompt length + up to 50 tokens
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
    )

# Decode and print
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("=== Full decoded output (prompt + response) ===")
print(generated_text)

# If you want only the model's reply (strip the prompt prefix):
if generated_text.startswith(prompt):
    reply = generated_text[len(prompt):].strip()
else:
    # Decoded text does not start with the prompt verbatim; fall back to the full text.
    reply = generated_text
print("\n=== Model reply ===")
print(reply)

In [None]:
import sentence_transformers

# sen1 is sen2's sentence followed by the long salt text; embed both for comparison.
sen1 = "but i do take fridays off , when the weather gets warm . aka summer . HCMUT is a member of Vietnam National University, Ho Chi Minh City. It focuses on high technology and engineering."
sen2 = "but i do take fridays off , when the weather gets warm . aka summer."


# NOTE(review): `model` must be a sentence encoder here. The preceding sandbox cell binds
# `model` to a causal LM, which presumably has no `.encode` -- confirm which cell ran last.
embed1 = model.encode(sen1, convert_to_tensor=True)
embed2 = model.encode(sen2, convert_to_tensor=True)

# print(sentence_transformers.util.cos_sim(embed1, embed2))
# torch.save(embed1, 'embed1.pt')
# torch.save(embed2, 'embed2.pt')

In [None]:
embed1, embed2

In [None]:
# Collect the embedding components as plain Python floats. Iterating a tensor yields 0-d
# tensors, which hash by object identity, so a set of them neither de-duplicates nor
# matches by value (a subset test between two such sets can never succeed).
embed1_set = set(embed1.tolist())
embed2_set = set(embed2.tolist())

In [None]:
print("Strict subset: ", embed2_set.issubset(embed1_set))

def subset_error(subset1, subset2, error = 0):
  """Share of values in ``subset1`` that lie within ``error`` of some value in ``subset2``.

  Each value of ``subset1`` is counted at most once. Division by ``len(subset1)`` means an
  empty ``subset1`` raises ZeroDivisionError.
  """
  matched = sum(
      1
      for a in subset1
      if any(abs(a - b) <= error for b in subset2)
  )

  return matched / len(subset1)

for i in np.arange(0.0001, 0.0011, 0.0002):
    percent = subset_error(embed2_set, embed1_set, i) * 100
    print(f"Error margin {i:.4f}: {percent}%")

In [None]:
from sentence_transformers import SentenceTransformer

"""We will load the model with one that handles vietnamese"""
name = "intfloat/E5-Large-V2"
model = SentenceTransformer(name, device='cuda')

In [None]:
model

In [None]:
tokenizer = model.tokenizer

In [None]:
sen1 = "but i do take fridays off , when the weather gets warm . aka summer . Trường đại học Bách Khoa Hồ Chí Minh là thành viên Đại học Quốc Gia Thành Phố Hồ Chí Minh"
sen2 = "but i do take fridays off , when the weather gets warm . aka summer ."

In [None]:
embed1 = model.encode(sen1, convert_to_tensor=True)
embed2 = model.encode(sen2, convert_to_tensor=True)

In [None]:
from transformers import AutoModelForSequenceClassification
# Rebinds `model` to a sequence-classification model loaded from the Hub.
# NOTE(review): trust_remote_code=True executes Python code shipped with the model
# repository -- only keep it for sources you trust.
model = AutoModelForSequenceClassification.from_pretrained(
  "togethercomputer/m2-bert-80M-32k-retrieval",
  trust_remote_code=True
)

# Load the tokenizer separately

# model.model_max_length

In [None]:
model