In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, TrainerCallback
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch
import pandas as pd

train_file_path = 'Train.csv'
data = pd.read_csv(train_file_path)

data = data.sample(n=5000, random_state=42)

train_val_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.1111, random_state=42)

device = torch.device("cuda")

class NLtoDSLDataSet(Dataset):
    def __init__(self, tokenizer, data, max_length=128):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_text = item['input']
        target_text = item['output']
        
        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        target_encoding = self.tokenizer(target_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        
        inputs = {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': target_encoding['input_ids'].flatten()
        }
        
        return inputs

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

train_dataset = NLtoDSLDataSet(tokenizer, train_data)
val_dataset = NLtoDSLDataSet(tokenizer, val_data)

# Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    do_train=True,
    evaluation_strategy="no",
    load_best_model_at_end=True,
    logging_steps=10,
    save_strategy="no"
) 

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/750 [00:00<?, ?it/s]

{'loss': 12.9827, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.04}
{'loss': 12.7633, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.08}
{'loss': 12.607, 'learning_rate': 3e-06, 'epoch': 0.12}
{'loss': 12.4995, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.16}
{'loss': 12.1786, 'learning_rate': 5e-06, 'epoch': 0.2}
{'loss': 11.6315, 'learning_rate': 6e-06, 'epoch': 0.24}
{'loss': 10.9335, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.28}
{'loss': 10.2722, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.32}
{'loss': 9.4398, 'learning_rate': 9e-06, 'epoch': 0.36}
{'loss': 8.6201, 'learning_rate': 1e-05, 'epoch': 0.4}
{'loss': 7.3362, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.44}
{'loss': 6.4139, 'learning_rate': 1.2e-05, 'epoch': 0.48}
{'loss': 5.4191, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.52}
{'loss': 4.0274, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.56}
{'loss': 3.4115, 'learning_rate': 1.5e-05, 'epoch': 0.6}
{'loss': 2.8153, 'l

TrainOutput(global_step=750, training_loss=2.1316490769584973, metrics={'train_runtime': 540.8703, 'train_samples_per_second': 22.186, 'train_steps_per_second': 1.387, 'train_loss': 2.1316490769584973, 'epoch': 3.0})

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def select_top_k_samples(input_text, train_data, k=500, max_length=128):
    input_vec = tokenizer(input_text, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to(device)
    similarities = []
    for idx, row in train_data.iterrows():
        train_vec = tokenizer(row['input'], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to(device)
        sim = cosine_similarity(input_vec.cpu().numpy(), train_vec.cpu().numpy())
        similarities.append((sim, row['output']))
    
    similarities.sort(key=lambda x: x[0], reverse=True)
    top_k_samples = [sample for _, sample in similarities[:k]]
    return top_k_samples

def icl_inference(test_data, max_length=128, max_new_tokens=50):
    results = {"exact_match_rate": 0}
    
    for _, row in test_data.iterrows():
        input_text = row['input']
        selected_samples = select_top_k_samples(input_text, train_data, max_length=max_length)
        
        context = " ".join(selected_samples)
        inputs = tokenizer(context + " " + input_text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt").input_ids.to(device)
        
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens)
        generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        if generated_output == row['output']:
            results["exact_match_rate"] += 1
    
    results["exact_match_rate"] /= len(test_data)
    return results


icl_results = icl_inference(test_data)
print("ICL Strategy Results:", icl_results)

ICL Strategy Results: {'exact_match_rate': 0.02}


In [10]:
# Evaluate on test set using standard inference
def standard_inference(test_data):
    results = {"exact_match_rate": 0}
    
    for _, row in test_data.iterrows():
        input_text = row['input']
        
        inputs = tokenizer(input_text, return_tensors="pt").input_ids
        outputs = model.generate(inputs)
        generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Compare generated output with expected output
        if generated_output == row['output']:
            results["exact_match_rate"] += 1
    
    results["exact_match_rate"] /= len(test_data)
    return results

# Evaluate
standard_results = standard_inference(test_data)
print("Standard Inference Results:", standard_results)

Standard Inference Results: {'exact_match_rate': 0.00}


In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

sample_data = [
    {"input": "Allocate 50% to large-cap stocks."},
    {"input": "Assign 30% to small-cap stocks."},
    {"input": "Set 20% in bonds."},
    {"input": "Increase large-cap stocks by 10%."},
    {"input": "Move 15% to real estate investments."}
]

def calculate_similarity(input_text, sample_text, tokenizer):
    input_vec = tokenizer(input_text, return_tensors="np", padding="max_length", truncation=True, max_length=128)["input_ids"]
    sample_vec = tokenizer(sample_text, return_tensors="np", padding="max_length", truncation=True, max_length=128)["input_ids"]
    return cosine_similarity(input_vec, sample_vec)[0][0]

def mmr(input_text, candidate_samples, tokenizer, lambda_param=0.5, k=3):
    selected_samples = []
    candidate_scores = {}


    for sample in candidate_samples:
        sim_score = calculate_similarity(input_text, sample["input"], tokenizer)
        candidate_scores[sample["input"]] = sim_score

    while len(selected_samples) < k and candidate_scores:

        best_candidate = max(candidate_scores, key=candidate_scores.get)
        selected_samples.append(best_candidate)
        del candidate_scores[best_candidate]

        for candidate in candidate_scores:
            candidate_scores[candidate] = lambda_param * calculate_similarity(input_text, candidate, tokenizer) - \
                                          (1 - lambda_param) * max(calculate_similarity(candidate, s, tokenizer) for s in selected_samples)

    return selected_samples


def dpp(candidate_samples, tokenizer, k=3):
    selected_samples = []
    similarity_matrix = np.zeros((len(candidate_samples), len(candidate_samples)))

    for i, sample_i in enumerate(candidate_samples):
        for j, sample_j in enumerate(candidate_samples):
            similarity_matrix[i, j] = calculate_similarity(sample_i["input"], sample_j["input"], tokenizer)

    while len(selected_samples) < k:
        if not selected_samples:
            selected_samples.append(candidate_samples[np.argmax(np.diag(similarity_matrix))]["input"])
        else:
            scores = []
            for i, sample in enumerate(candidate_samples):
                if sample["input"] not in selected_samples:
                    selected = selected_samples + [sample["input"]]
                    indices = [candidate_samples.index({"input": s}) for s in selected]
                    sub_matrix = similarity_matrix[np.ix_(indices, indices)]
                    scores.append((sample["input"], np.linalg.det(sub_matrix)))

            best_candidate = max(scores, key=lambda x: x[1])[0]
            selected_samples.append(best_candidate)

    return selected_samples

input_text = "Allocate more to large-cap stocks."
selected_samples_mmr = mmr(input_text, sample_data, tokenizer, lambda_param=0.5, k=3)

print("Selected Samples using MMR:")
for sample in selected_samples_mmr:
    print(sample)


selected_samples_dpp = dpp(sample_data, tokenizer, k=3)

print("Selected Samples using DPP:")
for sample in selected_samples_dpp:
    print(sample)

print_similarity_matrix(sample_data, tokenizer)

assert selected_samples_mmr[0] == "Allocate 50% to large-cap stocks.", "MMR Test Failed!"
assert selected_samples_dpp[0] != selected_samples_dpp[1], "DPP Test Failed!"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Selected Samples using MMR:
Allocate 50% to large-cap stocks.
Assign 30% to small-cap stocks.
Increase large-cap stocks by 10%.
Selected Samples using DPP:
Set 20% in bonds.
Assign 30% to small-cap stocks.
Increase large-cap stocks by 10%.
Similarity Matrix:
[[1.         0.52530374 0.33548247 0.09345357 0.04205372]
 [0.52530374 1.         0.20097644 0.10252896 0.26431731]
 [0.33548247 0.20097644 1.         0.31283241 0.44881165]
 [0.09345357 0.10252896 0.31283241 1.         0.64050891]
 [0.04205372 0.26431731 0.44881165 0.64050891 1.        ]]


In [3]:
import pandas as pd
from transformers import T5Tokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 初始化tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# 加载数据
data = pd.read_csv('Train.csv')

# 仅使用前100条数据进行测试
test_data = data.head(100)

# 计算相似度函数
def calculate_similarity(input_text, sample_text, tokenizer):
    input_vec = tokenizer(input_text, return_tensors="np", padding="max_length", truncation=True, max_length=128)["input_ids"]
    sample_vec = tokenizer(sample_text, return_tensors="np", padding="max_length", truncation=True, max_length=128)["input_ids"]
    return cosine_similarity(input_vec, sample_vec)[0][0]

# MMR算法
def mmr(input_text, candidate_samples, tokenizer, lambda_param=0.5, k=3):
    selected_samples = []
    candidate_scores = {}

    for sample in candidate_samples:
        sim_score = calculate_similarity(input_text, sample, tokenizer)
        candidate_scores[sample] = sim_score

    while len(selected_samples) < k and candidate_scores:
        best_candidate = max(candidate_scores, key=candidate_scores.get)
        selected_samples.append(best_candidate)
        del candidate_scores[best_candidate]

        for candidate in candidate_scores:
            candidate_scores[candidate] = lambda_param * calculate_similarity(input_text, candidate, tokenizer) - \
                                          (1 - lambda_param) * max(calculate_similarity(candidate, s, tokenizer) for s in selected_samples)

    return selected_samples

# DPP算法
def dpp(candidate_samples, tokenizer, k=3):
    selected_samples = []
    similarity_matrix = np.zeros((len(candidate_samples), len(candidate_samples)))

    for i, sample_i in enumerate(candidate_samples):
        for j, sample_j in enumerate(candidate_samples):
            similarity_matrix[i, j] = calculate_similarity(sample_i, sample_j, tokenizer)

    while len(selected_samples) < k:
        if not selected_samples:
            selected_samples.append(candidate_samples[np.argmax(np.diag(similarity_matrix))])
        else:
            scores = []
            for i, sample in enumerate(candidate_samples):
                if sample not in selected_samples:
                    selected = selected_samples + [sample]
                    indices = [candidate_samples.index(s) for s in selected]
                    sub_matrix = similarity_matrix[np.ix_(indices, indices)]
                    scores.append((sample, np.linalg.det(sub_matrix)))

            best_candidate = max(scores, key=lambda x: x[1])[0]
            selected_samples.append(best_candidate)

    return selected_samples

# 消融性测试
def ablation_test(test_data, tokenizer, mmr_lambda=0.5, k=3):
    results = []

    for idx, row in test_data.iterrows():
        input_text = row['input']
        candidate_samples = test_data['input'].tolist()

        # 随机选择样本 (Baseline)
        random_samples = np.random.choice(candidate_samples, k, replace=False)

        # MMR选择样本
        mmr_samples = mmr(input_text, candidate_samples, tokenizer, lambda_param=mmr_lambda, k=k)

        # DPP选择样本
        dpp_samples = dpp(candidate_samples, tokenizer, k=k)

        # 记录结果
        results.append({
            'input': input_text,
            'random_samples': random_samples,
            'mmr_samples': mmr_samples,
            'dpp_samples': dpp_samples
        })

    return results

# 执行消融性测试
ablation_results = ablation_test(test_data, tokenizer, mmr_lambda=0.5, k=3)

# 打印部分测试结果
for result in ablation_results[:5]:
    print(f"Input: {result['input']}")
    print(f"Random Samples: {result['random_samples']}")
    print(f"MMR Samples: {result['mmr_samples']}")
    print(f"DPP Samples: {result['dpp_samples']}")
    print("-" * 40)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input: Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds
Random Samples: ['Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds'
 'Please set small-size stocks by 84 proportion in ValueInvest.'
 'Could you alter 23 proportion to fixed-income securities in HighYieldBonds?']
MMR Samples: ['Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds', 'Please alter minor stocks by 6 percent in myPortfolio.', 'assigning GreenEnergyInvest to include 66% more big-cap stocks.']
DPP Samples: ['Please alter minor stocks by 6 percent in myPortfolio.', "Is it possible to designate AlphaFund's fixed-income securities allocation by 23 percentage?", 'Could you set 74% to bonds in CryptoAssets?']
----------------------------------------
Input: Please alter minor stocks by 6 percent in myPortfolio.
Random Samples: ["I'm considering adjusting VentureCapital with an additional 66 percentage of minor stocks."
 "Let's alter 62 percent more of m

In [1]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothie = SmoothingFunction().method4
    score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
    return score

input_texts = [
    "Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds",
    "Please alter minor stocks by 6 percent in myPortfolio.",
    "We're moving to amend 11% more to fixed-income securities in BlueChipStocks, correct?",
    "assigning GreenEnergyInvest to include 66% more big-cap stocks.",
    "Could you alter 23 proportion to fixed-income securities in HighYieldBonds?"
]

ground_truths = [
    "SET ETF HighYieldBonds WITH SMALL_CAP_STOCKS = 33%",
    "UPDATE ETF myPortfolio WITH SMALL_CAP_STOCKS = 6%",
    "UPDATE ETF BlueChipStocks WITH BONDS = 11%",
    "SET ETF GreenEnergyInvest WITH LARGE_CAP_STOCKS = 66%",
    "UPDATE ETF HighYieldBonds WITH BONDS = 23%"
]

random_samples = [
    ["Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds", "Please set small-size stocks by 84 proportion in ValueInvest.", "Could you alter 23 proportion to fixed-income securities in HighYieldBonds?"],
    ["I'm considering adjusting VentureCapital with an additional 66 percentage of minor stocks.", "Let's alter 62 percent more of mid-cap stocks to RealEstateHoldings's portfolio.", "assign PreciousMetalsFund by adding 4 percent small-size stocks, please."],
    ["Please assign bonds by 57 percent in VentureCapital.", "Please change small-size stocks by 57% in RealEstateHoldings.", "Thinking of altering 14% debt instruments into TechGrowth"],
    ["set 63% to minor stocks in ValueInvest", "Let's assign 43 percentage more of bonds to IncomeFund2024's portfolio.", "Thinking of changeing 20 percent major stocks into GlobalEquityFund"],
    ["allocate PreciousMetalsFund by adding 14% fixed-income securities, please.", "Is it possible to amend VentureCapital's large-cap stocks allocation by 53 proportion?", "designate IncomeFund2024 by adding 76 proportion mid-size stocks, please."]
]

mmr_samples = [
    ["Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds", "Please alter minor stocks by 6 percent in myPortfolio.", "assigning GreenEnergyInvest to include 66% more big-cap stocks."],
    ["Please alter minor stocks by 6 percent in myPortfolio.", "Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds", "We're moving to amend 11% more to fixed-income securities in BlueChipStocks, correct?"],
    ["We're moving to amend 11% more to fixed-income securities in BlueChipStocks, correct?", "Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds", "Let's assign 58 percentage more of debt instruments to TechGrowth's portfolio."],
    ["assigning GreenEnergyInvest to include 66% more big-cap stocks.", "Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds", "We're moving to amend 11% more to fixed-income securities in BlueChipStocks, correct?"],
    ["Could you alter 23 proportion to fixed-income securities in HighYieldBonds?", "Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds", "Please alter minor stocks by 6 percent in myPortfolio."]
]

dpp_samples = [
    ["Please alter minor stocks by 6 percent in myPortfolio.", "Is it possible to designate AlphaFund's fixed-income securities allocation by 23 percentage?", "Could you set 74% to bonds in CryptoAssets?"],
    ["Please alter minor stocks by 6 percent in myPortfolio.", "Is it possible to designate AlphaFund's fixed-income securities allocation by 23 percentage?", "Could you set 74% to bonds in CryptoAssets?"],
    ["Please alter minor stocks by 6 percent in myPortfolio.", "Is it possible to designate AlphaFund's fixed-income securities allocation by 23 percentage?", "Could you set 74% to bonds in CryptoAssets?"],
    ["Please alter minor stocks by 6 percent in myPortfolio.", "Is it possible to designate AlphaFund's fixed-income securities allocation by 23 percentage?", "Could you set 74% to bonds in CryptoAssets?"],
    ["Please alter minor stocks by 6 percent in myPortfolio.", "Is it possible to designate AlphaFund's fixed-income securities allocation by 23 percentage?", "Could you set 74% to bonds in CryptoAssets?"]
]

def calculate_average_bleu(ground_truth, samples):
    scores = [calculate_bleu(ground_truth, sample) for sample in samples]
    return np.mean(scores)

random_bleu_scores = [calculate_average_bleu(gt, rs) for gt, rs in zip(ground_truths, random_samples)]
mmr_bleu_scores = [calculate_average_bleu(gt, ms) for gt, ms in zip(ground_truths, mmr_samples)]
dpp_bleu_scores = [calculate_average_bleu(gt, ds) for gt, ds in zip(ground_truths, dpp_samples)]

print(f"Random Samples Average BLEU: {np.mean(random_bleu_scores):.4f}")
print(f"MMR Samples Average BLEU: {np.mean(mmr_bleu_scores):.4f}")
print(f"DPP Samples Average BLEU: {np.mean(dpp_bleu_scores):.4f}")


Random Samples Average BLEU: 0.0017
MMR Samples Average BLEU: 0.0070
DPP Samples Average BLEU: 0.0000
