# Experiment 2
#### Goal
Using the training results of the three models, find the most suitable model for each data sample.
#### Method & design idea
1. Label preparation: use soft labels to combine the results of the three models into a four-element label. The meaning of each position is shown below (hard labels are used first for ease of understanding).
- [1, 0, 0, 0] : Select hyena for the classification task
- [0, 1, 0, 0] : select ntv2 for classification tasks
- [0, 0, 1, 0] : Select cdgpt to perform the classification task
- [0, 0, 0, 1] : No model is correct, so by default the hyena prediction is used and inverted
2. Label processing: because soft labels are used, values such as [0.48, 0.11, 0.32, 0.09] will appear. The specific processing logic is as follows
- Are all models wrong? If yes, the label is directly [0, 0, 0, 1]; if not, count how many models are correct
- Only one model is correct: a one-hot label at that model's position, e.g. [1, 0, 0, 0] (only the first three positions are considered),
- **2 ~ 3 models are correct**:
- Take the loss of the correct model (e.g. [0, 0.12, 0.35, 0])
- Convert each loss into a credibility score, 1 - loss / (sum of the correct models' losses), and normalize the scores so that they sum to 1, so that the model with the smaller loss gets the greater degree of trust (here transformed into approximately [0, 0.74, 0.26, 0]). The result is assigned to 'credibility', which is equivalent to telling the selector model that choosing either of the correct models (here the second or the third) is not wrong, and this makes it easier for the model to partition the high-dimensional space
3. Data examples
- [0, 1, 0, 0] Only the ntv2 model can be trusted. It is wrong to choose any other model
- [0.72, 0.28, 0, 0] Either hyena or ntv2 can be selected, but hyena is preferable
- [0.28, 0.21, 0.51, 0] You can choose any of the three models, but cdgpt is better
- [0, 0, 0, 1] No model is selected (because all of them are wrong). After the model gives this value, we will put this data into hyena by default for prediction, and invert the result.

#### Data storage
The data of this experiment will be stored in the folder /step3

In [10]:
# Per-model result files for the training split, one each for the
# hyena, ntv2 and cdgpt models (split inferred from the file names).
hy_c12_train_analysis_path = "hyena/result/hyena_c12_e1_human_enhancers_cohn_train_120.pt"
nt_c12_train_analysis_path = "ntv2/result/ntv2_c12_human-enhancers-cohn_train_results_200.pt"
cd_c12_train_analysis_path = "cdgpt/result/cdgpt_c12_human_enhancers_cohn_train_1_10420_results.pt"

# The same three per-model result files for the test split.
hy_c12_test_analysis_path = "hyena/result/hyena_c12_e1_human_enhancers_cohn_test_120.pt"
nt_c12_test_analysis_path = "ntv2/result/ntv2_c12_human-enhancers-cohn_test_results_200.pt"
cd_c12_test_analysis_path = "cdgpt/result/cdgpt_c12_human_enhancers_cohn_test_1_10420_results.pt"


# Dataset files for the train / test splits of human_enhancers_cohn.
# NOTE(review): presumably the raw benchmark data the models were evaluated on -- confirm.
train_file_path = "../new-data/genomic_benchmark_datasets/train_human_enhancers_cohn.pt"

test_file_path = "../new-data/genomic_benchmark_datasets/test_human_enhancers_cohn.pt"

In [None]:
import torch
import numpy as np

def build_soft_label(true_label, model_predictions, model_losses):
    """Build the soft label for one sample from the per-model results.

    The label has one slot per model (in the order the predictions are
    given) plus a final "no model is correct" slot.

    Args:
        true_label: Ground-truth class of the sample.
        model_predictions: Predicted class of each model.
        model_losses: Loss of each model on this sample, in the same order
            as ``model_predictions``.

    Returns:
        A list of ``len(model_predictions) + 1`` numbers:
        * every model wrong   -> ``[0, ..., 0, 1]``
        * exactly one correct -> one-hot at that model's position
        * several correct     -> the correct models share a total weight of
          1; a smaller loss gives a larger weight, wrong models get 0.
    """
    correct_flags = [int(pred == true_label) for pred in model_predictions]
    num_correct = sum(correct_flags)

    soft_label = [0] * (len(correct_flags) + 1)

    if num_correct == 0:
        # All models answered incorrectly: only the fallback slot is set.
        soft_label[-1] = 1
        return soft_label

    if num_correct == 1:
        # A single correct model is the only acceptable choice.
        soft_label[correct_flags.index(1)] = 1
        return soft_label

    # Two or more models are correct: weight them by their losses.
    correct_indices = [j for j, flag in enumerate(correct_flags) if flag]
    # float() leaves plain floats unchanged and unwraps 0-d tensors/arrays.
    correct_losses = [float(model_losses[j]) for j in correct_indices]

    total_loss = sum(correct_losses)
    if total_loss == 0:
        # Every correct model has zero loss; dividing by the total would
        # fail, and the models are equally trustworthy anyway.
        credibility_scores = [1.0] * len(correct_losses)
    else:
        # Share of the total loss, inverted so a lower loss scores higher.
        credibility_scores = [1 - loss / total_loss for loss in correct_losses]

    # Normalize so the weights of the correct models sum to 1.
    credibility_sum = sum(credibility_scores)
    for j, score in zip(correct_indices, credibility_scores):
        soft_label[j] = score / credibility_sum

    return soft_label


# The guard still runs when this cell is executed in a notebook or as a
# script (``__name__`` is "__main__" there), but lets ``build_soft_label``
# be imported without touching any files.
if __name__ == "__main__":
    data_path = "step6_merged_file/merged_model_train_data.pt"
    # NOTE: torch.load unpickles the file; only load files from a trusted source.
    data = torch.load(data_path)

    labels = data['label']
    sequence = data['sequence']
    hy_predictions = data['hy_prediction']
    nt_predictions = data['nt_prediction']
    cd_predictions = data['cd_prediction']
    hy_losses = data['hy_loss']
    nt_losses = data['nt_loss']
    cd_losses = data['cd_loss']

    # One soft label per sample; model order is hyena, ntv2, cdgpt.
    soft_labels = [
        build_soft_label(
            true_label,
            [hy_predictions[i], nt_predictions[i], cd_predictions[i]],
            [hy_losses[i], nt_losses[i], cd_losses[i]],
        )
        for i, true_label in enumerate(labels)
    ]

    output_data = {
        'labels': labels,
        'soft_labels': soft_labels,
        'sequence': sequence,
    }
    output_path = "step9_soft_labels_dataset/step1_soft_labels_train_dataset.pt"
    torch.save(output_data, output_path)
    print(f"Soft labels data saved to {output_path}")


Soft labels data saved to step9_soft_labels_dataset/step1_soft_labels_train_dataset.pt


  data = torch.load(data_path)


In [1]:
import torch
# Load the saved soft-label dataset (the "test" file, going by its name)
# and list its top-level keys.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
file_path = "step9_soft_labels_dataset/step1_soft_labels_test_dataset.pt"
data = torch.load(file_path)
print(data.keys())


dict_keys(['labels', 'soft_labels', 'sequence'])


  data = torch.load(file_path)


In [2]:
# Preview of the dataset loaded into `data`: size, column names, then the
# leading rows without and with the (long) sequence column.
print(f"len is: {len(data['labels'])}")
print(f"table head is :{data.keys()}")

# Show at most 50 rows, but never more than the shortest column holds, so
# the preview also works on datasets with fewer than 50 samples.
preview_rows = min([50] + [len(column) for column in data.values()])

print('Does not include sequence column, first 50 rows')
# Length of the first sequence (skipped when there are no samples).
if len(data['sequence']) > 0:
    print(len(data['sequence'][0]))
# Excluding sequence
for i in range(preview_rows):
    row_data = {key: value[i] for key, value in data.items() if key != 'sequence'}
    print(f"Row {i + 1}: {row_data}")

print('All columns, first 50 rows shown')
for i in range(preview_rows):
    row_data = {key: value[i] for key, value in data.items()}
    print(f"Row {i + 1}: {row_data}")

len is: 6948
table head is :dict_keys(['labels', 'soft_labels', 'sequence'])
Does not include sequence column, first 50 rows
500
Row 1: {'labels': 0, 'soft_labels': [0.30261195696765686, 0.3990277974742572, 0.29836024555808593, 0]}
Row 2: {'labels': 0, 'soft_labels': [0.30898200519695507, 0.6910179948030449, 0, 0]}
Row 3: {'labels': 0, 'soft_labels': [1, 0, 0, 0]}
Row 4: {'labels': 0, 'soft_labels': [0, 0, 0, 1]}
Row 5: {'labels': 0, 'soft_labels': [0.3395126356587703, 0.3802745795199673, 0.2802127848212625, 0]}
Row 6: {'labels': 0, 'soft_labels': [0.3507708197035796, 0.3566502331872249, 0.2925789471091955, 0]}
Row 7: {'labels': 0, 'soft_labels': [0.2890800275101577, 0.3897897387294815, 0.3211302337603608, 0]}
Row 8: {'labels': 0, 'soft_labels': [0.29328823454588604, 0.38966707275817397, 0.31704469269594, 0]}
Row 9: {'labels': 0, 'soft_labels': [0.21167150070509994, 0.45277469427221667, 0.3355538050226834, 0]}
Row 10: {'labels': 0, 'soft_labels': [0.3035899183015428, 0.3308147506025148

## Integrate soft labels into the full dataset
The soft labels generated above are merged into the data of 02-train&valid

In [None]:
import torch
import os

def load_model_results(file_path):
    """Load one model's result file and return its four result columns.

    Args:
        file_path: Path of a file readable by ``torch.load`` that holds a
            mapping with the keys 'label', 'prediction', 'loss' and
            'confidence'.

    Returns:
        A 4-tuple ``(label, prediction, loss, confidence)``.

    Raises:
        KeyError: If one of the four keys is missing from the file.
    """
    results = torch.load(file_path)
    wanted_keys = ('label', 'prediction', 'loss', 'confidence')
    return tuple(results[key] for key in wanted_keys)

# Merge the generated soft labels with the per-model results
# (label / prediction / loss / confidence) into one file for a given split.
filetype = 'train'

soft_labels_file = f"step9_soft_labels_dataset/step1_soft_labels_{filetype}_dataset.pt"

# Per-model result files for the selected split
hyena_file = f"hyena/result/hyena_c12_e1_human_enhancers_cohn_{filetype}_120.pt"
ntv2_file = f"ntv2/result/ntv2_c12_human-enhancers-cohn_{filetype}_results_200.pt"
cdgpt_file = f"cdgpt/result/cdgpt_c12_human_enhancers_cohn_{filetype}_1_10420_results.pt"

# NOTE: torch.load unpickles the file; only load files from a trusted source.
soft_label_data = torch.load(soft_labels_file)

sequences = soft_label_data['sequence']
# Use float64 explicitly: torch.tensor defaults to float32 for Python
# floats, which would round the soft labels before they are written back
# out with .tolist().
soft_labels = torch.tensor(soft_label_data['soft_labels'], dtype=torch.float64)
original_label = torch.tensor(soft_label_data['labels'])

hy_labels, hy_predictions, hy_losses, hy_confidences = load_model_results(hyena_file)
nt_labels, nt_predictions, nt_losses, nt_confidences = load_model_results(ntv2_file)
cd_labels, cd_predictions, cd_losses, cd_confidences = load_model_results(cdgpt_file)

hy_labels = torch.tensor(hy_labels)
nt_labels = torch.tensor(nt_labels)
cd_labels = torch.tensor(cd_labels)

# Every model file must carry exactly the labels stored with the soft
# labels; otherwise the rows are not aligned and merging would mix samples.
mismatched_models = [
    model_name
    for model_name, model_labels in (
        ('hyena', hy_labels),
        ('ntv2', nt_labels),
        ('cdgpt', cd_labels),
    )
    if not torch.equal(original_label, model_labels)
]
if mismatched_models:
    raise ValueError(
        "====Error==== labels differ from the soft-label file for: "
        + ", ".join(mismatched_models)
    )

# Integrated data
merged_results = {
    'sequence': sequences,
    'label': original_label.tolist(),
    'soft_labels': soft_labels.tolist(),
    'hy_label': hy_labels.tolist(),
    'hy_prediction': hy_predictions,
    'hy_loss': hy_losses,
    'hy_confidence': hy_confidences,
    'nt_label': nt_labels.tolist(),
    'nt_prediction': nt_predictions,
    'nt_loss': nt_losses,
    'nt_confidence': nt_confidences,
    'cd_label': cd_labels.tolist(),
    'cd_prediction': cd_predictions,
    'cd_loss': cd_losses,
    'cd_confidence': cd_confidences,
}

output_path = f"step9_soft_labels_dataset/merged_soft_label_and_models_prediction_{filetype}_dataset.pt"
torch.save(merged_results, output_path)
print(f"Data saved successfully to {output_path}")

  soft_label_data = torch.load(soft_labels_file)
  data = torch.load(file_path)


Data saved successfully to step9_soft_labels_dataset/merged_soft_label_and_models_prediction_train_dataset.pt


In [3]:
import torch
# Load the merged dataset (soft labels plus per-model results), list its
# top-level keys and print the length of the first sequence.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
file_path = 'step9_soft_labels_dataset/merged_soft_label_and_models_prediction_train_dataset.pt'
data = torch.load(file_path)
print(data.keys())
print(len(data['sequence'][0]))

dict_keys(['sequence', 'label', 'soft_labels', 'hy_label', 'hy_prediction', 'hy_loss', 'hy_confidence', 'nt_label', 'nt_prediction', 'nt_loss', 'nt_confidence', 'cd_label', 'cd_prediction', 'cd_loss', 'cd_confidence'])
500


  data = torch.load(file_path)


In [None]:
# Preview of the dataset loaded into `data`: size, column names, then the
# leading rows without and with the (long) sequence column.
print(f"len is: {len(data['label'])}")
print(f"table head is :{data.keys()}")

# Show at most 50 rows, but never more than the shortest column holds, so
# the preview also works on datasets with fewer than 50 samples.
preview_rows = min([50] + [len(column) for column in data.values()])

print('Does not include sequence column, first 50 rows')

for i in range(preview_rows):
    row_data = {key: value[i] for key, value in data.items() if key != 'sequence'}
    print(f"Row {i + 1}: {row_data}")

print('All columns, first 50 rows shown')
for i in range(preview_rows):
    row_data = {key: value[i] for key, value in data.items()}
    print(f"Row {i + 1}: {row_data}")

len is: 20843
table head is :dict_keys(['sequence', 'label', 'soft_labels', 'hy_label', 'hy_prediction', 'hy_loss', 'hy_confidence', 'nt_label', 'nt_prediction', 'nt_loss', 'nt_confidence', 'cd_label', 'cd_prediction', 'cd_loss', 'cd_confidence'])
Does not include sequence column, first 50 rows
Row 1: {'label': 0, 'soft_labels': [0.28965938091278076, 0.35226312279701233, 0.3580775260925293, 0.0], 'hy_label': 0, 'hy_prediction': 0, 'hy_loss': 0.13445018231868744, 'hy_confidence': 0.8741964101791382, 'nt_label': 0, 'nt_prediction': 0, 'nt_loss': 0.0944337397813797, 'nt_confidence': 0.9098880290985107, 'cd_label': 0, 'cd_prediction': 0, 'cd_loss': 0.09071715921163559, 'cd_confidence': 0.9132760167121887}
Row 2: {'label': 0, 'soft_labels': [0.2097623199224472, 0.45620205998420715, 0.33403560519218445, 0.0], 'hy_label': 0, 'hy_prediction': 0, 'hy_loss': 0.29572543501853943, 'hy_confidence': 0.7439916729927063, 'nt_label': 0, 'nt_prediction': 0, 'nt_loss': 0.04462605342268944, 'nt_confidence