### 数据读取

In [None]:
import pandas as pd
import os
from datasets import Dataset, DatasetDict 
from collections import Counter

# Tab-separated table with no header row; columns are named by position below.
file = "/home/u20111010010/Project/DNA-Pretraining/Level1/001.Genomics_dataset/Dataset_HERV/VCF_hprc-1000G/Train_Test/data_all_model_HERV-Classification_Need.fa"
df1 = pd.read_csv(file, sep="\t", header=None)
df1 = df1.rename(
    columns={
        0: "dset",
        1: "multi",
        2: "binary",
        3: "seq",
        4: "Type",
        5: "detail",
    }
)

# Keep the split name, the multi-class label and the sequence,
# then restrict the table to rows of the "test" split.
df = df1[["dset", "multi", "seq"]]
df = df.loc[df["dset"] == "test"]

print("+++++++++++++++++++++++++++++Test sets")
print(df.shape)


# Map the raw multi-class ids to readable class names.
id2label = {
    "0": "Non-HERV_Coding",
    "1": "HERV_Coding",
    "2": "Non-HERV_Non-Coding",
    "3": "HERV_Non-Coding",
}
labels_raw = list(df["multi"])
# The mapping is keyed by strings, so each raw id is stringified before lookup.
labels = [id2label[key] for key in map(str, labels_raw)]

# Class balance of the selected rows.
print(Counter(labels))

sequences = list(df["seq"])

In [None]:
# Run identifier; it prefixes the plot titles and the output file names.
dataset_name = "GPT2_HERV_Multi_RUN0"
# Directory that receives the saved feature matrix and the figures.
output_dir = "/home/u20111010010/Project/DNA-Pretraining/Level1/003.Sequence_Visualization/Dataset_HERV"

### 模型学习到的特征

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import LabelEncoder

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Local checkpoint directory; the tokenizer and the model weights are both loaded from it.
model_name = "/home/u20111010010/Project/DNA-Pretraining/Level1/002.Model_Classification/Dataset_HERV/Model/GPT2_HERV_Multi_RUN0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Load the model and move it onto the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)


# Feature extraction
def extract_features(model, tokenizer, sequences, batch_size=5):
    """
    Extract one feature vector per sequence, running the model in batches.

    Each batch is tokenized (truncated to 1024 tokens and padded to the longest
    sequence in the batch), passed through the model, and the last hidden state
    is averaged over the token axis. Padding positions are excluded from that
    average by weighting with the tokenizer's attention mask, so the feature of
    a sequence does not depend on which other sequences share its batch.

    :param model: model whose output exposes ``last_hidden_state``
    :param tokenizer: tokenizer callable returning a batch with a ``.to(device)``
        method (and, normally, an ``attention_mask`` entry)
    :param sequences: list of input sequences
    :param batch_size: number of sequences per forward pass
    :return: numpy array with one row per input sequence
    :raises ValueError: if ``sequences`` is empty
    """
    if len(sequences) == 0:
        raise ValueError("sequences must contain at least one sequence")

    # Send the inputs to wherever the model's weights live instead of relying
    # on a module-level global being defined and in sync with the model.
    target_device = next(model.parameters()).device

    features = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for start in range(0, len(sequences), batch_size):
            batch_sequences = list(sequences[start:start + batch_size])

            # Batch encoding
            inputs = tokenizer(
                batch_sequences,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=1024,
            ).to(target_device)

            outputs = model(**inputs)
            hidden = outputs.last_hidden_state

            attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
            if attention_mask is None:
                # No mask available: plain mean over the token axis (dim=1).
                pooled = hidden.mean(dim=1)
            else:
                # Masked mean over the token axis: padded positions get weight 0.
                weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
                # clamp guards against division by zero for an all-padding row.
                token_counts = weights.sum(dim=1).clamp(min=1)
                pooled = (hidden * weights).sum(dim=1) / token_counts

            features.append(pooled.cpu().numpy())
    return np.concatenate(features, axis=0)


### One row per input sequence; one column per feature of the model's last_hidden_state (768 according to the original note)
# Create the output directory up front so the save cannot fail after the
# (potentially long) feature extraction has finished.
os.makedirs(output_dir, exist_ok=True)
X = extract_features(model, tokenizer, sequences)
# Build the file name from dataset_name so it stays in sync with the run id.
np.save(os.path.join(output_dir, f"{dataset_name}_extract_features.npy"), X)

In [None]:
#X = np.load("/home/u20111010010/Project/DNA-Pretraining/Level1/003.Sequence_Visualization/Dataset_HERV/GPT2_HERV_Multi_RUN0_extract_features.npy")
# Plot titles, derived from the run identifier.
title_pca = f"{dataset_name} PCA Visualization"
title_tsne = f"{dataset_name} t-SNE Visualization"

### 2D
# Each projection is saved twice (PDF and PNG) under a shared file stem.
pca_stem = f"{dataset_name}_pca_visualization"
tsne_stem = f"{dataset_name}_tsne_visualization"
filename_pca_pdf = os.path.join(output_dir, pca_stem + ".pdf")
filename_pca_png = os.path.join(output_dir, pca_stem + ".png")
filename_tsne_pdf = os.path.join(output_dir, tsne_stem + ".pdf")
filename_tsne_png = os.path.join(output_dir, tsne_stem + ".png")

In [None]:
# NOTE: the triple-quoted string below is not executed. It holds an earlier,
# inline version of the PCA / t-SNE scatter plots (default "deep" palette, no
# helper function); the HERV_scatterplot function defined later in this file
# draws the same two projections.
'''
# PCA降维并可视化
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1],hue=labels, palette="deep")
plt.title(title_pca)
plt.legend(loc='upper right')

plt.savefig(filename_pca_pdf)
plt.savefig(filename_pca_png)
plt.show()
#plt.clf()  # 清除当前图形

# t-SNE降维并可视化
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

plt.figure()  
sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1],hue=labels, palette="deep")
plt.title(title_tsne)
plt.legend(loc='upper right')
plt.savefig(filename_tsne_pdf)
plt.savefig(filename_tsne_png)
plt.show()
'''

In [None]:
def HERV_scatterplot(X_pca, labels, title_pca, filename_pca_pdf, filename_pca_png, point_size=18):
    """
    Draw a 2-D scatter plot of projected features coloured by class and save it.

    The first two columns of ``X_pca`` are used as the x and y coordinates, so
    the same function serves both the PCA and the t-SNE projections.

    :param X_pca: 2-D array of projected features; only columns 0 and 1 are drawn
    :param labels: per-point class names, used as the hue; expected to be among
        the four class names mapped to colours below
    :param title_pca: figure title
    :param filename_pca_pdf: output path for the PDF copy of the figure
    :param filename_pca_png: output path for the PNG copy of the figure
    :param point_size: marker area forwarded to matplotlib's scatter as ``s``
    """
    # Colour setup
    # First four colours of the "deep" palette: dark blue, orange, green, red
    colors = sns.color_palette("deep", 4)
    # Fixed label -> colour mapping so every figure uses the same colour per class
    id2color = {
        'Non-HERV_Coding': colors[0],
        'HERV_Coding': colors[1],
        'HERV_Non-Coding': colors[2],
        'Non-HERV_Non-Coding': colors[3]
    }

    # Draw on a dedicated figure so repeated calls never share or overlay axes.
    fig, ax = plt.subplots()

    # `s` sets the marker size directly. seaborn's `size` argument is a grouping
    # semantic instead: passing a constant there adds a spurious legend entry
    # that then has to be stripped out again.
    sns.scatterplot(
        x=X_pca[:, 0],
        y=X_pca[:, 1],
        hue=labels,
        palette=id2color,
        s=point_size,
        legend='auto',
        ax=ax,
    )

    # Rebuild the legend in the upper-right corner from the class entries.
    handles, legend_labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=legend_labels, loc='upper right')
    ax.set_title(title_pca)

    # Save the figure in both formats
    fig.savefig(filename_pca_pdf)
    fig.savefig(filename_pca_png)

    # Show the figure, then release it
    plt.show()
    plt.close(fig)
# Example usage
# PCA projection and plot. Ten components are fitted; the plot uses the first two.
# random_state pins the result when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=10, random_state=0)
X_pca = pca.fit_transform(X)
HERV_scatterplot(X_pca, labels, title_pca, filename_pca_pdf, filename_pca_png)

# t-SNE projection and plot. t-SNE is stochastic, so it is seeded to make the
# saved figure reproducible between runs.
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)
HERV_scatterplot(X_tsne, labels, title_tsne, filename_tsne_pdf, filename_tsne_png)