### 数据读取

In [None]:
import pandas as pd
import os
from datasets import Dataset, DatasetDict 

# Tab-separated table without a header row; columns are named by position below.
file = "/home/u20111010010/Project/DNA-Pretraining/Level1/001.Genomics_dataset/Dataset_HERV/VCF_hprc-1000G/Train_Test/data_all_model_HERV-Classification_Need.fa"
df1 = pd.read_csv(file, sep="\t", header=None)
df1 = df1.rename(columns={0: "dset", 1: "multi", 2: "binary", 3: "seq", 4: "Type", 5: "detail"})

# Keep the split name, the multi-class label and the sequence, then only the 'test' rows.
df = df1[['dset', 'multi', 'seq']]
df = df.loc[df['dset'] == 'test']

### Label conversion: class id (as a string) -> class name
id2label = {
    "0": "Non-HERV_Coding",
    "1": "HERV_Coding",
    "2": "Non-HERV_Non-Coding",
    "3": "HERV_Non-Coding",
}

# Raw labels as stored in the table, and their class names.
labels_raw = list(df['multi'])
labels = list(map(lambda class_id: id2label[str(class_id)], labels_raw))

# Sequences in the same row order as the labels.
sequences = list(df['seq'])

In [None]:
# Run identifier; not referenced elsewhere in the visible part of this notebook.
dataset_name="BERT_HERV_Multi_RUN0"
# Directory passed to os.path.join for every .npy matrix and figure path built below.
output_dir="/home/u20111010010/Project/DNA-Pretraining/Level1/003.Sequence_Visualization/Dataset_HERV"

### 序列本身特征

In [None]:
#### 序列降维
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering

# `sequences` and `labels_raw` come from the data-loading cell.
from scipy import sparse

# Encode every sequence as a sparse vector of character k-mer counts.
k = 6  # k-mer length; a placeholder value, tune it for the analysis at hand
vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
X_kmer = vectorizer.fit_transform(sequences)

# X_kmer is a SciPy sparse matrix. np.save() would pickle it as a 0-d object array,
# which np.load() refuses to read unless allow_pickle=True, so store it in SciPy's
# own sparse format instead (written as linkage_matrix.npz; read with sparse.load_npz).
sparse.save_npz(os.path.join(output_dir, "linkage_matrix.npz"), X_kmer)

# First four colours of the "deep" palette, keyed by integer class id.
colors = sns.color_palette("deep", 4)
id2color = {0: colors[0], 1: colors[1], 2: colors[2], 3: colors[3]}
# One colour per sample, in the row order of labels_raw.
row_colors = pd.Series(labels_raw).map(id2color)

# TODO: a hierarchical-clustering heat map (sns.clustermap with row_colors, method='ward')
# was drafted here but never enabled; it needs a reduced matrix that does not exist yet
# at this point in the notebook.

In [None]:
# Absolute path used by later cells as the prefix of their output file names.
# Because it is absolute, os.path.join(output_dir, f"{filename}_...") discards
# output_dir and the result is simply this path with the suffix appended.
filename="/home/u20111010010/Project/DNA-Pretraining/Level1/003.Sequence_Visualization/Dataset_HERV/linkage_matrix.npy"

In [None]:
def HERV_scatterplot(X_pca, labels, title_pca, filename_pca_pdf, filename_pca_png):
    """Draw, save and show a 2-D scatter of the first two columns of ``X_pca``.

    Args:
        X_pca: 2-D array; column 0 goes on the x axis and column 1 on the y axis.
        labels: per-row class names used as the hue; they are looked up in the
            colour table defined below.
        title_pca: figure title.
        filename_pca_pdf: path handed to ``plt.savefig`` first.
        filename_pca_png: path handed to ``plt.savefig`` second.

    Uses the notebook-level ``sns`` and ``plt``. The current figure is cleared
    on exit.
    """
    # First four "deep" colours (blue, orange, green, red) paired with the class names.
    class_names = ('Non-HERV_Coding', 'HERV_Coding', 'HERV_Non-Coding', 'Non-HERV_Non-Coding')
    class_palette = dict(zip(class_names, sns.color_palette("deep", 4)))

    first_component = X_pca[:, 0]
    second_component = X_pca[:, 1]
    axes = sns.scatterplot(
        x=first_component,
        y=second_component,
        hue=labels,
        palette=class_palette,
        size=0.2,
        legend='auto',
    )

    # Rebuild the legend without its last entry
    # (presumably the one produced by the constant `size` value -- TODO confirm).
    legend_handles, legend_texts = axes.get_legend_handles_labels()
    plt.legend(handles=legend_handles[:-1], labels=legend_texts[:-1], loc='upper right')
    plt.title(title_pca)

    # Write both output files, then display and reset the figure.
    for target in (filename_pca_pdf, filename_pca_png):
        plt.savefig(target)
    plt.show()
    plt.clf()

### 二维展示（TruncatedSVD）

In [None]:
#### Scatter plot of components C1 and C2

# Figure title and output file names
title_svd = "Analyzing TruncatedSVD Components for DNA Sequence"

filename_svd_pdf = os.path.join(output_dir, f"{filename}_TruncatedSVD_visualization.pdf")
filename_svd_png = os.path.join(output_dir, f"{filename}_TruncatedSVD_visualization.png")


# Reduce the sparse k-mer count matrix to 10 components.
# TruncatedSVD's default solver is randomized; fixing random_state keeps the saved
# matrix and every figure derived from it identical across kernel restarts.
svd = TruncatedSVD(n_components=10, random_state=0)
X_svd = svd.fit_transform(X_kmer)

# Persist the projection so later cells can reload it without refitting.
np.save(os.path.join(output_dir, "linkage_matrix_SVD.npy"), X_svd)

HERV_scatterplot(X_svd, labels, title_svd, filename_svd_pdf, filename_svd_png)

### 二维展示（SparsePCA）

In [None]:
from sklearn.decomposition import SparsePCA
from sklearn.preprocessing import StandardScaler
#### Scatter plot of components C1 and C2
# Figure title and output file names
title_pca = "Analyzing SparsePCA Components for DNA Sequence"

filename_pca_pdf = os.path.join(output_dir, f"{filename}_PCA_visualization.pdf")
filename_pca_png = os.path.join(output_dir, f"{filename}_PCA_visualization.png")


# Step 2: scale every k-mer feature. with_mean=False skips centring: subtracting the
# mean would turn most zero entries non-zero and destroy the sparsity of X_kmer.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X_kmer)

# Step 3: fit SparsePCA on the densified matrix. A fixed random_state makes the
# components (and therefore the saved matrix and figures) reproducible.
n_components = 10  # number of components to keep
pca = SparsePCA(n_components=n_components, random_state=0)
X_pca = pca.fit_transform(X_scaled.toarray())

# Persist the projection so later cells can reload it without refitting.
np.save(os.path.join(output_dir, "linkage_matrix_PCA.npy"), X_pca)

HERV_scatterplot(X_pca, labels, title_pca, filename_pca_pdf, filename_pca_png)

#### NEW plot（使用 Times New Roman 字体，分别保存带图例和不带图例两个版本）

In [None]:
### Font setup
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.font_manager import FontManager
# Build a FontManager and clear the cache behind its private _findfont_cached lookup.
# NOTE(review): private matplotlib API -- presumably so the font registered below is
# found; confirm the attribute exists in the installed matplotlib version.
FontManager()._findfont_cached.cache_clear()
from matplotlib import font_manager

# Earlier candidate path (file name containing spaces), kept for reference:
#font_path = "/home/u20111010010/Project/DNA-Pretraining/Level1/Manuscript/Times_New_Roman/Times New Roman.ttf"
font_path ="/home/u20111010010/Project/DNA-Pretraining/Level1/Manuscript/Times_New_Roman/Times_New_Roman.ttf"

# Alternative rcParams-based setup, currently disabled:
#plt.rcParams['font.family'] = 'Times New Roman'
#plt.rcParams['font.serif'] = font_path

import matplotlib.font_manager as fm
# Register the TTF file with matplotlib's global font manager.
fm.fontManager.addfont(font_path)
# Make the family name 'Times New Roman' the default for all text.
plt.rcParams['font.family'] = 'Times New Roman'
# FontProperties bound directly to the file; HERV_scatterplot passes it to plt.title.
times_new_roman = font_manager.FontProperties(fname=font_path)

In [None]:
def HERV_scatterplot(X_pca, labels, title_pca, filename_pca_pdf, filename_pca_png):
    """Save a 2-D scatter of the first two columns of ``X_pca``, with and without legend.

    Four files are written: ``<filename_pca_pdf>.pdf`` and ``<filename_pca_png>.png``
    (with legend), then the same stems followed by ``_NonLabel`` (without legend).
    PNG files are rendered at 2000 dpi. Nothing is shown on screen and the current
    figure is cleared after each variant.

    Args:
        X_pca: 2-D array; column 0 goes on the x axis and column 1 on the y axis.
        labels: per-row class names used as the hue; they are looked up in the
            colour table defined below.
        title_pca: title drawn on both variants.
        filename_pca_pdf: output path stem for the PDF files (extension added here).
        filename_pca_png: output path stem for the PNG files (extension added here).

    Uses the notebook-level ``sns``, ``plt`` and ``times_new_roman``.
    """
    # Colours follow the class-id order of id2label (0, 1, 2, 3), i.e. the same
    # assignment as the id-keyed colour tables of the 3-D plots, so a class keeps
    # one colour across figures: blue, orange, green, red.
    class_order = ('Non-HERV_Coding', 'HERV_Coding', 'Non-HERV_Non-Coding', 'HERV_Non-Coding')
    id2color = dict(zip(class_order, sns.color_palette("deep", 4)))

    # Margins of the current figure.
    plt.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95)

    # Same drawing twice: first with a legend, then without ("_NonLabel" files).
    for legend_mode, suffix in (('auto', ''), (False, '_NonLabel')):
        ax = sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=labels,
                             palette=id2color, size=0.2, legend=legend_mode)
        if legend_mode:
            # Rebuild the legend without its last entry
            # (presumably the one produced by the constant `size` value -- TODO confirm).
            handles, labels_plot = ax.get_legend_handles_labels()
            plt.legend(handles=handles[:-1], labels=labels_plot[:-1], loc='upper right')
        # Both variants use the same title font (the no-legend one used to omit it).
        plt.title(title_pca, fontproperties=times_new_roman)
        plt.savefig(str(filename_pca_pdf) + suffix + ".pdf")
        plt.savefig(str(filename_pca_png) + suffix + ".png", dpi=2000)
        plt.clf()

In [None]:
# Figure title, plus one output stem shared by the PDF and PNG arguments.
title_svd = "Analyzing TruncatedSVD Components for DNA Sequence"
filename_svd_pdf = filename_svd_png = os.path.join(
    output_dir, f"{filename}_TruncatedSVD_visualization"
)

# Reload the saved TruncatedSVD projection and redraw it.
X_svd = np.load(os.path.join(output_dir, "linkage_matrix_SVD.npy"))
HERV_scatterplot(
    X_pca=X_svd,
    labels=labels,
    title_pca=title_svd,
    filename_pca_pdf=filename_svd_pdf,
    filename_pca_png=filename_svd_png,
)

In [None]:
# Figure title, plus one output stem shared by the PDF and PNG arguments.
title_pca = "Analyzing SparsePCA Components for DNA Sequence"
filename_pca_pdf = filename_pca_png = os.path.join(
    output_dir, f"{filename}_PCA_visualization"
)

# Reload the saved SparsePCA projection and redraw it.
X_pca = np.load(os.path.join(output_dir, "linkage_matrix_PCA.npy"))
HERV_scatterplot(
    X_pca=X_pca,
    labels=labels,
    title_pca=title_pca,
    filename_pca_pdf=filename_pca_pdf,
    filename_pca_png=filename_pca_png,
)

### 三维展示（TruncatedSVD）

In [None]:
#### SVD绘图的3D展示（C1，C2，C9）

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D
import seaborn as sns
import numpy as np


# The title and the output file names of this figure say TruncatedSVD, so plot the
# TruncatedSVD projection (this cell used to plot X_pca, the SparsePCA output).
X_reduced = X_svd

filename_pca_pdf = os.path.join(output_dir, f"{filename}_TruncatedSVD_3d-visualization.pdf")
filename_pca_png = os.path.join(output_dir, f"{filename}_TruncatedSVD_3d-visualization.png")

# First four colours of the "deep" palette, keyed by class id (as a string).
colors = sns.color_palette("deep", 4)
id2color = {
    "0": colors[0],
    "1": colors[1],
    "2": colors[2],
    "3": colors[3]
}

# One colour per sample, looked up from its raw label; kept as a numpy array.
label_colors = np.array([id2color[str(label)] for label in labels_raw])

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

# Components 1, 2 and 9 are columns 0, 1 and 8.
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 8], c=label_colors, marker='o', alpha=1)

ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 9')

# Legend: one proxy marker per class, named through id2label.
legend_elements = [Line2D([0], [0], marker='o', color='w', label=id2label[key],
                          markersize=10, markerfacecolor=color) for key, color in id2color.items()]
ax.legend(handles=legend_elements, loc='upper right')

# Trim the margins, then leave head-room for the title.
plt.tight_layout()
plt.subplots_adjust(top=0.9)

plt.title('Analyzing TruncatedSVD Components for DNA Sequence')
plt.savefig(filename_pca_pdf)
plt.savefig(filename_pca_png)
#plt.show()

In [None]:
#### SVD绘图的3D展示

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D
import seaborn as sns
import numpy as np


# The title and the output file names of this figure say TruncatedSVD, so plot the
# TruncatedSVD projection (this cell used to plot X_pca, the SparsePCA output).
X_reduced = X_svd

filename_pca_pdf = os.path.join(output_dir, f"{filename}_TruncatedSVD_3d-visualization_C3.pdf")
filename_pca_png = os.path.join(output_dir, f"{filename}_TruncatedSVD_3d-visualization_C3.png")

# First four colours of the "deep" palette, keyed by class id (as a string).
colors = sns.color_palette("deep", 4)
id2color = {"0": colors[0], "1": colors[1], "2": colors[2], "3": colors[3]}

# One colour per sample, looked up from its raw label; kept as a numpy array.
label_colors = np.array([id2color[str(label)] for label in labels_raw])

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

# Components 1, 2 and 3 are columns 0, 1 and 2.
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=label_colors, marker='o', alpha=1)

ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 3')

# Legend: one proxy marker per class, named through id2label.
legend_elements = [Line2D([0], [0], marker='o', color='w', label=id2label[key],
                          markersize=10, markerfacecolor=color) for key, color in id2color.items()]
ax.legend(handles=legend_elements, loc='upper right')

# Trim the margins, then leave head-room for the title.
plt.tight_layout()
plt.subplots_adjust(top=0.9)

plt.title('Analyzing TruncatedSVD Components for DNA Sequence')
plt.savefig(filename_pca_pdf)
plt.savefig(filename_pca_png)
#plt.show()

### pairplot绘图展示（TruncatedSVD）

In [None]:
#### pairplot绘图

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

filename_pca_pdf = os.path.join(output_dir, f"{filename}_TruncatedSVD_pairplot-visualization.pdf")
filename_pca_png = os.path.join(output_dir, f"{filename}_TruncatedSVD_pairplot-visualization.png")

# Load the TruncatedSVD projection written by the SVD cell. `filename`
# (linkage_matrix.npy) only serves as a name prefix above: it is not the
# reduced-component matrix, and np.load() cannot read it with default settings.
X_reduced = np.load(os.path.join(output_dir, "linkage_matrix_SVD.npy"))

# One column per component, plus the class name of every row.
df = pd.DataFrame(X_reduced, columns=[f'Component_{i}' for i in range(X_reduced.shape[1])])
df['labels'] = labels

# Colour classes in id2label order (id 0..3 -> first four "deep" colours), the same
# assignment as the id-keyed colour tables of the 3-D plots. A dict keyed by class
# name keeps the colours independent of the order in which labels appear in the data.
deep_colors = sns.color_palette("deep", len(id2label))
palette = {id2label[class_id]: color for class_id, color in zip(id2label, deep_colors)}
present_labels = set(labels)
hue_order = [name for name in palette if name in present_labels]

# pairplot draws its own figure-level legend for `hue`, so no manual legend is added
# (the previous one was attached to the `ax` of an earlier 3-D figure).
sns.pairplot(df, hue='labels', hue_order=hue_order, palette=palette, diag_kind='kde',
             plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'})

# Title above the grid
plt.suptitle('Analyzing TruncatedSVD Components for DNA Sequence', y=1.02)

# Save the figure
plt.savefig(filename_pca_pdf, bbox_inches='tight')
plt.savefig(filename_pca_png, bbox_inches='tight')

# Show the figure
#plt.show()