#### representation visualization cases

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import datasets
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import itertools

import warnings
warnings.filterwarnings('ignore')

In [None]:
def plot_embedding(data, label, title):
    """Draw every sample of a 2-D embedding as its label rendered as text.

    Each column of ``data`` is min-max rescaled to [0, 1] before plotting.

    Args:
        data: 2-D array-like; column 0 is used as x and column 1 as y.
        label: per-sample labels, indexed with the same ``i`` as the rows of
            ``data``. Each label is printed as text and ``label[i] / 10.`` is
            passed to the ``Set1`` colormap to pick the text colour.
        title: title placed on the axes.

    Returns:
        The matplotlib figure that was drawn on.
    """
    x_min, x_max = np.min(data, 0), np.max(data, 0)
    span = x_max - x_min
    # A constant coordinate has zero span; divide by 1 there so the column
    # maps to 0 instead of NaN (which would silently drop every text label).
    span = np.where(span == 0, 1, span)
    data = (data - x_min) / span

    fig = plt.figure()
    plt.subplot(111)
    for i, point in enumerate(data):
        plt.text(point[0], point[1], str(label[i]),
                 color=plt.cm.Set1(label[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    # Tick values are hidden.
    plt.xticks([])
    plt.yticks([])
    plt.title(title)
    return fig

def extract_snp_position(variant_id):
    """Return the second ``'_'``-separated field of ``variant_id`` as an int.

    Raises:
        IndexError: if ``variant_id`` contains no ``'_'``.
        ValueError: if the second field is not an integer literal.
    """
    fields = variant_id.split('_')
    position_field = fields[1]
    return int(position_field)

def find_mismatched_label_pairs(df, max_pairs=50):
    """Find same-chromosome variant pairs with different labels, closest first.

    The chromosome is the text before the first ``'_'`` of ``variant_id`` and
    the position is whatever ``extract_snp_position`` returns for it.

    Args:
        df: DataFrame with a ``'variant_id'`` column (``'_'``-separated
            strings) and a ``'label'`` column.
        max_pairs: upper bound on the number of pairs returned; applied as a
            slice, so ``None`` keeps every pair.

    Returns:
        ``(pairs_dict, pairs)`` where ``pairs`` is a list of
        ``(index_1, index_2, variant_1, variant_2, snp_distance)`` tuples
        sorted by ascending ``snp_distance`` (ties keep row order), and
        ``pairs_dict`` maps ``"pair_1"``, ``"pair_2"``, ... to the same data
        as dictionaries. ``index_1`` / ``index_2`` are index *labels* of
        ``df``, not row positions.
    """
    # Bucket rows by chromosome first: only rows sharing a chromosome can
    # form a pair, so cross-chromosome combinations are never generated.
    rows_by_chrom = {}
    columns = zip(df.index, df['variant_id'], df['label'])
    for row_order, (idx, variant_id, label) in enumerate(columns):
        chrom = variant_id.split('_')[0]
        rows_by_chrom.setdefault(chrom, []).append(
            (row_order, idx, variant_id, label)
        )

    # Positions are parsed lazily and at most once per distinct variant_id,
    # and only for variants that take part in a qualifying pair.
    position_cache = {}

    def position_of(variant_id):
        if variant_id not in position_cache:
            position_cache[variant_id] = extract_snp_position(variant_id)
        return position_cache[variant_id]

    candidates = []
    for rows in rows_by_chrom.values():
        for first, second in itertools.combinations(rows, 2):
            order1, idx1, var1, label1 = first
            order2, idx2, var2, label2 = second
            if label1 != label2:
                distance = abs(position_of(var1) - position_of(var2))
                candidates.append(
                    (distance, order1, order2, idx1, idx2, var1, var2)
                )

    # Sorting on (distance, row order of first, row order of second) yields
    # the same order as a stable sort by distance over all row combinations.
    candidates.sort(key=lambda candidate: candidate[:3])

    # Keep only the max_pairs closest pairs.
    pairs = [
        (idx1, idx2, var1, var2, distance)
        for distance, _, _, idx1, idx2, var1, var2 in candidates[:max_pairs]
    ]

    pairs_dict = {
        f"pair_{rank}": {
            "index_1": idx1,
            "index_2": idx2,
            "variant_1": var1,
            "variant_2": var2,
            "snp_distance": dist,
        }
        for rank, (idx1, idx2, var1, var2, dist) in enumerate(pairs, start=1)
    }

    return pairs_dict, pairs

In [None]:
# Load the variant table and the saved arrays for the small model.
# NOTE(review): pd.read_pickle and np.load(..., allow_pickle=True) can execute
# arbitrary code while unpickling -- only point them at trusted files.
raw_data = pd.read_pickle(
    '../../datasets/small/train_small_post.pkl'
)[['phenotype_id', 'variant_id', 'tss_distance', 'bulk', 'label']]
print(raw_data.head())

# NOTE(review): `input` shadows the builtin of the same name; later cells
# read it under this name, so it is kept as is.
input = np.load("../../model/middle_output/small/input.npy", allow_pickle=True)
input2 = np.load("../../model/middle_output/small/input2.npy", allow_pickle=True)
output = np.load("../../model/middle_output/small/output.npy", allow_pickle=True)

# Collapse the last axis of the stored label array to the index of its
# maximum entry (the original note records the resulting shape as (4302,)).
label = np.load(
    "../../model/middle_output/small/label.npy", allow_pickle=True
).argmax(axis=-1)
print(label.shape)

In [None]:
# t-SNE scatter of the `input` array, coloured by class label.
stack = input

tsne = TSNE(n_components=2, init='pca', random_state=0)
print(stack.shape)
result = tsne.fit_transform(stack)
print(result.shape)

# Min-max rescale each embedding axis to [0, 1].
x_min, x_max = np.min(result, 0), np.max(result, 0)
result = (result - x_min) / (x_max - x_min)

color = ["#9392BE", "#D5E49B"]  # indexed by class label (0, 1)

# Create exactly one figure: calling plt.figure() before plt.subplots()
# would leave an additional, empty figure behind.
fig, ax = plt.subplots(figsize=(5, 5))

# One scatter call per class instead of one per point. Class 1 is drawn
# first so that class 0 points end up on top of it.
point_labels = np.asarray(label)[:result.shape[0]]
is_up = point_labels == 1
is_down = point_labels == 0
s2 = ax.scatter(result[is_up, 0], result[is_up, 1], s=2, color=color[1])
s1 = ax.scatter(result[is_down, 0], result[is_down, 1], s=2, color=color[0])

ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('Between Branch (Small Model)')
ax.legend((s1, s2), ('Down-regulation', 'Up-regulation'), loc='best')
fig.savefig('../images/tsne/case/input-small.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# t-SNE scatter of the `output` array, coloured by class label.
tsne = TSNE(n_components=2, init='pca', random_state=0)
print(output.shape)
result = tsne.fit_transform(output)
print(result.shape)

# Min-max rescale each embedding axis to [0, 1].
x_min, x_max = np.min(result, 0), np.max(result, 0)
result = (result - x_min) / (x_max - x_min)

color = ["#9392BE", "#D5E49B"]  # indexed by class label (0, 1)

# Create exactly one figure: calling plt.figure() before plt.subplots()
# would leave an additional, empty figure behind.
fig, ax = plt.subplots(figsize=(5, 5))

# One scatter call per class instead of one per point. Class 1 is drawn
# first so that class 0 points end up on top of it.
point_labels = np.asarray(label)[:result.shape[0]]
is_up = point_labels == 1
is_down = point_labels == 0
s2 = ax.scatter(result[is_up, 0], result[is_up, 1], s=2, color=color[1])
s1 = ax.scatter(result[is_down, 0], result[is_down, 1], s=2, color=color[0])

ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('the final FC layer (Small Model)')
# The legend is disabled for this figure:
# ax.legend((s1, s2), ('Down-regulation', 'Up-regulation'), loc='best')
fig.savefig('../images/tsne/case/output-small.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Find the closest same-chromosome SNP pairs whose labels differ
# (find_mismatched_label_pairs is called with its default max_pairs).
mismatch_pairs_dict, mismatch_pairs_list = find_mismatched_label_pairs(raw_data)

# Perform t-SNE on the `input` vectors
tsne = TSNE(n_components=2, init='pca', random_state=0)
result = tsne.fit_transform(input)

# Min-max rescale each embedding axis to [0, 1] for plotting
x_min, x_max = np.min(result, 0), np.max(result, 0)
result = (result - x_min) / (x_max - x_min)

# Define colors
color = ["#9392BE", "#D5E49B"]  # Background points, indexed by label value
pair_colors = ["#800000", "#2F4F4F"]  # Maroon / dark slate gray for the highlighted pair


def format_variant(variant_id, bulk):
    """Return the first four '_'-separated fields joined by ':' with ``bulk`` on a second line."""
    parts = variant_id.split('_')
    return f"{parts[0]}:{parts[1]}:{parts[2]}:{parts[3]}\n{bulk}"


# The background is identical in every figure, so resolve its colours once
# instead of once per point per pair.
# NOTE(review): row k of the embedding is matched to raw_data.loc[k], i.e. by
# index label -- confirm raw_data's index lines up with the rows of `input`.
background_colors = [
    color[raw_data.loc[j, "label"]] for j in range(result.shape[0])
]

# Plot each pair in its own figure
for i, (idx1, idx2, var1, var2, _) in enumerate(mismatch_pairs_list):
    fig, ax = plt.subplots(figsize=(5, 5))

    # All data points in a single scatter call (same colours and draw order
    # as plotting them one by one)
    ax.scatter(result[:, 0], result[:, 1], s=2, c=background_colors, alpha=0.5)

    # Highlight the two variants of the pair
    ax.scatter(result[idx1, 0], result[idx1, 1], s=40, color=pair_colors[0],
               edgecolors='black', linewidth=0.8, label=var1)
    ax.scatter(result[idx2, 0], result[idx2, 1], s=40, color=pair_colors[1],
               edgecolors='black', linewidth=0.8, label=var2)

    # SNP annotation text including the `bulk` column value
    annotation_1 = format_variant(var1, raw_data.loc[idx1, "bulk"])
    annotation_2 = format_variant(var2, raw_data.loc[idx2, "bulk"])

    # Text labels: offset up-right of the first point, down-right of the second
    ax.text(result[idx1, 0] + 0.02, result[idx1, 1] + 0.02, annotation_1,
            fontsize=10, color=pair_colors[0], fontweight='bold', ha='left')
    ax.text(result[idx2, 0] + 0.02, result[idx2, 1] - 0.02, annotation_2,
            fontsize=10, color=pair_colors[1], fontweight='bold', ha='left')

    # Labels and title
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_title('Between Branch (Small Model)\n(SNPs with Different Labels on the Same Chromosome)', fontsize=12)
    # The legend is disabled:
    # ax.legend(loc='best')

    # Save each pair plot
    fig.savefig(f'../images/tsne/case/pair_{i+1}.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so one figure per pair does not accumulate in memory
    plt.close(fig)

In [None]:
# Find the closest same-chromosome SNP pairs whose labels differ
# (find_mismatched_label_pairs is called with its default max_pairs).
mismatch_pairs_dict, mismatch_pairs_list = find_mismatched_label_pairs(raw_data)

# Perform t-SNE on the `output` vectors
tsne = TSNE(n_components=2, init='pca', random_state=0)
result = tsne.fit_transform(output)

# Min-max rescale each embedding axis to [0, 1] for plotting
x_min, x_max = np.min(result, 0), np.max(result, 0)
result = (result - x_min) / (x_max - x_min)

# Define colors
color = ["#9392BE", "#D5E49B"]  # Background points, indexed by label value
pair_colors = ["#800000", "#2F4F4F"]  # Maroon / dark slate gray for the highlighted pair


def format_variant(variant_id, bulk):
    """Return the first four '_'-separated fields joined by ':' with ``bulk`` on a second line."""
    parts = variant_id.split('_')
    return f"{parts[0]}:{parts[1]}:{parts[2]}:{parts[3]}\n{bulk}"


# The background is identical in every figure, so resolve its colours once
# instead of once per point per pair.
# NOTE(review): row k of the embedding is matched to raw_data.loc[k], i.e. by
# index label -- confirm raw_data's index lines up with the rows of `output`.
background_colors = [
    color[raw_data.loc[j, "label"]] for j in range(result.shape[0])
]

# Plot each pair in its own figure
for i, (idx1, idx2, var1, var2, _) in enumerate(mismatch_pairs_list):
    fig, ax = plt.subplots(figsize=(5, 5))

    # All data points in a single scatter call (same colours and draw order
    # as plotting them one by one)
    ax.scatter(result[:, 0], result[:, 1], s=2, c=background_colors, alpha=0.5)

    # Highlight the two variants of the pair
    ax.scatter(result[idx1, 0], result[idx1, 1], s=40, color=pair_colors[0],
               edgecolors='black', linewidth=0.8, label=var1)
    ax.scatter(result[idx2, 0], result[idx2, 1], s=40, color=pair_colors[1],
               edgecolors='black', linewidth=0.8, label=var2)

    # SNP annotation text including the `bulk` column value
    annotation_1 = format_variant(var1, raw_data.loc[idx1, "bulk"])
    annotation_2 = format_variant(var2, raw_data.loc[idx2, "bulk"])

    # Text labels: offset up-right of the first point, down-right of the second
    ax.text(result[idx1, 0] + 0.02, result[idx1, 1] + 0.02, annotation_1,
            fontsize=10, color=pair_colors[0], fontweight='bold', ha='left')
    ax.text(result[idx2, 0] + 0.02, result[idx2, 1] - 0.02, annotation_2,
            fontsize=10, color=pair_colors[1], fontweight='bold', ha='left')

    # Labels and title
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_title('the final FC layer Visualization\n(SNPs with Different Labels on the Same Chromosome)', fontsize=12)
    # The legend is disabled:
    # ax.legend(loc='best')

    # Save each pair plot
    fig.savefig(f'../images/tsne/case/output_pair_{i+1}.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so one figure per pair does not accumulate in memory
    plt.close(fig)