In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import plotly.express as px
from scipy.spatial.distance import pdist, squareform
from itertools import combinations

In [None]:
# Directory holding the PCA input files, relative to the notebook's working directory.
data_dir = Path("data")

In [None]:
# Show the kernel's current working directory; the relative `data_dir` path is resolved against it.
%pwd

# PCA based on filtered SNP data

In [None]:
# Read the SNP-based eigenvector table. With header=None every line of the
# file is treated as data and the columns receive integer labels.
pca = pd.read_csv(data_dir/"dme-ldpruned.eigenvec",
                 sep=r'\s+',   # regex for any run of whitespace; raw string because '\s' is not a valid str escape
                 header=None)  # No column names

# Values from the companion .eigenval file
# (presumably one eigenvalue per principal component, in PC order — TODO confirm).
eigenval = np.loadtxt(data_dir/"dme-ldpruned.eigenval")

In [None]:
# Discard the first column of the eigenvector table; the column that follows
# it holds the sample identifiers.
# NOTE(review): this rebinds `pca`, so re-running the cell without reloading
# the file strips one more column each time.
pca = pca.iloc[:, 1:]

# Name the columns: sample ID first, then PC1, PC2, ... for everything else.
n_components = pca.shape[1] - 1
pc_names = [f"PC{k}" for k in range(1, n_components + 1)]
pca.columns = ['sample_id', *pc_names]

# Sample-indexed copy containing only the PC columns, used for the distance calculation.
df = pca.set_index("sample_id").copy()

# Colour key: the first character of each sample ID
# (presumably a population/location code — TODO confirm).
pca['where'] = [sample[0] for sample in pca['sample_id'].values]

px.scatter(pca, x='PC1', y='PC2', color='where', width=600, height=600)

In [None]:

def calculate_distnaces(df, pc_cols=('PC1', 'PC2'), metric='euclidean'):
    """Compute the pairwise distance between every pair of samples in PC space.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, indexed by sample ID, with one column per
        principal component.
    pc_cols : sequence of str, default ('PC1', 'PC2')
        Columns that span the space in which distances are measured.
        The default reproduces the original PC1/PC2-only behaviour.
    metric : str, default 'euclidean'
        Distance metric forwarded to ``scipy.spatial.distance.pdist``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sample1``, ``Sample2`` and ``Distance``; one row per
        unordered pair of samples (no self-pairs, no repeats), ordered like
        ``itertools.combinations(df.index, 2)``.

    Raises
    ------
    KeyError
        If any of ``pc_cols`` is not a column of ``df``.
    """
    pc_cols = list(pc_cols)
    missing = [col for col in pc_cols if col not in df.columns]
    if missing:
        raise KeyError(f"columns not found in df: {missing}")

    # Fewer than two samples means there are no pairs: return an empty table
    # with the usual columns instead of relying on pdist's handling of
    # degenerate input.
    if len(df) < 2:
        return pd.DataFrame({
            'Sample1': pd.Series(dtype=object),
            'Sample2': pd.Series(dtype=object),
            'Distance': pd.Series(dtype=float),
        })

    # Condensed distance vector: entries are ordered (0,1), (0,2), ..., (1,2), ...
    pc_coords = df[pc_cols].to_numpy(dtype=float)
    distances = pdist(pc_coords, metric=metric)

    # combinations() walks the index in the same order as pdist's condensed
    # output, so position k of `sample_pairs` labels position k of `distances`.
    sample_pairs = list(combinations(df.index, 2))

    # Long-format (3-column) table of the pairwise distances.
    distance_pairs = pd.DataFrame({
        'Sample1': [pair[0] for pair in sample_pairs],
        'Sample2': [pair[1] for pair in sample_pairs],
        'Distance': distances,
    })

    return distance_pairs

In [None]:
# Pairwise sample distances for the SNP-based PCA (`df` is the sample-indexed PC table built above).
distance_pairs = calculate_distnaces(df)

In [None]:
#distance_pairs.to_csv(data_dir/"2025-09-15_GDL_SNP_PCA_distances.csv", index=False)

# PCA based on deletion data

In [None]:
# Load the indel-based PCA results.
# NOTE(review): unlike the SNP eigenvector file, this table is read WITHOUT
# header=None, so its first line is consumed as the header row. Confirm the
# file really starts with a header line; if it does not, the first sample is
# silently lost.
pca = pd.read_csv(data_dir/"gdl_indels_filtered_pca.eigenvec",
                 sep=r'\s+',  # regex for any run of whitespace; raw string because '\s' is not a valid str escape
                 )
eigenval = np.loadtxt(data_dir/"gdl_indels_filtered_pca.eigenval")


# Remove the first column (equivalent to pca[,-1] in R)
pca = pca.iloc[:, 1:]

# Name the remaining columns: sample ID first, then PC1, PC2, ...
pc_names = ["PC" + str(i) for i in range(1, pca.shape[1])]
pca.columns = ['sample_id'] + pc_names

# Sample-indexed copy containing only the PC columns, used for the distance calculation.
df = pca.set_index("sample_id").copy()

# Colour key: the first character of each sample ID
# (presumably a population/location code — TODO confirm).
pca['where'] = [x[0] for x in pca.sample_id.values]

px.scatter_3d(pca, x='PC1', y='PC2', z='PC3', width=600, height=600,
              color='where', hover_data=['sample_id'])


In [None]:
# Pairwise sample distances for the indel-based PCA; this rebinds `distance_pairs`,
# replacing the SNP-based table computed earlier.
distance_pairs = calculate_distnaces(df)
# NOTE(review): export is disabled; its path points at "../data" whereas the inputs
# are read from `data_dir` ("./data") — confirm the target before re-enabling.
#distance_pairs.to_csv("../data/2025-10-03_GDL_INDEL_PCA_distances.csv", index=False)

In [None]:
# Table pairing each PC label with the matching value from the .eigenval file.
# It is built column-wise so the numeric column keeps a float dtype (building it
# row-wise and transposing left both columns as object dtype). Wrapping each
# column in a Series keeps the NaN padding if the two lengths ever differ.
# NOTE(review): 'var_explained' holds the raw values loaded into `eigenval`;
# nothing here normalises them to proportions — confirm the label is intended.
pd.DataFrame({
    'PC': pd.Series(pc_names),
    'var_explained': pd.Series(np.atleast_1d(eigenval)),
})#.to_csv("../data/2025-10-03_GDL_INDEL_PCA_var.csv", index=False)