In [None]:
import random
import subprocess

import pandas as pd
import plotly.express as px
import plotly.io as pio
import polars as pl
import seaborn as sns
from cyvcf2 import VCF

pio.renderers.default = "plotly_mimetype+notebook_connected"

# Read the tab-separated sample metadata table.
metadata = pl.read_csv(
    "Hoiho_Genomes_24Feb2024_JGG_3Pops.csv",
    separator="\t",
)

# Normalise two sample IDs in one pass: strip the trailing space from "P29 "
# and shorten "C101/CE9" to "CE9".
# NOTE(review): presumably this makes them match the sample names in the BCF - confirm.
metadata = metadata.with_columns(
    pl.col("ID")
    .replace("P29 ", "P29")
    .replace("C101/CE9", "CE9")
    .alias("ID")
)

In [None]:
# Open the filtered BCF with cyvcf2 and collect the sample names it contains.

# Path to the merged, filtered BCF; later cells read this name as a global.
bcf_file = "merged.a9.filtered.qual99.maf0.05.biallelic.bcf"

vcf = VCF(bcf_file)

# Sample names as reported by cyvcf2 for this file.
samples = vcf.samples

In [None]:
# Restrict the metadata to rows whose ID is one of the samples in the BCF.
pop_filtered = metadata.filter(
    pl.col("ID").is_in(samples)
)

In [None]:
# Count how many of the retained samples fall into each Population3 group.
# The bare `pop_counts` on the last line displays the table as the cell output.
pop_counts = pop_filtered.group_by("Population3").len()
pop_counts

In [None]:
# Subsample every population down to the same size so the groups are balanced.
# The size is set a little below the smallest population so that even that
# population is drawn at random rather than used in full.
# NOTE(review): 28 is hard-coded - check it against the pop_counts table.

target_pop_size = 28

In [None]:
# Build one list of sample IDs per Population3 group. The generator is
# unpacked in the same order as the population names it iterates over.
northern_samples, enderby_samples, campbell_samples = (
    pop_filtered.filter(pl.col("Population3") == population)
    .get_column("ID")
    .to_list()
    for population in ("Northern", "Enderby", "Campbell")
)

In [None]:
# Sanity check: number of sample IDs in each population, in the order listed.
list(map(len, (northern_samples, enderby_samples, campbell_samples)))

In [None]:
# Map each population name to its list of sample IDs.
pop_samples = dict(
    Northern=northern_samples,
    Enderby=enderby_samples,
    Campbell=campbell_samples,
)

In [None]:
def _run_checked(cmd):
    """Run an external command (argument list, no shell) and fail loudly.

    Raises:
        RuntimeError: if the command exits with a non-zero status; the message
            includes the command line and its captured stderr.
    """
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError(
            f"Command failed with exit code {err.returncode}: "
            f"{' '.join(cmd)}\n{err.stderr}"
        ) from err


def run_subset(pop_samples, target_pop_size, replicate):
    """Subsample each population, run a PCA on the subset and save the plot.

    Relies on two notebook globals: `bcf_file` (input BCF path) and
    `pop_filtered` (polars metadata frame with `ID` and `Population3` columns).

    Args:
        pop_samples: Mapping of population name -> list of sample IDs.
        target_pop_size: Number of samples to draw (without replacement) from
            every population.
        replicate: Replicate label; used in the output image file name.

    Raises:
        ValueError: if any population has fewer than `target_pop_size` samples.
        RuntimeError: if bcftools or plink2 exits with a non-zero status.

    Side effects:
        Overwrites `subset_{target_pop_size}.bcf` and the plink2 outputs with
        the `subset_{target_pop_size}` prefix, and writes
        `pca_subset_rep{replicate}.png`.
    """
    # Validate up front so nothing is written when a population is too small.
    too_small = {
        pop: len(ids)
        for pop, ids in pop_samples.items()
        if len(ids) < target_pop_size
    }
    if too_small:
        raise ValueError(
            f"Cannot draw {target_pop_size} samples from populations "
            f"with fewer samples: {too_small}"
        )

    # Draw target_pop_size IDs from each population and flatten into one list.
    subset_ids = [
        sample_id
        for ids in pop_samples.values()
        for sample_id in random.sample(ids, target_pop_size)
    ]

    out_prefix = f"subset_{target_pop_size}"
    subset_bcf = f"{out_prefix}.bcf"

    # Subset the BCF to the chosen samples. Arguments are passed as a list so
    # sample IDs are never re-interpreted by a shell.
    _run_checked(
        [
            "bcftools", "view",
            "-O", "b",
            "-o", subset_bcf,
            "-s", ",".join(subset_ids),
            bcf_file,
        ]
    )

    # Run the plink2 PCA on the subset.
    _run_checked(
        [
            "pixi", "run", "plink2",
            "--bcf", subset_bcf,
            "--pca",
            "--out", out_prefix,
            "--allow-extra-chr",
            "--vcf-half-call", "missing",
        ]
    )

    # Read the eigenvectors from the same prefix that was passed to plink2
    # (previously hard-coded to "subset_28.eigenvec").
    eigenvec = pl.read_csv(f"{out_prefix}.eigenvec", separator="\t").to_pandas()

    # Attach population labels by joining the `#IID` column to the metadata `ID`.
    merged = eigenvec.merge(pop_filtered.to_pandas(), left_on="#IID", right_on="ID")

    fig = px.scatter(
        merged,
        x="PC1",
        y="PC2",
        color="Population3",
        hover_data=["ID"],
        title=f"PCA of {target_pop_size} samples per population (replicate {replicate})",
    )
    fig.write_image(f"pca_subset_rep{replicate}.png")


In [None]:
# Run the subsample + PCA pipeline N_REPLICATES times.
# The RNG is seeded first so that the random subsamples (and therefore the
# saved PCA plots) are identical on every Restart & Run All.
N_REPLICATES = 100
SUBSAMPLE_SEED = 42

random.seed(SUBSAMPLE_SEED)
for replicate in range(N_REPLICATES):
    run_subset(pop_samples, target_pop_size, replicate)