In [None]:
# Data frames (polars) and plotting (seaborn, plotly express / plotly.io).
# NOTE(review): seaborn is not referenced in the cells visible here.
import seaborn as sns
import polars as pl
import plotly.express as px

import plotly.io as pio

# Set the default plotly renderer used when a figure is displayed.
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [None]:
# Read the plink2 PCA outputs as tab-separated text: the eigenvec table
# (first row is a header) and the header-less eigenval file.
eigenvec = pl.read_csv(
    "plink2.eigenvec",
    separator="\t",
)
eigenval = pl.read_csv(
    "plink2.eigenval",
    has_header=False,
    separator="\t",
)
eigenvec

In [None]:
# Read the ID-updated sample sheet (parsed as tab-separated text).
# NOTE(review): the following cell rebinds `metadata` to
# "Hoiho_Genomes_Cleaned.csv", so this frame is only displayed here.
metadata = pl.read_csv(
    "DNA from hoiho genomesv2_id_updated_10Oct2024.csv",
    separator="\t",
)
metadata

In [None]:
# Read the cleaned sample sheet (parsed as tab-separated text); this is the
# `metadata` frame joined against the PCA coordinates below.
metadata = pl.read_csv(
    "Hoiho_Genomes_Cleaned.csv",
    separator="\t",
)
metadata

In [None]:
# Join the PCA coordinates with the sample metadata.
# The eigenvec table names its sample column "#IID" while the metadata uses
# "ID", so rename it first (DataFrame.rename takes an old -> new mapping).
# NOTE: re-running this cell without reloading `eigenvec` will fail, because
# "#IID" no longer exists after the first rename (rename is strict by default).
eigenvec = eigenvec.rename({"#IID": "ID"})

# Inner join: keep only samples present in both tables.
joined = eigenvec.join(metadata, how="inner", on="ID")

In [None]:
# Display the (rows, columns) shape of the inner join.
joined.shape

In [None]:
# Two samples still appear to be missing from the inner join, so inspect the
# unmatched rows: full-join both tables and keep the rows whose eigenvec-side
# "ID" is null.
# NOTE(review): presumably these are metadata rows with no matching eigenvec
# sample (full join without key coalescing) - confirm against the output.
missing = (
    eigenvec
    .join(metadata, on="ID", how="full")
    .filter(pl.col("ID").is_null())
)
missing


In [None]:
# Two metadata IDs do not match the eigenvec sample names:
#   * "P29 " carries a trailing space        -> "P29"
#   * "C101/CE9" should just be called "CE9" -> "CE9"
# Apply both exact-value replacements to the "ID" column in one pass.
id_fixes = {
    "P29 ": "P29",
    "C101/CE9": "CE9",
}
metadata = metadata.with_columns(pl.col("ID").replace(id_fixes))

In [None]:
# Repeat the inner join against the current `metadata` and display the
# resulting shape.
joined = eigenvec.join(metadata, how="inner", on="ID")
joined.shape

In [None]:
def plot_pca(data, eigenvals, which, color_by, title):
    """Scatter two principal components against each other with plotly express.

    Parameters
    ----------
    data
        Frame containing the "PC<n>" columns to plot, an "ID" column (used as
        the hover label) and the ``color_by`` column.
    eigenvals
        Frame whose "column_1" column holds one value per component; row
        ``n - 1`` is taken as the value for PC ``n``.
    which
        Sequence whose first two items are the 1-based component numbers for
        the x and y axes, e.g. ``[1, 2]`` plots PC1 against PC2.
    color_by
        Name of the column in ``data`` used to colour the points.
    title
        Description inserted into the figure title.

    Returns
    -------
    The figure object returned by ``px.scatter``.
    """
    pc_x, pc_y = which[0], which[1]
    x_var = "PC{}".format(pc_x)
    y_var = "PC{}".format(pc_y)

    # Label each axis with the value belonging to the component actually
    # plotted (1-based PC number -> 0-based row), not always the first two rows.
    # NOTE(review): the value is printed with a "%" suffix; confirm whether the
    # eigenval file holds percentages or raw eigenvalues.
    pc_values = eigenvals["column_1"]
    x_axis_label = "{} ({:.2f}%)".format(x_var, pc_values[pc_x - 1])
    y_axis_label = "{} ({:.2f}%)".format(y_var, pc_values[pc_y - 1])

    title = "Hoiho Early - {} - {}".format(title, color_by)

    fig = px.scatter(
        data,
        x=x_var,
        y=y_var,
        # Take hover labels from the frame being plotted rather than from the
        # notebook-level `joined`, so they always line up with the points.
        hover_name=data["ID"],
        color=color_by,
        labels={x_var: x_axis_label, y_var: y_axis_label},
        title=title,
    )

    return fig

In [None]:
# PC1 vs PC2, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Subs v Mainland",
    title="Least Filtered",
)

In [None]:
# PC2 vs PC3, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[2, 3],
    color_by="Subs v Mainland",
    title="Least Filtered",
)

In [None]:
# Switch to the "removed" PCA run. Read and rename in one expression so the
# cell can be re-run without failing on an already-renamed column.
eigenvec = pl.read_csv("removed.eigenvec", separator="\t").rename({"#IID": "ID"})

# Load the eigenvalues of the same run so the axis labels produced by plot_pca
# describe this PCA rather than the previously loaded one.
# NOTE(review): assumes "removed.eigenval" sits next to "removed.eigenvec"
# (plink2 --pca writes both with the same prefix) - confirm.
eigenval = pl.read_csv("removed.eigenval", separator="\t", has_header=False)

joined = eigenvec.join(metadata, on="ID", how="inner")

In [None]:
# PC1 vs PC2, coloured by "Sex".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Sex",
    title="Two outliers removed",
)

In [None]:
# PC2 vs PC3, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[2, 3],
    color_by="Subs v Mainland",
    title="Two outliers removed",
)

In [None]:
# Switch to the "removed_qual99" PCA run. Read and rename in one expression so
# the cell can be re-run without failing on an already-renamed column.
eigenvec = pl.read_csv("removed_qual99.eigenvec", separator="\t").rename(
    {"#IID": "ID"}
)

# Load the eigenvalues of the same run so the axis labels produced by plot_pca
# describe this PCA rather than the previously loaded one.
# NOTE(review): assumes "removed_qual99.eigenval" sits next to the eigenvec
# file (plink2 --pca writes both with the same prefix) - confirm.
eigenval = pl.read_csv("removed_qual99.eigenval", separator="\t", has_header=False)

joined = eigenvec.join(metadata, on="ID", how="inner")

In [None]:
# PC1 vs PC2, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Subs v Mainland",
    title="Quality 99",
)

In [None]:
# PC2 vs PC3, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[2, 3],
    color_by="Subs v Mainland",
    title="Quality 99",
)

In [None]:
# PC1 vs PC2, coloured by "Health status".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Health status",
    title="Quality 99",
)

In [None]:
# Interactive 3-D scatter of the first three principal components of the
# current `joined` frame. The columns are passed as series, so the generic
# "x"/"y"/"z" axis names are mapped back to the PC names via `labels`.
px.scatter_3d(
    title="PCA of hoiho genomes",
    x=joined["PC1"],
    y=joined["PC2"],
    z=joined["PC3"],
    labels={"x": "PC1", "y": "PC2", "z": "PC3"},
    color=joined["Subs v Mainland"],
    hover_name=joined["ID"],
)

In [None]:
# Now removed_qual99_gq99_fmissing0.1. Read and rename in one expression so
# the cell can be re-run without failing on an already-renamed column.
eigenvec = pl.read_csv(
    "removed_qual99_gq99_fmissing0.1.eigenvec", separator="\t"
).rename({"#IID": "ID"})

# Load the eigenvalues of the same run so the axis labels produced by plot_pca
# describe this PCA rather than the previously loaded one.
# NOTE(review): assumes the matching .eigenval file sits next to the eigenvec
# file (plink2 --pca writes both with the same prefix) - confirm.
eigenval = pl.read_csv(
    "removed_qual99_gq99_fmissing0.1.eigenval", separator="\t", has_header=False
)

joined = eigenvec.join(metadata, on="ID", how="inner")

In [None]:
# PC1 vs PC2, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Subs v Mainland",
    title="Quality 99, GQ 99, Fmissing 0.1",
)

In [None]:
# PC2 vs PC3, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[2, 3],
    color_by="Subs v Mainland",
    title="Quality 99, GQ 99, Fmissing 0.1",
)

In [None]:
# PC1 vs PC2, coloured by "Health status".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Health status",
    title="Quality 99, GQ 99, Fmissing 0.1",
)

In [None]:
# Switch to the removed_qual99_maf0.05 run: read its eigenvec table, rename
# "#IID" to "ID" to match the metadata, and inner-join the two.
eigenvec = pl.read_csv("removed_qual99_maf0.05.eigenvec", separator="\t")
eigenvec = eigenvec.rename({"#IID": "ID"})
joined = eigenvec.join(metadata, how="inner", on="ID")

# Also read this run's header-less eigenval file.
eigenval = pl.read_csv(
    "removed_qual99_maf0.05.eigenval",
    has_header=False,
    separator="\t",
)

In [None]:
# PC1 vs PC2, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Subs v Mainland",
    title="Quality 99, MAF 0.05",
)

In [None]:
# PC2 vs PC3, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[2, 3],
    color_by="Subs v Mainland",
    title="Quality 99, MAF 0.05",
)

In [None]:
# PC1 vs PC2, coloured by "Location".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Location",
    title="Quality 99, MAF 0.05",
)

In [None]:
# PC1 vs PC2, coloured by "Season".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Season",
    title="Quality 99, MAF 0.05",
)

In [None]:
# Switch to the "no_sex_chrs" PCA run. Read and rename in one expression so
# the cell can be re-run without failing on an already-renamed column.
eigenvec = pl.read_csv("no_sex_chrs.eigenvec", separator="\t").rename({"#IID": "ID"})

# Read the header-less eigenval file with the same tab separator used for
# every other eigenval load in this notebook.
eigenval = pl.read_csv("no_sex_chrs.eigenval", separator="\t", has_header=False)

joined = eigenvec.join(metadata, on="ID", how="inner")

In [None]:
# PC1 vs PC2, coloured by "Subs v Mainland".
plot_pca(
    data=joined,
    eigenvals=eigenval,
    which=[1, 2],
    color_by="Subs v Mainland",
    title="No Sex Chrs",
)