In [None]:
# Plotting (seaborn, plotly), data frames (polars) and arrays (numpy)
import seaborn as sns
import polars as pl
import plotly.express as px
import numpy as np

import plotly.io as pio

# Set plotly's default renderer to the combined "plotly_mimetype+notebook_connected" string.
# NOTE(review): presumably chosen so figures display in more than one notebook front end — confirm.
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [None]:
# Load the sample metadata (tab-separated, despite the .csv extension)
metadata = pl.read_csv(
    "DNA from hoiho genomesv2_id_updated_10Oct2024.csv",
    separator="\t",
)

# Two IDs need normalising before they can be matched against other files:
#   "P29 "     has a space at the end        -> "P29"
#   "C101/CE9" should just be called "CE9"   -> "CE9"
# Values not listed in the mapping are left unchanged.
metadata = metadata.with_columns(
    pl.col("ID").replace({"P29 ": "P29", "C101/CE9": "CE9"})
)

metadata

In [None]:
# Read the headerless, tab-separated `sparsity` file, naming its columns "Contig" and "ID"
sparsity = pl.read_csv(
    "sparsity",
    separator="\t",
    has_header=False,
    new_columns=["Contig", "ID"],
)

# Attach the metadata to every sparsity row; an inner join keeps only IDs found in both frames
joined = metadata.join(sparsity, on="ID", how="inner")

In [None]:
# Load a previously saved matrix from "matrix.npy".
# NOTE(review): a later cell rebuilds `matrix` from scratch, and the np.save call
# that would write this file is commented out — confirm the file exists before
# relying on this cell in a fresh run.
matrix = np.load("matrix.npy")

## Command used to generate the `sparsity` file
```
bcftools +check-sparsity --n-markers 100 merged.unfiltered.bcf > sparsity
```


In [None]:
# Build a presence/absence matrix with individuals as rows and contigs as columns.
# A cell is 1 when `joined` holds at least one row for that (ID, Contig) pair, else 0.

# Column labels: every distinct contig in the sparsity table, sorted
contigs = sparsity["Contig"].unique().sort().to_list()

# Row labels: every distinct ID in the metadata, sorted
individuals = metadata["ID"].unique().sort().to_list()

# Label -> position lookups so each pair can be placed without rescanning a frame
row_of = {individual: i for i, individual in enumerate(individuals)}
col_of = {contig: j for j, contig in enumerate(contigs)}

# Start from zeros; individuals with no rows in `joined` keep an all-zero row
matrix = np.zeros((len(individuals), len(contigs)))

# One pass over the distinct (ID, Contig) pairs replaces the previous
# per-cell DataFrame filters (len(individuals) * len(contigs) of them).
# Null labels are dropped because the old equality filters never matched a null.
pairs = joined.select("ID", "Contig").drop_nulls().unique()
for individual, contig in pairs.iter_rows():
    i = row_of.get(individual)
    j = col_of.get(contig)
    # Pairs whose labels are not in the row/column lists are ignored, as before
    if i is not None and j is not None:
        matrix[i, j] = 1

In [None]:
# np.save("matrix.npy", matrix)
# metadata

In [None]:
# Sort individuals by "Subs v Mainland" and reorder the matrix rows to match
metadata = metadata.sort("Subs v Mainland")

# Row order after sorting: one entry per metadata row
new_order = metadata["ID"].to_list()

# Row order that `matrix` was built in
old_order = individuals

# ID -> row position in the old order (constant-time lookup instead of list.index)
old_position = {individual: idx for idx, individual in enumerate(old_order)}

# Reorder matrix: pick the old row for every entry of the new order.
# Sizing the result by `new_order` means an ID that appears more than once in the
# metadata gets one row per occurrence rather than overflowing a fixed-size array.
row_index = np.array([old_position[individual] for individual in new_order], dtype=int)
new_matrix = matrix[row_index, :].astype(float)

# Number of rows labelled "Mainland".
# NOTE(review): this is the index where the Mainland group ends only if those rows
# sort first, i.e. no nulls or other labels sort ahead of "Mainland" — confirm.
border = metadata.filter(pl.col("Subs v Mainland") == "Mainland").shape[0]
border


In [None]:
# Draw the reordered matrix as a heatmap on a 14 x 14 figure
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(14, 14))
# Target the axes created above explicitly instead of relying on the current-axes state
sns.heatmap(new_matrix, ax=ax)
plt.show()

In [None]:
# Shape of the transposed matrix, i.e. (columns, rows) of `matrix`
matrix.T.shape

In [None]:
# Project the rows of `matrix` onto their first two principal components and plot them
from sklearn.decomposition import PCA

# fit() returns the fitted estimator, so construction and fitting can be chained
pca = PCA(n_components=2).fit(matrix)

# Coordinates of every row in the 2-component space
transformed = pca.transform(matrix)

# One point per row of `matrix`, coloured by the matching entry of `individuals`
px.scatter(
    x=transformed[:, 0],
    y=transformed[:, 1],
    color=individuals,
)

In [None]:
# Read the tab-separated `guess_ploidy` file, skipping its first 3 rows and naming
# the 7 columns explicitly ("_" and "__" are placeholders for columns 1 and 3).
# Column layout noted for reference:
# [1]SEX        [2]Sample       [3]Predicted sex        [4]log P(Haploid)/nSites        [5]log P(Diploid)/nSites        [6]nSites       [7]Score: F < 0 < M ($4-$5)
guess_ploidy = pl.read_csv(
    "guess_ploidy",
    separator="\t",
    has_header=False,
    skip_rows=3,
    new_columns=[
        "_",
        "ID",
        "__",
        "log P(Haploid)/nSites",
        "log P(Diploid)/nSites",
        "nSites",
        "Score",
    ],
)
guess_ploidy

In [None]:
# Scatter of the "log P(Haploid)/nSites" column, coloured by ID
px.scatter(
    y=guess_ploidy.get_column("log P(Haploid)/nSites"),
    color=guess_ploidy.get_column("ID"),
)

In [None]:
# Read the first column of the headerless "all_contigs" file as a list of names
all_contigs = pl.read_csv("all_contigs", has_header=False)[:, 0].to_list()
# Remove the starting ">" — only when it is present, so a name without the
# marker does not silently lose its first character
all_contigs = [
    contig[1:] if contig.startswith(">") else contig for contig in all_contigs
]
# Show the first two names as a sanity check (a bare expression in the middle
# of a cell is evaluated but never displayed, so print it)
print(all_contigs[0:2])
len(all_contigs)

In [None]:
# Read the first column of the headerless "possible_sex_chrs" file as a list of names
possible_sex_chrs = pl.read_csv("possible_sex_chrs", has_header=False)[:, 0].to_list()

# Remove possible sex chrs from all contigs.
# Membership is tested against a set so each lookup is constant time instead of
# a scan of the whole list; the surviving contigs and their order are unchanged.
sex_chr_lookup = set(possible_sex_chrs)
all_contigs = [contig for contig in all_contigs if contig not in sex_chr_lookup]
len(all_contigs)
               

In [None]:
# Write the remaining contig names to "non_sex_contigs", one newline-terminated name per line
with open("non_sex_contigs", "w") as handle:
    handle.writelines(contig + "\n" for contig in all_contigs)
