In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import itertools
import plotly.express as px
import polars as pl

In [None]:
# Path to the merged, filtered BCF that the per-population `bcftools view`
# step below reads as its input.
BCF_FILE = "../merged.a9.filtered.qual20_fmissing0.2.2alleles.snpsonly.pp6.19.n23_25_yep14.removed.bcf"

# Split BCF into pop assignments

In [None]:
# Space-separated sample table; the next cell uses its "ID" and "Population3"
# columns to build one sample list per population.
pops_df = pl.read_csv("../pop_assignments.txt", separator=" " )
pops_df.head()

In [None]:
# Write output file for each pop
# For every distinct value of "Population3": write the sample list, subset the
# merged BCF to those samples, then compute SNP density on the subset.
# The three steps depend on each other's output files, so their order matters.
for pop in pops_df["Population3"].unique():
    pop_df = pops_df.filter(pl.col("Population3") == pop)
    # Only write out the sample ID, one per line
    pop_df.select("ID").write_csv(f"pop_samples_{pop}.txt", separator="\t", include_header=False)
    # Produces pop_{pop}.vcf.gz (-Oz) from BCF_FILE restricted to the samples in -S.
    # NOTE(review): per the bcftools manual, --force-samples only warns about IDs
    # missing from the BCF, -a trims unused ALT alleles, -q 0.05:minor keeps sites
    # with minor-allele frequency >= 0.05 and -U drops sites with no called
    # genotype -- confirm against the installed bcftools version.
    # NOTE(review): {pop} is interpolated into the shell command unquoted, so a
    # population name containing spaces or shell metacharacters would break it.
    !bcftools view --threads 48 -S pop_samples_{pop}.txt --force-samples -o pop_{pop}.vcf.gz -Oz {BCF_FILE} -a -q 0.05:minor -U
    # SNP density in 100 kb bins; the conversion cell below reads pop_{pop}.snpden.
    !vcftools --SNPdensity 100000 --gzvcf pop_{pop}.vcf.gz --out pop_{pop}

## Convert to Circos format

In [None]:
def convert_snpden_to_circos(pop_name, pop_snpden_file, bin_size=100_000, max_trim=None):
    """Convert a vcftools SNP-density table into a Circos data track.

    Reads the tab-separated ``pop_snpden_file``, prints the mean and standard
    deviation of its ``VARIANTS/KB`` column, and writes
    ``{pop_name}.snpden.txt`` with one tab-separated, header-less line per
    input row: ``CHROM``, ``BIN_START``, ``BIN_START + bin_size`` and the
    ``VARIANTS/KB`` value. Values above ``max_trim`` are clamped to it, and
    every value is rounded to 4 decimals. Finally prints the largest
    *unclamped* ``VARIANTS/KB`` value seen.

    Parameters
    ----------
    pop_name : str
        Prefix of the output file.
    pop_snpden_file : str
        Path of the density table (needs ``CHROM``, ``BIN_START`` and
        ``VARIANTS/KB`` columns).
    bin_size : int, default 100_000
        Width added to ``BIN_START`` to obtain the bin end. Must match the
        window passed to ``vcftools --SNPdensity``.
    max_trim : float, optional
        Clamp threshold. When omitted, the previously hard-coded value
        (largest per-population mean + 2.5 * largest per-population standard
        deviation) is used so that all tracks share one scale.

    Returns
    -------
    None
    """
    if max_trim is None:
        # Take the largest mean from any pop, and the largest std.dev from any pop
        max_trim = 0.5863066764490095 + (2.5 * 0.5368753908840088)

    snpden_df = pl.read_csv(pop_snpden_file, separator="\t")
    density = snpden_df["VARIANTS/KB"]
    print(f"Mean: {density.mean()}")
    print(f"Std Dev: {density.std()}")

    max_variant_count = 0
    with open(f"{pop_name}.snpden.txt", "w") as f:
        # iter_rows(named=True) yields one dict per row, avoiding a separate
        # frame lookup for every cell.
        for row in snpden_df.iter_rows(named=True):
            bin_start = row["BIN_START"]
            variant_count = row["VARIANTS/KB"]
            # Track the raw maximum before clamping so the printed value shows
            # how much the trim actually cuts off.
            max_variant_count = max(max_variant_count, variant_count)
            if variant_count > max_trim:
                variant_count = max_trim
            f.write(f"{row['CHROM']}\t{bin_start}\t{bin_start + bin_size}\t{round(variant_count, 4)}\n")
    print(max_variant_count)

# Convert the SNP-density table of each of the three populations in turn.
for snpden_prefix in ("pop_Northern", "pop_Campbell", "pop_Enderby"):
    convert_snpden_to_circos(snpden_prefix, f"{snpden_prefix}.snpden")




In [None]:
# SNP density output should look like this:

# S1 0 100000 0.05
# S1 100000 200000 0.01
# S1 200000 300000 0.05
# S1 300000 400000 0.98
# S1 400000 500000 2
# S1 500000 600000 3.89
# S1 600000 700000 2.88
# S1 700000 800000 3.54
# S1 800000 900000 4.71

# Write one such file per population, though.

## Fst

In [None]:
!vcftools --gzvcf ../merged.a9.filtered.qual20_fmissing0.2.2alleles.snpsonly.pp6.19.n23_25_yep14.removed.vcf.gz --weir-fst-pop pop_samples_Campbell.txt --weir-fst-pop pop_samples_Enderby.txt  --fst-window-size 100000 --out campbell_vs_enderby
!vcftools --gzvcf ../merged.a9.filtered.qual20_fmissing0.2.2alleles.snpsonly.pp6.19.n23_25_yep14.removed.vcf.gz --weir-fst-pop pop_samples_Campbell.txt --weir-fst-pop pop_samples_Northern.txt --fst-window-size 100000 --out campbell_vs_northern
!vcftools --gzvcf ../merged.a9.filtered.qual20_fmissing0.2.2alleles.snpsonly.pp6.19.n23_25_yep14.removed.vcf.gz --weir-fst-pop pop_samples_Enderby.txt  --weir-fst-pop pop_samples_Northern.txt --fst-window-size 100000 --out enderby_vs_northern

In [None]:
# Circos heatmap format is: S1      1666378 1766263 15

def process_fst_to_circos(pop_name, pop_file):
    """Convert a windowed Fst table into a Circos heatmap track.

    Reads the tab-separated ``pop_file`` and writes
    ``{pop_name}.fst.circos.txt`` with one tab-separated, header-less line per
    input row: ``CHROM``, ``BIN_START``, ``BIN_END`` and ``WEIGHTED_FST``
    rounded to 4 decimals. Afterwards prints the unrounded maximum and
    minimum ``WEIGHTED_FST``.

    Parameters
    ----------
    pop_name : str
        Label used for the output file name and in the printed summary.
    pop_file : str
        Path of the windowed Fst table.

    Returns
    -------
    None

    Notes
    -----
    For a table with no rows the summary prints ``-inf`` / ``inf``.
    """
    fst_df = pl.read_csv(pop_file, separator="\t")
    # Start from +/-inf rather than 0.0: Fst estimates can be negative, and a
    # 0.0 seed would be reported as the maximum of an all-negative table.
    cur_min = float("inf")
    cur_max = float("-inf")
    with open(f"{pop_name}.fst.circos.txt", "w") as f:
        for row in fst_df.iter_rows(named=True):
            weighted_fst = row["WEIGHTED_FST"]
            # Output CHROM, BIN_START, BIN_END, WEIGHTED_FST
            f.write(f"{row['CHROM']}\t{row['BIN_START']}\t{row['BIN_END']}\t{round(weighted_fst, 4)}\n")
            # Keep the running value as the first argument: max/min return the
            # first argument when the comparison is False, so a NaN value can
            # never replace the running extreme.
            cur_max = max(cur_max, weighted_fst)
            cur_min = min(cur_min, weighted_fst)
    print(f"{pop_name} - Max: {cur_max}, Min: {cur_min}")

# One Circos Fst track per pairwise comparison.
for comparison, fst_table in (
    ("Campbell_vs_Northern", "campbell_vs_northern.windowed.weir.fst"),
    ("Campbell_vs_Enderby", "campbell_vs_enderby.windowed.weir.fst"),
    ("Enderby_vs_Northern", "enderby_vs_northern.windowed.weir.fst"),
):
    process_fst_to_circos(comparison, fst_table)

In [None]:
!vcftools --gzvcf pop_Campbell.vcf.gz --out campbell --het
!vcftools --gzvcf pop_Campbell.vcf.gz --out campbell --missing-site
!vcftools --gzvcf pop_Campbell.vcf.gz --out campbell --TajimaD 100000 
!vcftools --gzvcf pop_Campbell.vcf.gz --out campbell --site-pi 
!vcftools --gzvcf pop_Campbell.vcf.gz --out campbell --window-pi 100000

# pop_Enderby.vcf.gz and pop_Northern.vcf.gz
!vcftools --gzvcf pop_Enderby.vcf.gz --out enderby --het
!vcftools --gzvcf pop_Enderby.vcf.gz --out enderby --missing-site
!vcftools --gzvcf pop_Enderby.vcf.gz --out enderby --TajimaD 100000
!vcftools --gzvcf pop_Enderby.vcf.gz --out enderby --site-pi
!vcftools --gzvcf pop_Enderby.vcf.gz --out enderby --window-pi 100000

!vcftools --gzvcf pop_Northern.vcf.gz --out northern --het
!vcftools --gzvcf pop_Northern.vcf.gz --out northern --missing-site
!vcftools --gzvcf pop_Northern.vcf.gz --out northern --TajimaD 100000
!vcftools --gzvcf pop_Northern.vcf.gz --out northern --site-pi
!vcftools --gzvcf pop_Northern.vcf.gz --out northern --window-pi 100000

In [None]:
def process_tajimasd_to_circos(pop_name, pop_file, bin_size=100_000):
    """Convert a windowed Tajima's D table into a Circos data track.

    Reads the tab-separated ``pop_file`` and writes
    ``{pop_name}.tajimasd.circos.txt`` with one tab-separated, header-less
    line per input row: ``CHROM``, ``BIN_START``, ``BIN_START + bin_size`` and
    ``TajimaD`` rounded to 4 decimals. Afterwards prints the unrounded maximum
    and minimum ``TajimaD``.

    Parameters
    ----------
    pop_name : str
        Label used for the output file name and in the printed summary.
    pop_file : str
        Path of the Tajima's D table (needs ``CHROM``, ``BIN_START`` and
        ``TajimaD`` columns).
    bin_size : int, default 100_000
        Width added to ``BIN_START`` to obtain the bin end. Must match the
        window passed to ``vcftools --TajimaD``.

    Returns
    -------
    None

    Notes
    -----
    For a table with no rows the summary prints ``-inf`` / ``inf``.
    """
    tajima_df = pl.read_csv(pop_file, separator="\t")
    # Start from +/-inf rather than 0.0: Tajima's D is frequently negative in
    # every window, and a 0.0 seed would then be reported as the maximum.
    cur_max = float("-inf")
    cur_min = float("inf")
    with open(f"{pop_name}.tajimasd.circos.txt", "w") as f:
        for row in tajima_df.iter_rows(named=True):
            bin_start = row["BIN_START"]
            tajima_d = row["TajimaD"]
            # Output CHROM, BIN_START, BIN_START + bin_size, TajimaD
            f.write(f"{row['CHROM']}\t{bin_start}\t{bin_start + bin_size}\t{round(tajima_d, 4)}\n")
            # Keep the running value as the first argument: max/min return the
            # first argument when the comparison is False, so a NaN value can
            # never replace the running extreme.
            # NOTE(review): a NaN TajimaD is still written to the track as
            # "nan"; confirm whether such windows should be skipped instead.
            cur_max = max(cur_max, tajima_d)
            cur_min = min(cur_min, tajima_d)
    print(f"{pop_name} Tajima's D - Max: {cur_max}, Min: {cur_min}")

# One Tajima's D track per population; the input tables use the lower-cased
# population name as their prefix.
for population in ("Campbell", "Enderby", "Northern"):
    process_tajimasd_to_circos(population, f"{population.lower()}.Tajima.D")

In [None]:
# Tab-separated windowed Fst table; the cells below read its CHROM, BIN_START,
# BIN_END and MEAN_FST columns.
fst = pd.read_csv("Fst_founders_offspring.windowed.weir.fst", sep="\t")
fst.head()

In [None]:
# Export the windowed Fst table as a header-less, tab-separated scatter track:
# CHROM, BIN_START, BIN_END, MEAN_FST.
scatter_columns = fst[["CHROM", "BIN_START", "BIN_END", "MEAN_FST"]]
with open("founders_vs_offspring_fst.scatter", "w") as writer:
    # itertuples() keeps each column's own dtype. iterrows() builds one Series
    # per row with a single common dtype, so when every column is numeric the
    # integer bin coordinates are written as floats (e.g. "100001.0").
    for chrom, bin_start, bin_end, mean_fst in scatter_columns.itertuples(index=False, name=None):
        writer.write(f"{chrom}\t{bin_start}\t{bin_end}\t{mean_fst}\n")

In [None]:
# Smallest and largest MEAN_FST across all windows, displayed as [min, max].
[np.min(fst['MEAN_FST']), np.max(fst['MEAN_FST'])]

In [None]:
# Whitespace-delimited table without a header row, so columns are labelled
# 0, 1, 2, ...; display [min, max] of column 3 (the fourth column).
# The separator is a raw string: "\s" is not a valid escape sequence in an
# ordinary string literal and triggers a warning on recent Python versions.
snpden = pd.read_csv("snpden.hist", sep=r"\s+", header=None)
[np.min(snpden[3]), np.max(snpden[3])]

In [None]:
# Tab-separated table without a header row; the first 4 lines of the file are
# skipped. The cells below use columns 2, 3 and 4 as (chromosome, start, end).
# NOTE(review): presumably `bcftools roh` region output, with the 4 skipped
# lines being its comment header -- confirm the header length is always 4.
roh_data = pd.read_csv("data/bcftoolsroh_founders", sep="\t", header=None, skiprows=4)
roh_data.head()

In [None]:
# Tally how many rows share each (column 2, column 3, column 4) triple.
roh = defaultdict(int)

for segment in zip(roh_data[2], roh_data[3], roh_data[4]):
    roh[segment] += 1

In [None]:
# Peek at one key of `roh` (a (column 2, column 3, column 4) tuple).
# Raises IndexError if `roh` holds fewer than 16 distinct keys.
list(roh.keys())[15]

In [None]:
# Founders ROH heatmap: for every chromosome, count how many ROH segments cover
# each position and write each run of constant depth as a tab-separated line
# "chrom, first position, last position, depth".

# Collect segments per chromosome with a dict. itertools.groupby only merges
# *adjacent* equal keys, so on keys that are not sorted by chromosome it would
# emit the same chromosome several times, each with partial counts.
segments_by_chrom = defaultdict(list)
for (chrom, seg_start, seg_end), n_copies in roh.items():
    # `roh` stores how many rows share an identical segment; count each of them.
    segments_by_chrom[chrom].extend([(seg_start, seg_end)] * n_copies)

with open("founders_roh.heatmap", "w") as writer:
    for chrom, segments in segments_by_chrom.items():
        # NOTE(review): np.arange excludes seg_end, exactly as before; confirm
        # whether the end coordinate in the input is meant to be inclusive.
        # `roh_positions` and `intervals` stay module-level because later cells
        # read them (they describe the last chromosome processed).
        roh_positions = np.concatenate([np.arange(seg_start, seg_end) for seg_start, seg_end in segments])
        intervals = {}

        if roh_positions.size:
            # np.unique returns the positions in ascending order with their
            # multiplicity; the run detection below relies on that order.
            positions, depth = np.unique(roh_positions, return_counts=True)
            # A new run starts wherever coverage is not contiguous or the depth changes.
            is_break = (np.diff(positions) > 1) | (np.diff(depth) != 0)
            run_first = np.concatenate(([0], np.flatnonzero(is_break) + 1))
            run_last = np.concatenate((run_first[1:] - 1, [positions.size - 1]))
            for first, last in zip(run_first, run_last):
                intervals[(positions[first], positions[last])] = depth[first]

        for (run_start, run_end), run_depth in intervals.items():
            writer.write(f"{chrom}\t{run_start}\t{run_end}\t{run_depth}\n")


In [None]:
# Display the positions array left in `roh_positions` by the last iteration of
# the heatmap export loop above.
roh_positions

In [None]:
# Rebuild the constant-depth runs for `roh_positions`, i.e. for whichever group
# the heatmap export loop above processed last.
counts = Counter(roh_positions)

intervals = defaultdict(int)

# Counter keeps keys in first-seen order, which is not ascending when segments
# overlap or are listed out of order. The merge below compares each position
# with the previous one, so it must walk the positions in ascending order.
ordered_positions = sorted(counts)

# Guard against an empty array instead of failing on the first-element lookup.
if ordered_positions:
    istart = iend = ordered_positions[0]
    qty = counts[istart]

    for k in ordered_positions[1:]:
        v = counts[k]
        if k - iend > 1 or v != qty:
            # Gap in coverage or change of depth: close the current run.
            intervals[(istart, iend)] = qty
            istart = k
            qty = v
        iend = k

    # And the final one...
    intervals[(istart, iend)] = qty

In [None]:
# This cell used to reopen "founders_roh.heatmap" in "w" mode and dump
# `intervals` under the hard-coded chromosome label "S1". `intervals` only
# describes the last group handled by the heatmap export loop, so on a
# top-to-bottom run the complete per-chromosome file written by that loop was
# replaced by a single group with a possibly wrong label.
# The export loop remains the only writer of the file; here we just preview
# the first few runs.
list(intervals.items())[:10]


In [None]:
# Largest depth among the runs currently stored in `intervals`.
np.max(list(intervals.values()))

In [None]:
# Load the offspring table (tab-separated, no header row, first 4 lines
# skipped) and tally how many rows share each (column 2, column 3, column 4)
# triple. This rebinds `roh_data` and `roh`, replacing the founders' values.
roh_data = pd.read_csv("data/bcftoolsroh_offspring", sep="\t", header=None, skiprows=4)
roh = defaultdict(int)

for segment in zip(roh_data[2], roh_data[3], roh_data[4]):
    roh[segment] += 1

In [None]:
# Offspring ROH heatmap: for every chromosome, count how many ROH segments
# cover each position and write each run of constant depth as a tab-separated
# line "chrom, first position, last position, depth".

# Collect segments per chromosome with a dict. itertools.groupby only merges
# *adjacent* equal keys, so on keys that are not sorted by chromosome it would
# emit the same chromosome several times, each with partial counts.
segments_by_chrom = defaultdict(list)
for (chrom, seg_start, seg_end), n_copies in roh.items():
    # `roh` stores how many rows share an identical segment; count each of them.
    segments_by_chrom[chrom].extend([(seg_start, seg_end)] * n_copies)

with open("offspring_roh.heatmap", "w") as writer:
    for chrom, segments in segments_by_chrom.items():
        # NOTE(review): np.arange excludes seg_end, exactly as before; confirm
        # whether the end coordinate in the input is meant to be inclusive.
        # `roh_positions` and `intervals` stay module-level, so after the loop
        # they describe the last chromosome processed.
        roh_positions = np.concatenate([np.arange(seg_start, seg_end) for seg_start, seg_end in segments])
        intervals = {}

        if roh_positions.size:
            # np.unique returns the positions in ascending order with their
            # multiplicity; the run detection below relies on that order.
            positions, depth = np.unique(roh_positions, return_counts=True)
            # A new run starts wherever coverage is not contiguous or the depth changes.
            is_break = (np.diff(positions) > 1) | (np.diff(depth) != 0)
            run_first = np.concatenate(([0], np.flatnonzero(is_break) + 1))
            run_last = np.concatenate((run_first[1:] - 1, [positions.size - 1]))
            for first, last in zip(run_first, run_last):
                intervals[(positions[first], positions[last])] = depth[first]

        for (run_start, run_end), run_depth in intervals.items():
            writer.write(f"{chrom}\t{run_start}\t{run_end}\t{run_depth}\n")

# The cell previously went on to rebuild the runs for the last group only and
# reopen "offspring_roh.heatmap" in "w" mode, writing them under the hard-coded
# label "S1". That discarded every other chromosome written above, so the
# second write has been dropped.
