In [None]:
import gzip
import tqdm

In [None]:
# Build, per chromosome, a lookup of every site in the all-samples mafs file:
#   major_minor_dict[chrom][position (str)] = [field 2, field 3]
# The two stored fields are left as raw bytes (only the position is decoded).
major_minor_dict = {chrom: {} for chrom in snakemake.params['chrom']}
maf_prefix = snakemake.params["maf_prefix"]

for chrom in snakemake.params["chrom"]:
    glob_maf_path = f"{maf_prefix}/allSamples/{chrom}/{chrom}_allSamples_snps.mafs.gz"

    # Use a context manager so the gzip handle is closed deterministically,
    # and stream the file instead of materialising every line in memory.
    with gzip.open(glob_maf_path, 'rb') as glob_mafs:
        next(glob_mafs, None)  # first line is skipped (treated as a header)
        for l in glob_mafs:
            sl = l.strip().split(b"\t")
            pos = sl[1].decode("utf-8")
            REF = sl[2]
            ALT = sl[3]
            major_minor_dict[chrom][pos] = [REF, ALT]

In [None]:
def extract_positions(city, habitat):
    """Index the sites listed in one city/habitat's per-chromosome ``.pos.gz`` files.

    Reads ``snakemake.params['chrom']`` and the module-level ``maf_prefix``.

    Args:
        city: City name used to build the file path.
        habitat: Habitat name used to build the file path.

    Returns:
        dict: ``{chrom: {position (str): line_index (int)}}`` where
        ``line_index`` is the 0-based line number of the site within the
        ``.pos.gz`` file (line 0 is skipped, so indices start at 1).
    """
    pos_index_dict = {chrom: {} for chrom in snakemake.params['chrom']}
    for chrom in snakemake.params["chrom"]:
        pos_path = f"{maf_prefix}/byCity/{city}/{chrom}/{city}_{habitat}_{chrom}_snps.pos.gz"
        chrom_index = pos_index_dict[chrom]
        # The handle gets its own name: the original reused ``pos`` for both
        # the open file and the decoded position string.
        with gzip.open(pos_path, 'rb') as pos_file:
            # Stream line by line rather than loading the whole file.
            for i, l in enumerate(pos_file):
                if i == 0:
                    continue  # first line is skipped (treated as a header)
                sl = l.strip().split(b"\t")
                chrom_index[sl[1].decode('utf-8')] = i
    return pos_index_dict

# city_pos_index_dict[city][habitat] -> result of extract_positions(city, habitat)
city_pos_index_dict = {}
for city in tqdm.tqdm(snakemake.params["cities"]):
    habitat_indices = city_pos_index_dict.setdefault(city, {})
    for hab in snakemake.params["habitats"]:
        habitat_indices[hab] = extract_positions(city, hab)

In [None]:
# missing_dict[city][habitat][chrom] -> list of global positions absent from
# that city/habitat; starts empty and is appended to by
# map_global_site_to_city_pos_indices.
missing_dict = {}
for city in snakemake.params["cities"]:
    missing_dict[city] = {}
    for hab in snakemake.params["habitats"]:
        missing_dict[city][hab] = {chrom: [] for chrom in snakemake.params["chrom"]}

def map_global_site_to_city_pos_indices(city, hab):
    """Look up each global site's line index for one city/habitat.

    Walks every position in ``major_minor_dict`` and fetches its index from
    ``city_pos_index_dict[city][hab]``. Positions whose lookup raises
    ``KeyError`` are appended to ``missing_dict[city][hab][chrom]`` (a
    module-level side effect).

    Returns:
        dict: ``{chrom: {position: index}}`` for the positions that were found.
    """
    index_mapping_dict = {}
    for chrom in snakemake.params['chrom']:
        index_mapping_dict[chrom] = {}

    for chrom in snakemake.params["chrom"]:
        found = index_mapping_dict[chrom]
        for g_pos in major_minor_dict[chrom]:
            try:
                pos_idx = city_pos_index_dict[city][hab][chrom][g_pos]
            except KeyError:
                # Not present for this city/habitat: record it as missing.
                missing_dict[city][hab][chrom].append(g_pos)
            else:
                found[g_pos] = pos_idx
    return index_mapping_dict

# city_index_mapping_dict[city][habitat] -> {chrom: {position: index}} as
# returned by map_global_site_to_city_pos_indices.
city_index_mapping_dict = {}
for city in tqdm.tqdm(snakemake.params["cities"]):
    habitat_mappings = city_index_mapping_dict.setdefault(city, {})
    for hab in snakemake.params["habitats"]:
        habitat_mappings[hab] = map_global_site_to_city_pos_indices(city, hab)

In [None]:
# Union, per chromosome, of every position that is missing from at least one
# city/habitat combination.
combined_missing_site_dict = {}
for chrom in snakemake.params['chrom']:
    combined_missing_site_dict[chrom] = set()

for city in tqdm.tqdm(snakemake.params["cities"]):
    for hab in snakemake.params["habitats"]:
        missing_by_chrom = missing_dict[city][hab]
        for chrom in snakemake.params["chrom"]:
            combined_missing_site_dict[chrom].update(missing_by_chrom[chrom])

In [None]:
def extract_allele_counts(city):
    """Append urban/rural allele counts for every retained site of ``city``.

    For each chromosome, reads the city's urban and rural ``.mafs.gz`` files
    and, for every position in ``major_minor_dict[chrom]`` that is not in
    ``combined_missing_site_dict[chrom]``, appends
    ``[urban_ref_count, urban_alt_count, rural_ref_count, rural_alt_count]``
    to the module-level ``allele_count_dict[city]``.

    Line indices come from ``city_pos_index_dict`` (built from the ``.pos.gz``
    files) and are used to index the ``.mafs.gz`` lines directly.
    NOTE(review): this assumes the ``.pos.gz`` and ``.mafs.gz`` files are
    line-aligned -- confirm against the upstream rule.

    Args:
        city: City name used to build file paths and to select the indices.

    Returns:
        None. Results are accumulated in ``allele_count_dict[city]``.

    Raises:
        KeyError: if a retained position has no index for this city/habitat.
    """
    for chrom in snakemake.params["chrom"]:
        urb_mafs_path = f"{maf_prefix}/byCity/{city}/{chrom}/{city}_urban_{chrom}_snps.mafs.gz"
        rur_mafs_path = f"{maf_prefix}/byCity/{city}/{chrom}/{city}_rural_{chrom}_snps.mafs.gz"

        # Context managers close the gzip handles as soon as the lines are read.
        with gzip.open(urb_mafs_path, 'rb') as urban_file:
            urban_mafs = urban_file.readlines()
        with gzip.open(rur_mafs_path, 'rb') as rural_file:
            rural_mafs = rural_file.readlines()

        # Loop-invariant lookups for this chromosome.
        urban_index = city_pos_index_dict[city]["urban"][chrom]
        rural_index = city_pos_index_dict[city]["rural"][chrom]
        excluded = combined_missing_site_dict[chrom]

        for g_pos in major_minor_dict[chrom]:
            if g_pos in excluded:
                continue

            # Direct indexing: a missing position surfaces as a KeyError naming
            # the position instead of a TypeError from indexing a list with None.
            urban_idx = urban_index[g_pos]
            rural_idx = rural_index[g_pos]

            try:
                urban_site = urban_mafs[urban_idx].strip().split(b"\t")
                rural_site = rural_mafs[rural_idx].strip().split(b"\t")

                # Field 6 is read as the allele frequency and field 7 as the
                # number of individuals (per the variable names; confirm
                # against the mafs column layout).
                urban_af = float(urban_site[6].decode("UTF-8"))
                rural_af = float(rural_site[6].decode("UTF-8"))
                urban_nInd = int(urban_site[7].decode("UTF-8"))
                rural_nInd = int(rural_site[7].decode("UTF-8"))

                # Total allele copies are taken as 2 * nInd (presumably
                # diploid -- confirm); ALT count is the rounded expectation.
                urban_alt_count = round(urban_af * urban_nInd * 2)
                urban_ref_count = (urban_nInd * 2) - urban_alt_count
                rural_alt_count = round(rural_af * rural_nInd * 2)
                rural_ref_count = (rural_nInd * 2) - rural_alt_count

                allele_count_dict[city].append([urban_ref_count, urban_alt_count, rural_ref_count, rural_alt_count])
            except IndexError:
                # NOTE(review): original best-effort behaviour kept. Breaking
                # here drops the remaining sites of this chromosome for this
                # city, so its row count no longer matches other cities or the
                # site-order output. Consider raising instead.
                print(f"{chrom}: {g_pos}")
                break



In [None]:
# One (initially empty) list of per-site allele-count rows per city;
# extract_allele_counts fills the list for the city it is given.
allele_count_dict = {}
for city in snakemake.params["cities"]:
    allele_count_dict[city] = []

for city in tqdm.tqdm(snakemake.params["cities"]):
    extract_allele_counts(city)

In [None]:
# The number of output rows is taken from the first city's list; every city
# is then indexed by the same row number.
num_rows = len(next(iter(allele_count_dict.values())))

# Each output line concatenates, in dict order, the count lists of all cities
# for one row, separated by single spaces.
with open(snakemake.output["as_geno"], 'w') as f:
    for row_idx in range(num_rows):
        row_str = ' '.join(
            str(num)
            for city_rows in allele_count_dict.values()
            for num in city_rows[row_idx]
        )
        f.write(row_str + "\n")

In [None]:
# Write one "1 -1" pair per city, matching the urban-then-rural column order
# that extract_allele_counts produces for the as_geno file. The pair count is
# derived from the configured cities rather than a hard-coded 26, so the two
# files stay consistent when the city list changes.
n_cities = len(snakemake.params["cities"])
with open(snakemake.output["as_cont"], "w") as fout:
    fout.write(' '.join(["1", "-1"] * n_cities))

In [None]:
# Write "<chrom>\t<position>" for every global site that is not in the
# combined missing set, in major_minor_dict iteration order.
with open(snakemake.output["site_order"], "w") as fout:
    for chrom in snakemake.params["chrom"]:
        excluded_sites = combined_missing_site_dict[chrom]
        fout.writelines(
            f"{chrom}\t{gp}\n"
            for gp in major_minor_dict[chrom]
            if gp not in excluded_sites
        )

In [None]:
# Write one tab-separated line (city, habitat, chrom, position) for every
# position recorded in missing_dict.
with open(snakemake.output["miss"], "w") as fout:
    for city in tqdm.tqdm(snakemake.params["cities"]):
        for hab in snakemake.params["habitats"]:
            missing_by_chrom = missing_dict[city][hab]
            for chrom in snakemake.params["chrom"]:
                fout.writelines(
                    f"{city}\t{hab}\t{chrom}\t{pos}\n"
                    for pos in missing_by_chrom[chrom]
                )