### Small script that helps convert sample distributions to average genome coverage

Perhaps we should also include the actual cluster coverage from the BAM files.

In [8]:
import glob
import pandas as pd
from Bio import SeqIO
import os

def generate_simulation_overview(
    dir_dataset: str,
    fp_combined_input_genomes: str = "../data/simulated_data/input_genomes/combined.fa",
    total_bp: float = 0.3 * 10**9,
) -> pd.DataFrame:
    """Compute the expected average coverage of every genome in every sample.

    For each distribution file the expected coverage of genome i is

        cov_i = p_i / sum_j(p_j * s_j) * total_bp

    where p is the value from the distribution file and s the genome size.

    Args:
        dir_dataset: Dataset directory. It must contain
            ``internal/genome_locations.tsv`` (two tab-separated columns:
            dataset-internal genome id, path to the genome file) and a
            ``distributions/`` folder with one two-column, tab-separated
            file per sample.
        fp_combined_input_genomes: FASTA file holding all input genomes;
            used to look up the genome sizes.
        total_bp: Value the per-genome read shares are scaled to
            (presumably the total number of base pairs per sample).

    Returns:
        One row per (sample, genome) with the columns ``id``,
        ``distribution``, ``samle``, ``ncbi``, ``genome_size``,
        ``weighted_dist``, ``readshare`` and ``expected_average_coverage``.

    Raises:
        ValueError: If no distribution files are found, or a non-blank line
            of the mapping file does not have exactly two columns.
        KeyError: If a genome id or genome name cannot be resolved.
    """
    # Genome sizes, keyed by record name with "." replaced by "_" so the keys
    # line up with the names derived from the genome file names below.
    genome_size = {
        record.name.replace(".", "_"): len(record.seq)
        for record in SeqIO.parse(fp_combined_input_genomes, "fasta")
    }

    # Two-way mapping between the dataset-internal genome id and the genome
    # name (file name without its last extension).
    fp_map = os.path.join(dir_dataset, "internal/genome_locations.tsv")
    cam2genom = {}
    with open(fp_map, "r") as fh_map:
        for line in fh_map:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines.
                continue
            genomeid, fp = line.split("\t")
            ncbi_id = os.path.basename(fp).rsplit(".", 1)[0]
            cam2genom[genomeid] = ncbi_id
            cam2genom[ncbi_id] = genomeid

    # glob returns files in arbitrary order; sort for reproducible row order.
    dist_files = sorted(glob.glob(os.path.join(dir_dataset, "distributions/*")))
    if not dist_files:
        raise ValueError(
            "No distribution files found in "
            + os.path.join(dir_dataset, "distributions")
        )

    # Run over the samples and calculate the expected coverage from genome
    # size and distribution.
    dfs = []
    for fp_dist in dist_files:
        df = pd.read_csv(fp_dist, sep="\t", names=["id", "distribution"])
        # Sample label = text after the last "_" of the file name, minus ".txt".
        # NOTE: the column name "samle" (sic) is kept so existing consumers of
        # this table keep working.
        label = os.path.basename(fp_dist).rsplit("_", 1)[-1].replace(".txt", "")
        df["samle"] = "sample_" + label
        df["ncbi"] = [cam2genom[x] for x in df["id"]]
        df["genome_size"] = [genome_size[x] for x in df["ncbi"]]
        df["weighted_dist"] = df["distribution"] * df["genome_size"]
        df["readshare"] = (df["weighted_dist"] / df["weighted_dist"].sum()) * total_bp
        df["expected_average_coverage"] = df["readshare"] / df["genome_size"]
        dfs.append(df)

    return pd.concat(dfs)

# Build and display the expected-coverage overview for the 0_4GB dataset.
generate_simulation_overview("../data/simulated_data/camisim/0_4GB/")

Unnamed: 0,id,distribution,samle,ncbi,genome_size,weighted_dist,readshare,expected_average_coverage
0,Genome1,0.2,sample_0,NC_014328_1,4630065,926013.0,82386400.0,17.793788
1,Genome2,0.2,sample_0,NZ_CP020566_1,2071952,414390.4,36867880.0,17.793788
2,Genome3,0.2,sample_0,NZ_CP053893_1,5984367,1196873.4,106484600.0,17.793788
3,Genome4,0.2,sample_0,NZ_LT906445_1,2132142,426428.4,37938880.0,17.793788
4,Genome5,0.2,sample_0,NZ_LT906470_1,2041290,408258.0,36322280.0,17.793788
0,Genome1,0.2,sample_2,NC_014328_1,4630065,926013.0,82386400.0,17.793788
1,Genome2,0.2,sample_2,NZ_CP020566_1,2071952,414390.4,36867880.0,17.793788
2,Genome3,0.2,sample_2,NZ_CP053893_1,5984367,1196873.4,106484600.0,17.793788
3,Genome4,0.2,sample_2,NZ_LT906445_1,2132142,426428.4,37938880.0,17.793788
4,Genome5,0.2,sample_2,NZ_LT906470_1,2041290,408258.0,36322280.0,17.793788


In [17]:
# Turn the dataset directory name into a number: "0_4GB" -> "0_4" -> "0.4" -> 0.4.
float(os.path.basename("../data/simulated_data/camisim/0_4GB/".rstrip("/").replace("GB","").replace('_','.')))

0.4

In [67]:
def get_average_coverages(dir_datalabel:str):
    """
    Work in progress: read the genome id <-> genome name mapping of a dataset.

    The intended result is the expected coverage per genome,

        cov_i = p_i / sum_j(p_j * s_j) * totalBp

    with p the sample distribution value and s the genome size. So far only
    the mapping from ``internal/genome_locations.tsv`` is built; nothing is
    returned yet.

    dir_datalabel: dataset directory containing ``internal/genome_locations.tsv``
        (two tab-separated columns: genome id, path to the genome file).
    """
    fp_map = os.path.join(dir_datalabel, "internal/genome_locations.tsv")
    cam2genom = {}
    with open(fp_map, "r") as fh_map:
        for line in fh_map:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines.
                continue
            genomeid, genome_fp = line.split("\t")
            # basename also copes with entries that are bare file names.
            ncbi_id = os.path.basename(genome_fp).rsplit(".",1)[0]
            cam2genom[genomeid] = ncbi_id
            cam2genom[ncbi_id] = genomeid
    # TODO: iterate over the distribution files and compute the coverages.

In [81]:
# Display the two-way mapping between genome ids and genome names.
cam2genom

{'Genome1': 'NC_014328_1',
 'NC_014328_1': 'Genome1',
 'Genome2': 'NZ_CP020566_1',
 'NZ_CP020566_1': 'Genome2',
 'Genome3': 'NZ_CP053893_1',
 'NZ_CP053893_1': 'Genome3',
 'Genome4': 'NZ_LT906445_1',
 'NZ_LT906445_1': 'Genome4',
 'Genome5': 'NZ_LT906470_1',
 'NZ_LT906470_1': 'Genome5'}

In [None]:
def get_genome_sizes(combined_fa: str="../data/simulated_data/input_genomes/combined.fa"):
    """Return a dict mapping each FASTA record name to its sequence length.

    combined_fa: path of the FASTA file to parse.

    Dots in the record names are replaced by underscores in the keys.
    """
    records = SeqIO.parse(combined_fa, "fasta")
    return {rec.name.replace(".","_"): len(rec.seq) for rec in records}
# Quick check: display the genome sizes parsed from the default FASTA file.
get_genome_sizes()

In [70]:
# Dataset directory; its internal/ and distributions/ subfolders are read in other cells.
datalabel = "../data/simulated_data/camisim/0_4GB/"

In [94]:
# Two-way mapping between the dataset-internal genome id and the genome name
# (genome file name without its last extension).
fp = os.path.join(datalabel, "internal/genome_locations.tsv")
cam2genom = {}
# Context manager closes the file; the loop no longer rebinds `fp`.
with open(fp, "r") as fh_map:
    for line in fh_map:
        genomeid, genome_fp = line.strip().split("\t")
        ncbi_id = os.path.basename(genome_fp).rsplit(".",1)[0]
        cam2genom[genomeid] = ncbi_id
        cam2genom[ncbi_id] = genomeid

# Genome sizes keyed by genome name.
g_sizes = get_genome_sizes()

In [96]:
import glob
# Expected average coverage per genome and sample:
#   cov_i = p_i / sum_j(p_j * s_j) * total_bp
total_bp = 0.3*10**9  # presumably the total base pairs per sample - TODO confirm
dfs = []
# glob returns files in arbitrary order; sort for reproducible row order.
for fp_dist in sorted(glob.glob(os.path.join(datalabel, "distributions/*"))):
    df = pd.read_csv(fp_dist, sep="\t", names=["id", "distribution"])
    # Sample label = text after the last "_" of the file name, minus ".txt".
    # NOTE: the column name "samle" (sic) is kept to match existing outputs.
    label = os.path.basename(fp_dist).rsplit("_",1)[-1].replace(".txt","")
    df["samle"] = "sample_" + label
    df["ncbi"] = [cam2genom[x] for x in df["id"]]
    df["genome_size"] = [g_sizes[x] for x in df["ncbi"]]
    df["weighted_dist"] = df["distribution"] * df["genome_size"]
    df["readshare"] = (df["weighted_dist"]/df["weighted_dist"].sum())*total_bp
    df["expected_average_coverage"] = df["readshare"] / df["genome_size"]

    dfs.append(df)
df_total = pd.concat(dfs)
df_total

Unnamed: 0,id,distribution,samle,ncbi,genome_size,weighted_dist,readshare,expected_average_coverage
0,Genome1,0.2,sample_0,NC_014328_1,4630065,926013.0,82386400.0,17.793788
1,Genome2,0.2,sample_0,NZ_CP020566_1,2071952,414390.4,36867880.0,17.793788
2,Genome3,0.2,sample_0,NZ_CP053893_1,5984367,1196873.4,106484600.0,17.793788
3,Genome4,0.2,sample_0,NZ_LT906445_1,2132142,426428.4,37938880.0,17.793788
4,Genome5,0.2,sample_0,NZ_LT906470_1,2041290,408258.0,36322280.0,17.793788
0,Genome1,0.2,sample_2,NC_014328_1,4630065,926013.0,82386400.0,17.793788
1,Genome2,0.2,sample_2,NZ_CP020566_1,2071952,414390.4,36867880.0,17.793788
2,Genome3,0.2,sample_2,NZ_CP053893_1,5984367,1196873.4,106484600.0,17.793788
3,Genome4,0.2,sample_2,NZ_LT906445_1,2132142,426428.4,37938880.0,17.793788
4,Genome5,0.2,sample_2,NZ_LT906470_1,2041290,408258.0,36322280.0,17.793788


In [87]:
# Display the working DataFrame `df`.
df

Unnamed: 0,id,distribution,samle,ncbi,genome_size
0,Genome1,0.2,sample_0,NC_014328_1,4630065
1,Genome2,0.2,sample_0,NZ_CP020566_1,2071952
2,Genome3,0.2,sample_0,NZ_CP053893_1,5984367
3,Genome4,0.2,sample_0,NZ_LT906445_1,2132142
4,Genome5,0.2,sample_0,NZ_LT906470_1,2041290
0,Genome1,0.2,sample_2,NC_014328_1,4630065
1,Genome2,0.2,sample_2,NZ_CP020566_1,2071952
2,Genome3,0.2,sample_2,NZ_CP053893_1,5984367
3,Genome4,0.2,sample_2,NZ_LT906445_1,2132142
4,Genome5,0.2,sample_2,NZ_LT906470_1,2041290


In [85]:
from Bio import SeqIO
import pandas as pd
def get_genome_sizes(combined_fa: str="../data/simulated_data/input_genomes/combined.fa"):
    """Return a dict mapping each FASTA record name to its sequence length.

    combined_fa: path of the FASTA file to parse.

    Dots in the record names are replaced by underscores in the keys.
    """
    records = SeqIO.parse(combined_fa, "fasta")
    return {rec.name.replace(".","_"): len(rec.seq) for rec in records}
# Quick check: display the genome sizes parsed from the default FASTA file.
get_genome_sizes()

{'NZ_LT906445_1': 2132142,
 'NZ_CP053893_1': 5984367,
 'NC_014328_1': 4630065,
 'NZ_CP020566_1': 2071952,
 'NZ_LT906470_1': 2041290}

In [65]:
# Two-way mapping between the dataset-internal genome id and the genome name
# (genome file name without its last extension).
fp = "../data/simulated_data/camisim/0_4GB/internal/genome_locations.tsv"
cam2genom = {}
# Context manager closes the file; the loop no longer rebinds `fp`.
with open(fp, "r") as fh_map:
    for line in fh_map:
        genomeid, genome_fp = line.strip().split("\t")
        # basename also copes with entries that are bare file names.
        ncbi_id = os.path.basename(genome_fp).rsplit(".",1)[0]
        cam2genom[genomeid] = ncbi_id
        cam2genom[ncbi_id] = genomeid

In [61]:
# Display df_cam2genom (not defined in the visible cells); the output shows the columns "cam" and "fp".
df_cam2genom

Unnamed: 0,cam,fp
0,Genome1,/home/projects/dtu_00009/people/henspi/git/Scr...
1,Genome2,/home/projects/dtu_00009/people/henspi/git/Scr...
2,Genome3,/home/projects/dtu_00009/people/henspi/git/Scr...
3,Genome4,/home/projects/dtu_00009/people/henspi/git/Scr...
4,Genome5,/home/projects/dtu_00009/people/henspi/git/Scr...


In [6]:
# Genome sizes parsed from the default combined FASTA file.
g_size = get_genome_sizes()

In [7]:
# Display the genome sizes.
# NOTE(review): the recorded output has dotted keys (e.g. 'NC_014328.1'), so it presumably
# predates the "." -> "_" replacement in get_genome_sizes - re-run to refresh.
g_size

{'NZ_LT906445.1': 2132142,
 'NZ_CP053893.1': 5984367,
 'NC_014328.1': 4630065,
 'NZ_CP020566.1': 2071952,
 'NZ_LT906470.1': 2041290}

In [52]:
# Uniform distribution value (0.2) for each of the five genomes, plus the genome sizes.
probs = [0.2] * 5
size = g_size.values()

In [53]:
# Denominator of the coverage formula: sum_j(p_j * s_j).
weighted_sum = sum(p * s for p, s in zip(probs, size))

In [57]:
# Display the distribution-weighted sum of the genome sizes.
weighted_sum

3371963.2

In [59]:
# Expected coverage per genome: p_i / sum_j(p_j * s_j) * total.
# The genome size from the zip is not used in the expression, so it is discarded.
cov = [p / weighted_sum * total for _, p in zip(size, probs)]
cov

[17.793788496861414,
 17.793788496861414,
 17.793788496861414,
 17.793788496861414,
 17.793788496861414]

703.5178707609531

In [37]:
# Ratio of `total` to the base pairs needed to cover every genome 18 times (sum of sizes * 18).
total / sum(x*18 for x in g_size.values())

0.9885438053811896

In [25]:
# Scaling constant used in the coverage calculations
# (presumably the total base pairs per sample, cf. totalBp - TODO confirm).
total = 0.3*10**9
total

300000000.0

In [26]:
# 16859816 is the sum of the five genome sizes, so this is `total` spread evenly over all genomes.
total/16859816

17.793788496861414