In [1]:
import fwdpy11
import numpy as np
import pandas as pd
import os
from itertools import chain
import gzip
from collections import Counter

In [2]:
def simplify_tree(ts, n=50, seed=None):
    """
    Choose n random individuals from the tree sequence and simplify the
    tree sequence down to the nodes of those individuals.

    Args:
        ts: tree sequence; must provide ``individuals()`` (an iterable of
            objects exposing a ``nodes`` attribute) and ``simplify(nodes)``.
        n: int, number of individuals to sample without replacement.
        seed: optional seed for a dedicated random generator. When None
            (the default) the global ``np.random`` state is used, so
            ``np.random.seed(...)`` still controls the draw.

    Returns:
        Whatever ``ts.simplify`` returns for the nodes of the sampled
        individuals.

    Raises:
        ValueError: raised by numpy when n is larger than the number of
            individuals (sampling is without replacement).
    """
    # Materialise the individuals so that sampling works whether
    # ``ts.individuals()`` hands back a sequence or a one-shot iterator.
    individuals = list(ts.individuals())

    # Sample positions instead of the objects themselves: numpy then never
    # has to convert arbitrary Python objects into an array.
    rng = np.random if seed is None else np.random.default_rng(seed)
    chosen = rng.choice(len(individuals), n, replace=False)

    nodes_ids = []
    for i in chosen:
        nodes_ids.extend(individuals[i].nodes)

    return ts.simplify(nodes_ids)

In [3]:
# Let's create a dict to keep the data


def get_sfs_for_variant_categories(ts_simplified):
    """
    Compute one allele frequency spectrum per mutation category.

    Each mutation of ``ts_simplified`` is assigned to a category through the
    integer stored in ``metadata['label']`` (0: neutral, 1: missense,
    2: synonymous, 3: LOF). For every category, the sites carrying mutations
    of the *other* categories are deleted and the spectrum of what remains is
    computed with ``allele_frequency_spectrum(polarised=True,
    span_normalise=False)``.

    Args:
        ts_simplified: tree sequence providing ``mutations()``,
            ``delete_sites(site_ids)`` and ``allele_frequency_spectrum``.

    Returns:
        pandas.DataFrame in long format with the columns ``DerivedFreq``
        (row position in the spectrum), ``MutType`` (category name) and
        ``Frequency`` (spectrum value).

    Raises:
        KeyError: if a mutation has no ``'label'`` metadata entry or the
            label is not one of 0, 1, 2, 3.
    """
    # Integer label stored in the mutation metadata -> category name.
    # The insertion order also fixes the category order in the result.
    labs = {
        0: 'neutral',
        1: 'missense',
        2: 'synonymous',
        3: 'LOF',
    }

    # Category name -> ids of the sites that carry a mutation of that category.
    sites_by_type = {name: set() for name in labs.values()}
    for m in ts_simplified.mutations():
        # delete_sites() works on *site* ids, so record m.site rather than
        # m.id: the two only agree when every site holds exactly one mutation
        # and both tables happen to be numbered identically.
        sites_by_type[labs[m.metadata['label']]].add(m.site)

    def get_sites(mut_type):
        """
        Return a tree sequence in which every site carrying a mutation of a
        category other than mut_type has been deleted.
        Args:
            mut_type: str, neutral, LOF, etc.
        """
        # NOTE: a site shared by mut_type and another category is removed
        # as well, because deletion happens per site, not per mutation.
        to_drop = set()
        for other_type, site_ids in sites_by_type.items():
            if other_type != mut_type:
                to_drop |= site_ids
        return ts_simplified.delete_sites(sorted(to_drop))

    sfs_by_mutation = {
        mut_type: get_sites(mut_type).allele_frequency_spectrum(
            polarised=True, span_normalise=False)
        for mut_type in sites_by_type
    }

    sfs = pd.DataFrame(sfs_by_mutation)
    sfs['DerivedFreq'] = list(range(sfs.shape[0]))

    return sfs.melt(id_vars=['DerivedFreq'], var_name='MutType', value_name='Frequency')

In [4]:
def load_pop(sim_id):
    """
    Load the population saved for one simulation.

    Args:
        sim_id: value interpolated into the file name
            ``results/simulations/sim-seed-<sim_id>-pop.bin``.

    Returns:
        The object returned by ``fwdpy11.DiploidPopulation.load_from_file``.
    """
    pop_file = f'results/simulations/sim-seed-{sim_id}-pop.bin'
    population = fwdpy11.DiploidPopulation.load_from_file(pop_file)
    return population

def id_exits(sim_id):
    """
    Tell whether the population file of a simulation is present on disk.

    Args:
        sim_id: value interpolated into the file name
            ``results/simulations/sim-seed-<sim_id>-pop.bin``.

    Returns:
        bool, True when that path exists.
    """
    return os.path.exists(f'results/simulations/sim-seed-{sim_id}-pop.bin')


In [5]:
def get_sfs(sim_id, n=50):
    """
    Build the per-category frequency spectra of one simulation.

    The saved population is loaded, exported with
    ``dump_tables_to_tskit()``, reduced to a random sample of individuals
    and summarised with ``get_sfs_for_variant_categories``.

    Args:
        sim_id: identifier of the simulation; forwarded to ``load_pop`` and
            stored in the ``sim_id`` column of the result.
        n: int, number of individuals to sample (forwarded to
            ``simplify_tree``). Defaults to 50, the value that used to be
            hard-coded here.

    Returns:
        The DataFrame produced by ``get_sfs_for_variant_categories`` with an
        extra ``sim_id`` column.
    """
    pop = load_pop(sim_id)
    ts = pop.dump_tables_to_tskit()
    ts_simplified = simplify_tree(ts, n)
    sf = get_sfs_for_variant_categories(ts_simplified)
    # tag every row so spectra from different simulations can be stacked
    sf['sim_id'] = sim_id
    return sf

In [7]:
# Compute the spectra of every simulation id in 1..250 whose population file
# exists, then stack the per-simulation tables into one DataFrame.
sf = pd.concat(
    list(map(get_sfs, filter(id_exits, range(1, 251))))
)
sf

Unnamed: 0,DerivedFreq,MutType,Frequency,sim_id
0,0,neutral,0.0,1
1,1,neutral,232.0,1
2,2,neutral,74.0,1
3,3,neutral,72.0,1
4,4,neutral,21.0,1
...,...,...,...,...
399,96,LOF,0.0,250
400,97,LOF,0.0,250
401,98,LOF,0.0,250
402,99,LOF,0.0,250


In [8]:
# Add up the Frequency values of all rows sharing the same
# (DerivedFreq, MutType) pair, then save the pooled table as CSV.
sf = (
    sf.groupby(['DerivedFreq', 'MutType'])['Frequency']
    .sum()
    .reset_index()
)
sf.to_csv('results/SFSs_convoluted.csv', index=False)