# Observed VS Expected SFS

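Here, I compute the observed one-dimensional site frequency spectrum (1D-SFS) of each population and the 1D-SFS expected under the fitted demographic model, then save both to a CSV file.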

In [None]:
import gzip
import itertools
import os
import pickle

import demes
import matplotlib.pyplot as plt
import moments
import numpy as np
import pandas as pd

In [None]:
# Get the observed SFS.
# Load the pickled joint spectrum; per its file name this is the spectrum
# built from intronic SNPs.
# NOTE: pickle can execute arbitrary code while loading, so only open
# spectrum files that come from a trusted source.
data_path = '../../data/220113-ConstructBoostrapedDatasets/data/whole-genome/spectrum-cat_intronic.pkl.gz'

# The path keeps its own name so that `data` is only ever the loaded
# spectrum, never a string left behind by a failed load.
with gzip.open(data_path, "rb") as f:
    data = pickle.load(f)


In [None]:
# Target sample size used for every population when projecting the spectrum.
N = 50
# One target size per axis of the spectrum, instead of assuming that the
# data always holds exactly four populations.
data = data.project([N] * data.ndim)

In [None]:
# 1D spectrum of the first population: marginalize over every other axis,
# however many populations the spectrum holds.
s = data.marginalize(list(range(1, data.ndim)))

In [None]:
# Show the documentation of from_demes. help() is plain Python, so this cell
# also runs outside IPython (the `?name` form is IPython-only syntax).
help(moments.Spectrum.from_demes)

In [None]:
# Get expected SFS under the model.
# Path to the demes YAML file describing the model.
mdl_path = '../220124-InfereModels/results/best-guest-NAT-EXPANSION-intronic.yml'
# The path keeps its own name so that `mdl` is only ever the loaded model,
# never a string left behind by a failed load.
mdl = demes.load(mdl_path)


In [None]:

# Expected 1D SFS for YRI under the model. The sample size reuses N (the
# size the observed spectrum was projected to) rather than repeating the
# literal, so the two cannot drift apart.
expected_sfs = moments.Spectrum.from_demes(
    mdl,
    sampled_demes=['YRI'],
    sample_sizes=[N],
)

In [None]:
def spectrum_fold_to_array(sf):
    '''
    Fold a spectrum and return its unmasked entries as a plain array.

    `sf` must provide a `fold()` method returning a masked array; the
    entries whose mask is False are returned through `.data`.
    '''
    folded = sf.fold()
    unmasked = ~folded.mask
    return folded[unmasked].data


def to_frame(sfs_folded, pop, SFSfrom):
    '''
    Build a long-format pandas frame from a folded SFS.

    Every row carries the population label (`pop`) and the origin of the
    spectrum (`SFSfrom`). 'Frequency' holds the entries of `sfs_folded`
    and 'Minor_allel_freq' numbers them 1, 2, ..., len(sfs_folded).
    '''
    n_entries = len(sfs_folded)
    return pd.DataFrame(
        {
            'Population': pop,
            'SF_from': SFSfrom,
            'Frequency': sfs_folded,
            'Minor_allel_freq': list(range(1, n_entries + 1)),
        }
    )


def get_observed_and_expected_sfs(pop):
    '''
    Return the folded observed and model-expected 1D SFS of one population.

    Reads the module-level `data` (observed joint spectrum) and `mdl`
    (demes model).

    pop: population name; it must appear in `data.pop_ids` and is also
        passed to the model as the sampled deme.

    Returns a pandas frame made of the `to_frame` rows tagged 'Data'
    followed by the rows tagged 'Expected'.

    Raises ValueError when `pop` is not in `data.pop_ids`.
    '''
    pop_ids = list(data.pop_ids)
    keep = pop_ids.index(pop)
    # Marginalize over every other population, however many the spectrum
    # holds (no assumption that there are exactly four).
    others = [axis for axis in range(len(pop_ids)) if axis != keep]
    sf_data = data.marginalize(others)

    # Expected SFS under the model. The sample size is read from the
    # observed 1D spectrum (number of entries minus one), so the two
    # spectra always have the same shape.
    sf_expected = moments.Spectrum.from_demes(
        mdl,
        sampled_demes=[pop],
        sample_sizes=[sf_data.shape[0] - 1],
    )

    # Rescale the model spectrum against the observed one.
    sf_expected = moments.Inference.optimally_scaled_sfs(sf_expected, sf_data)

    # Fold both spectra and put the results in data frames.
    observed = to_frame(spectrum_fold_to_array(sf_data), pop, 'Data')
    expected = to_frame(spectrum_fold_to_array(sf_expected), pop, 'Expected')

    return pd.concat([observed, expected])

In [None]:
# One long table covering every population in the spectrum. ignore_index
# gives each row a unique label instead of repeating the row labels of the
# individual frames.
results = pd.concat(
    [get_observed_and_expected_sfs(pop) for pop in data.pop_ids],
    ignore_index=True,
)

In [None]:
# Write the table without the row index. The output directory is created
# first so the write does not fail on a fresh checkout.
out_csv = 'results/expected-observed-1dSFS.csv'
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
results.to_csv(out_csv, index=False)