Skip to content

Commit

Permalink
Add a random genotype array simulator, and reorganize simulation into…
Browse files Browse the repository at this point in the history
… a 'sim' module
  • Loading branch information
jrm5100 committed Apr 22, 2021
1 parent 1d21b73 commit eebe6c0
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 6 deletions.
12 changes: 9 additions & 3 deletions pandas_genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,16 @@
except ModuleNotFoundError:
import importlib_metadata

from . import arrays, io, scalars
from . import arrays, io, scalars, sim
from .accessors import GenotypeSeriesAccessor, GenotypeDataframeAccessor
from .simulation import BAMS, SNPEffectEncodings, PenetranceTables

__version__ = importlib_metadata.version(__name__)

__all__ = [__version__, GenotypeSeriesAccessor, GenotypeDataframeAccessor, io, scalars]
__all__ = [
__version__,
GenotypeSeriesAccessor,
GenotypeDataframeAccessor,
io,
scalars,
sim,
]
File renamed without changes.
2 changes: 2 additions & 0 deletions pandas_genomics/sim/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .BAMS import BAMS, SNPEffectEncodings, PenetranceTables
from .random_gt import generate_random_gt
53 changes: 53 additions & 0 deletions pandas_genomics/sim/random_gt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import List

import numpy as np

from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
from pandas_genomics.scalars import Variant


def generate_random_gt(
variant: Variant, allele_freq: List[float], n: int = 1000, random_seed: int = 1855
) -> GenotypeArray:
"""
Simulate random genotypes according to the provided allele frequencies
Parameters
----------
variant: Variant
allele_freq: List[float]
Allele frequencies for each allele in the variant
n: int, default 1000
How many genotypes to simulate
random_seed: int, default 1855
Returns
-------
GenotypeArray
"""
# Validate frequencies
if len(allele_freq) != len(variant.alleles):
raise ValueError(
f"The number of provided frequencies ({len(allele_freq)}) doesn't match"
f" the number of alleles in the variant ({len(variant.alleles)})."
)
if sum(allele_freq) != 1.0:
raise ValueError(
f"The provided frequencies must add up to 1.0 (sum was {sum(allele_freq):.3f})"
)

# Choose gts
np.random.seed(random_seed)
genotypes = np.random.choice(
range(len(variant.alleles)), p=allele_freq, size=(n, variant.ploidy)
)

# Create GenotypeArray representation of the data
dtype = GenotypeDtype(variant)
scores = np.empty(n)
scores[:] = np.nan
data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
gt_array = GenotypeArray(values=data, dtype=dtype)

return gt_array
4 changes: 2 additions & 2 deletions tests/io/test_plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
from pandas._testing import assert_frame_equal

from pandas_genomics import io, BAMS
from pandas_genomics import io, sim

DATA_DIR = Path(__file__).parent.parent / "data" / "plink"

Expand Down Expand Up @@ -40,7 +40,7 @@ def test_round_trip_sim(tmp_path):
d = tmp_path / "test"
d.mkdir()
output = str(d / "test")
data = BAMS().generate_case_control()
data = sim.BAMS().generate_case_control()
io.to_plink(
data,
output,
Expand Down
2 changes: 1 addition & 1 deletion tests/simulation/test_biallelic_sim.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pandas._testing import assert_frame_equal

from pandas_genomics import BAMS, SNPEffectEncodings, PenetranceTables
from pandas_genomics.sim import BAMS, SNPEffectEncodings, PenetranceTables


def assert_frame_not_equal(*args, **kwargs):
Expand Down
9 changes: 9 additions & 0 deletions tests/simulation/test_random.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pandas_genomics.scalars import Variant
from pandas_genomics import sim


def test():
var = Variant(chromosome="1", position=123456, ref="T", alt=["A"])
gta = sim.generate_random_gt(var, allele_freq=[0.7, 0.3])
var2 = Variant(chromosome="1", position=223456, ref="T", alt=["A", "C"])
gta_2 = sim.generate_random_gt(var2, allele_freq=[0.7, 0.25, 0.05])

0 comments on commit eebe6c0

Please sign in to comment.