Skip to content

Commit

Permalink
Add tests for loading files and fix an issue with default alleles
Browse files Browse the repository at this point in the history
  • Loading branch information
jrm5100 committed Oct 9, 2020
1 parent 0a21d04 commit 226e3e3
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 55 deletions.
6 changes: 4 additions & 2 deletions pandas_genomics/io/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,18 @@ def from_plink(bed_file: str):
for v_idx in range(num_variants):
variant_info_dict = variant_info.iloc[v_idx].to_dict()
variant_id = str(variant_info_dict['variant_id'])
a1 = str(variant_info_dict['allele1'])
a2 = str(variant_info_dict['allele2'])
variant = Variant(variant_id=variant_id,
chromosome=str(variant_info_dict['chromosome']),
coordinate=int(variant_info_dict['coordinate']),
alleles=[str(variant_info_dict['allele1']), str(variant_info_dict['allele2'])])
alleles=[a1, a2])
genotypes = []
chunk = f.read(chunk_size) # Encoded chunk of results for each variant
for byte in chunk:
# for each byte, get 2 bits at a time in reverse order (as a string, so '00', '01', '10', or '11')
bitstrings = [f"{byte:08b}"[i:i+2] for i in range(0, 8, 2)][::-1]
genotypes.extend([variant.make_genotype_from_plink_bits(bs) for bs in bitstrings])
genotypes.extend([variant.make_genotype_from_plink_bits(bs, a1, a2) for bs in bitstrings])
# Remove nonexistent samples at the end
genotypes = genotypes[:num_samples]
df[variant_id] = GenotypeArray(values=genotypes, dtype=GenotypeDtype(variant))
Expand Down
9 changes: 8 additions & 1 deletion pandas_genomics/scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,14 +225,21 @@ def make_genotype_from_str(self, gt_str: str, sep: str = "/") -> 'Genotype':
a2 = self.get_allele_idx(allele2, add=True)
return Genotype(self, a1, a2)

def make_genotype_from_plink_bits(self, plink_bits: str) -> 'Genotype':
def make_genotype_from_plink_bits(self,
plink_bits: str,
allele1: str,
allele2: str) -> 'Genotype':
"""
Create a genotype from PLINK Bed file bits
Parameters
----------
plink_bits: str
A string with allele indices as encoded in plink format, one of {'00', '01', '10', '11'}
allele1: str
Allele corresponding to the first allele in the plink file
allele2: str
Allele corresponding to the second allele in the plink file
Returns
-------
Expand Down
66 changes: 14 additions & 52 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,21 @@
from pandas.tests.extension.conftest import *


# The fixtures below are copied from pandas.conftest.
# They could be imported instead, but that would require adding hypothesis as a dependency.
@pytest.fixture(
    params=[
        None,
        lambda x: x,
    ]
)
def sort_by_key(request):
    """
    Provide a ``key`` argument for tests of sorting methods.

    Parametrized over ``None`` (no key) and an identity function.
    """
    sort_key = request.param
    return sort_key
def pytest_addoption(parser):
    """Add the ``--runslow`` command-line flag, which is off by default."""
    runslow_settings = {
        "action": "store_true",
        "default": False,
        "help": "run slow tests",
    }
    parser.addoption("--runslow", **runslow_settings)


@pytest.fixture(
    params=[
        "__eq__",
        "__ne__",
        "__le__",
        "__lt__",
        "__ge__",
        "__gt__",
    ]
)
def all_compare_operators(request):
    """
    Provide the dunder name of one comparison operation per test run.

    Covers ``==``, ``!=``, ``<=``, ``<``, ``>=`` and ``>``.
    """
    operator_name = request.param
    return operator_name
def pytest_configure(config):
    """Append the ``slow`` marker description to the ``markers`` ini setting."""
    marker_spec = "slow: mark test as slow to run"
    config.addinivalue_line("markers", marker_spec)


# Names of the numeric reductions used to parametrize the fixture below.
_all_numeric_reductions = (
    "sum max min mean prod std var median kurt skew".split()
)


@pytest.fixture(params=_all_numeric_reductions)
def all_numeric_reductions(request):
    """Provide the name of one numeric reduction per test run."""
    reduction_name = request.param
    return reduction_name


# Names of the boolean reductions used to parametrize the fixture below.
_all_boolean_reductions = "all any".split()


@pytest.fixture(params=_all_boolean_reductions)
def all_boolean_reductions(request):
    """Provide the name of one boolean reduction per test run."""
    reduction_name = request.param
    return reduction_name
def pytest_collection_modifyitems(config, items):
    """
    Skip tests marked ``slow`` unless ``--runslow`` was passed.

    Parameters
    ----------
    config
        Object whose ``getoption("--runslow")`` reports whether slow tests
        were requested.
    items
        Collected test items; each one with ``"slow"`` in its ``keywords``
        receives a skip marker.
    """
    # Imported locally so this hook does not depend on ``pytest`` leaking
    # into the module namespace through a star import.
    import pytest

    if config.getoption("--runslow"):
        # --runslow given in cli: do not skip slow tests
        return
    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
    for item in items:
        if "slow" in item.keywords:
            item.add_marker(skip_slow)
59 changes: 59 additions & 0 deletions tests/genotype_array/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest


# The fixtures below are copied from pandas.conftest.
# They could be imported instead, but that would require adding hypothesis as a dependency.
@pytest.fixture(
    params=[
        None,
        lambda x: x,
    ]
)
def sort_by_key(request):
    """
    Provide a ``key`` argument for tests of sorting methods.

    Parametrized over ``None`` (no key) and an identity function.
    """
    sort_key = request.param
    return sort_key


@pytest.fixture(
    params=[
        "__eq__",
        "__ne__",
        "__le__",
        "__lt__",
        "__ge__",
        "__gt__",
    ]
)
def all_compare_operators(request):
    """
    Provide the dunder name of one comparison operation per test run.

    Covers ``==``, ``!=``, ``<=``, ``<``, ``>=`` and ``>``.
    """
    operator_name = request.param
    return operator_name


# Names of the numeric reductions used to parametrize the fixture below.
_all_numeric_reductions = (
    "sum max min mean prod std var median kurt skew".split()
)


@pytest.fixture(params=_all_numeric_reductions)
def all_numeric_reductions(request):
    """Provide the name of one numeric reduction per test run."""
    reduction_name = request.param
    return reduction_name


# Names of the boolean reductions used to parametrize the fixture below.
_all_boolean_reductions = "all any".split()


@pytest.fixture(params=_all_boolean_reductions)
def all_boolean_reductions(request):
    """Provide the name of one boolean reduction per test run."""
    reduction_name = request.param
    return reduction_name
21 changes: 21 additions & 0 deletions tests/io/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from pathlib import Path
import pytest

from pandas_genomics import io

data_dir = Path(__file__).parent.parent / "data"


@pytest.fixture
def plink_small():
    """Return what ``io.from_plink`` loads from the small PLINK test bed file."""
    return io.from_plink(data_dir / "plink" / "plink_test_small.bed")


# No ``slow`` mark here: pytest marks have no effect on fixtures, so tests
# that use this fixture must carry ``@pytest.mark.slow`` themselves.
@pytest.fixture
def plink_medium():
    """Return what ``io.from_plink`` loads from the medium PLINK test bed file."""
    bed_file = data_dir / "plink" / "plink_test_medium.bed"
    result = io.from_plink(bed_file)
    return result
14 changes: 14 additions & 0 deletions tests/io/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pytest


def test_loaded_small(plink_small):
    """Check the shape of the loaded small dataset."""
    # TODO: Add more assertions
    expected_shape = (150, 3020)
    assert plink_small.shape == expected_shape


@pytest.mark.slow
def test_loaded_medium(plink_medium):
    """Check the shape of the loaded medium dataset."""
    # TODO: Add more assertions
    expected_shape = (600, 45100)
    assert plink_medium.shape == expected_shape

0 comments on commit 226e3e3

Please sign in to comment.