Skip to content

Commit

Permalink
Merge pull request #14 from jrm5100/add_bialleleic_simulation
Browse files Browse the repository at this point in the history
Add BAMS and `to_plink`
  • Loading branch information
jrm5100 committed Apr 22, 2021
2 parents 5d6e9df + c3f423b commit 14007af
Show file tree
Hide file tree
Showing 12 changed files with 768 additions and 29 deletions.
1 change: 1 addition & 0 deletions pandas_genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from . import arrays, io, scalars
from .accessors import GenotypeSeriesAccessor, GenotypeDataframeAccessor
from .simulation import BAMS, SNPEffectEncodings, PenetranceTables

__version__ = importlib_metadata.version(__name__)

Expand Down
5 changes: 3 additions & 2 deletions pandas_genomics/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
:toctree: io
from_plink
to_plink
from_vcf
"""

from .plink import from_plink
from .plink import from_plink, to_plink
from .vcf import from_vcf

__all__ = ["from_plink", "from_vcf"]
__all__ = ["from_plink", "to_plink", "from_vcf"]
2 changes: 2 additions & 0 deletions pandas_genomics/io/plink/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .from_plink import from_plink
from .to_plink import to_plink
Original file line number Diff line number Diff line change
@@ -1,27 +1,33 @@
from pathlib import Path
from typing import Optional
from typing import Optional, Union

import pandas as pd
import numpy as np
from ..arrays import GenotypeDtype, GenotypeArray
from ..scalars import Variant, MISSING_IDX
from ...arrays import GenotypeDtype, GenotypeArray
from ...scalars import Variant, MISSING_IDX


def from_plink(
bed_file: str, swap_alleles: bool = False, max_variants: Optional[int] = None
input: Union[str, Path],
swap_alleles: bool = False,
max_variants: Optional[int] = None,
categorical_phenotype: bool = True,
):
"""
Load genetic data from plink v1 files (.bed, .bim, and .fam) into a DataFrame.
Parameters
----------
bed_file: str or Path
PLINK .bed file. .bim and .fam files with the same name and location must also exist.
input: str or Path
PLINK sample (no extension): .bed, .bim and .fam files with the same name and location must exist.
swap_alleles: bool
False by default, in which case "allele2" (usually major) in the bim file is considered the "reference" allele.
If True, "allele1" (usually minor) is considered the "reference" allele.
max_variants: Optional[int]
If provided, only load this number of variants
categorical_phenotype: bool, True by default
If True, the phenotype is encoded as a categorical when loaded (1 = "Control", 2 = "Case", otherwise missing.
If False, load values directly.
Returns
-------
Expand All @@ -37,10 +43,10 @@ def from_plink(
--------
"""
# All three files are used
bed_file = Path(bed_file)
name = bed_file.stem
bim_file = bed_file.parent / f"{name}.bim"
fam_file = bed_file.parent / f"{name}.fam"
input = str(input) # coerce to string in order to add extensions
bed_file = Path(input + ".bed")
bim_file = Path(input + ".bim")
fam_file = Path(input + ".fam")

# Make sure each file exists
if not Path(bed_file).exists():
Expand All @@ -53,7 +59,7 @@ def from_plink(
print(f"Loading genetic data from '{bed_file.stem}'")

# Load fam file
df = load_sample_info(fam_file)
df = load_sample_info(fam_file, categorical_phenotype)
# Lod bim file
variant_list = load_variant_info(bim_file, max_variants) # Load bed file
gt_array_dict = load_genotypes(
Expand All @@ -67,14 +73,19 @@ def from_plink(
return df


def load_sample_info(fam_file):
def load_sample_info(fam_file, categorical_phenotype):
"""Load fam file (PLINK sample information file) into a df"""
df = pd.read_table(fam_file, header=None, sep=" ")
df.columns = ["FID", "IID", "IID_father", "IID_mother", "sex", "phenotype"]
# Update 'sex'
df["sex"] = df["sex"].astype("category")
df["sex"] = df["sex"].cat.rename_categories({1: "male", 2: "female", 0: "unknown"})
# 'Phenotype' is not encoded because it may be more complicated than just case/control
# Encode the phenotype
DEFAULT_CAT_MAP = {1: "Control", 2: "Case"}
if categorical_phenotype:
df["phenotype"] = df["phenotype"].astype("category")
df["phenotype"].cat.rename_categories(DEFAULT_CAT_MAP, inplace=True)
df.loc[~df["phenotype"].isin(DEFAULT_CAT_MAP.values()), "phenotype"] = None
print(f"\tLoaded information for {len(df)} samples from '{fam_file.name}'")
return df

Expand Down Expand Up @@ -117,8 +128,13 @@ def create_variant(variant_info_row):
a1 = None
else:
a1 = [a1] # pass as list
# Ensure chromosome is None instead of nan
if np.isnan(variant_info_row["chromosome"]):
chromosome = None
else:
chromosome = str(variant_info_row["chromosome"])
variant = Variant(
chromosome=str(variant_info_row["chromosome"]),
chromosome=chromosome,
position=int(variant_info_row["coordinate"]),
id=variant_id,
ref=a2,
Expand Down
210 changes: 210 additions & 0 deletions pandas_genomics/io/plink/to_plink.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
from typing import Optional

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from pandas_genomics.arrays import GenotypeDtype


def to_plink(
data: pd.DataFrame,
output: str,
phenotype_name: Optional[str] = None,
phenotype_case: Optional[str] = None,
phenotype_control: Optional[str] = None,
id_prefix: str = "sample",
):
"""
Save genetic data to plink v1 files (.bed, .bim, and .fam)
Parameters
----------
data: pd.DataFrame
DataFrame containing GenotypeArrays to be saved.
output: str
Name to use for the output .bed, .bim, and .fam files
phenotype_name: str, default None
Optional column in `data` to be saved as the phenotype value in the .fam file.
phenotype_case, phenotype_control
String values indicating the category to be used as "case" or "control" for binary phenotypes.
If provided, the phenotype must be categorical.
If not provided, the phenotype is not encoded at all (assumed to be quantitative).
id_prefix:
If the data index is an integer index, this prefix will be added to generate IDs.
Notes
-----
If the data index has the required columns (FID, IID, IID_father, IID_mother, sex, phenotype) the fam file will
be created based on the index.
If a phenotype name is provided, this will override any phenotype information in the index.
If the data has a single index column this will be used (with the prefix) for FID and IID. Defaults will be used for other .fam data
"""
print(f"Saving genotype data to {output}")
save_fam(
data,
output + ".fam",
phenotype_name,
phenotype_case,
phenotype_control,
id_prefix,
)
save_bim(data, output + ".bim")
save_bed(data, output + ".bed")


def save_fam(
data, output_fam, phenotype_name, phenotype_case, phenotype_control, id_prefix
):
# Validate phenotype, if provided
if phenotype_name is not None:
if phenotype_name not in data.columns:
raise ValueError(
f"The phenotype ({phenotype_name}) was not found in the data"
)
else:
phenotype_data = data[phenotype_name]

# Check for full plink-style index, or create one
if data.index.names == [
"FID",
"IID",
"IID_father",
"IID_mother",
"sex",
"phenotype",
]:
# Recode sex
fam_data = data.index.to_frame()
fam_data["sex"].cat.rename_categories(
{"male": 1, "female": 2, "unknown": 0}, inplace=True
)
# Update phenotype if provided
if phenotype_name is not None:
fam_data["phenotype"] = phenotype_data
elif len(data.index.names) == 1:
ids = pd.Series(data.index.values).apply(lambda i: f"{id_prefix}{i}")
fam_data = pd.DataFrame.from_dict(
{
"FID": ids,
"IID": ids,
"IID_father": np.zeros(len(data)).astype(int),
"IID_mother": np.zeros(len(data)).astype(int),
"sex": np.zeros(len(data)).astype(int),
}
)
if phenotype_name is None:
fam_data["phenotype"] = np.ones(len(data)) * -9 # -9 is missing
else:
fam_data["phenotype"] = phenotype_data
else:
raise ValueError(
"The data index must contain all 6 .fam file columns, or a single column"
)

# Optionally Encode phenotype
if (phenotype_control is not None) and (phenotype_case is not None):
if fam_data["phenotype"].dtype.name != "category":
raise ValueError(
"The phenotype must be categorical to utilize 'phenotype_control' and 'phenotype_case' parameters"
)
pheno_dict = {phenotype_control: 1, phenotype_case: 2}
fam_data["phenotype"].cat.rename_categories(
lambda c: pheno_dict.get(c, 0), inplace=True
)

fam_data.to_csv(output_fam, sep=" ", header=False, index=False)
print(f"\tSaved information for {len(fam_data)} samples to {output_fam}")


def save_bim(data, output_bim):
variants = [
col_val.genomics.variant
for col_name, col_val in data.iteritems()
if GenotypeDtype.is_dtype(col_val.dtype)
]
for var in variants:
if len(var.alleles) != 2:
raise ValueError(
f"Variant {var.id} is not bialleleic (it has {len(var.alleles)} alleles) and therefore can't be saved in plink format."
)
var_dicts = [
{
"chromosome": var.chromosome,
"variant_id": var.id,
"position": 0,
"coordinate": var.position,
"allele1": var.alleles[1], # alt
"allele2": var.alleles[0], # ref
}
for var in variants
]
bim_data = pd.DataFrame(var_dicts)
bim_data.to_csv(output_bim, sep="\t", header=False, index=False)
print(f"\tSaved information for {len(bim_data)} variants to {output_bim}")


def save_bed(data, output_bed):
# Get an array of bytes for each variant
bytes = np.array(
[
gt_array_to_plink_bits(col_val)
for col_name, col_val in data.iteritems()
if GenotypeDtype.is_dtype(col_val.dtype)
]
)
# flatten into a single array
bytes = bytes.flatten()
# Add the first 3 bytes
CORRECT_FIRST_BYTES = np.array([108, 27, 1], dtype="uint8")
bytes = np.concatenate([CORRECT_FIRST_BYTES, bytes])
# Write to file
bytes.tofile(output_bed)
print(f"\tSaved genotypes to {output_bed}")


def gt_array_to_plink_bits(gt_series):
allele_ids = gt_series.array.allele_idxs
# Replace missing with 0,1
missing = gt_series.array.is_missing
allele_ids[missing] = (0, 1)
# Replace het with 1,0
het_gt = allele_ids.sum(axis=1) == 1
allele_ids[het_gt] = (1, 0)
# Pad with zeros so it is divisible by 4
pad_samples = len(allele_ids) % 4
allele_ids = np.vstack([allele_ids, np.zeros(shape=(pad_samples, 2)).astype(int)])
# Reshape into groups of 4 and flip each group
allele_ids = np.flip(allele_ids.reshape((-1, 4, 2)), axis=1)
# Flatten into a single array and pack bits into bytes
allele_ids = np.packbits(allele_ids.flatten())
return allele_ids


"""
allele_ids = gt_array.genomics.allele_idxs
gt_bytes = np.fromfile(bed_file, dtype="uint8")
# Ensure the file is valid
CORRECT_FIRST_BYTES = np.array([108, 27, 1], dtype="uint8")
if not (gt_bytes[:3] == CORRECT_FIRST_BYTES).all():
raise ValueError(
f"The first 3 bytes {bed_file.name} were not correct. The file may be corrupted."
)
gt_bytes = gt_bytes[3:]
# Divide array into one row per variant
chunk_size = num_samples // 4
if num_samples % 4 > 0:
chunk_size += 1
gt_bytes = gt_bytes.reshape(-1, chunk_size)
# Process each variant
gt_array_dict = {}
for v_idx, variant in enumerate(variant_list):
variant_gt_bytes = gt_bytes[v_idx]
gt_array = create_gt_array(num_samples, variant_gt_bytes, variant)
if swap_alleles:
gt_array.set_reference(1)
gt_array_dict[f"{v_idx}_{gt_array.variant.id}"] = gt_array
print(f"\tLoaded genotypes from '{bed_file.name}'")
return gt_array_dict
"""

0 comments on commit 14007af

Please sign in to comment.