In [2]:
import os
from argparse import ArgumentParser

In [31]:
### Arguments
clap = ArgumentParser(prog="Hail PCA Wrapper",
                      description="Convenience wrapper for performing hail PCA")
clap.add_argument('-r', '--reference', default='GRCh38', choices=['GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'],
                  help='the (hail-supported) background genome of the analysis')
clap.add_argument('-b', '--bucket', default=os.environ.get('WORKSPACE_BUCKET', None),
                  help='cloud bucket prefix to use for saving hail files')
clasp = clap.add_subparsers(required=True, metavar='', dest='proc',
                            description='Reference and Sample Operations')

### Arguments for building the reference projcetion
buildref_clap = clasp.add_parser('build-reference',
                                 help='Build reference PC projection and ancestry Random-Forest')

file1_args = buildref_clap.add_argument_group('Files')
file1_args.add_argument('reference-vcf', help='reference cohort VCF')
file1_args.add_argument('population-tsv', help='reference sample population assignments, must have: header, all samples in reference-vcf')
file1_args.add_argument('-s', '--samples', help='sample vcf, if supplied will be projected and ancestry-inferred')

buildref_clap.add_argument('-c', '--pop-col', required=True, type=int,
                           help='column with population class in population-tsv')

pca_args = buildref_clap.add_argument_group('PCA')
pca_args.add_argument('-k', default=10, type=int,
                      help='number of PCs to calculate')
pca_args.add_argument('--af-min', default=0.01, type=float,
                      help='minimum allele-frequency filter')
pca_args.add_argument('--hwe-p', default=1e-6, type=float,
                      help='Hardy-Weinberg p-value filter')
pca_args.add_argument('--ld-r2', default=0.1, type=float,
                      help='linkage disequilibrium correlation filter')

### Arguments for projecting and inferring a sample set
infer_clap = clasp.add_parser('infer-samples', help='project samples and infer ancestries using premade reference')

file2_args = infer_clap.add_argument_group('Files')
file2_args.add_argument('sample-vcf', help='sample cohort VCF')
file2_args.add_argument('refloadings', help='Hail table with reference pc loadings and afs, (note: cannot read from local file system)')
file2_args.add_argument('refRFmodel', help='joblib dump of a sklearn RandomForestClassifier trained on reference PCs -> population class')


_StoreAction(option_strings=[], dest='refRFmodel', nargs=None, const=None, default=None, type=None, choices=None, required=True, help='joblib dump of a sklearn RandomForestClassifier trained on reference PCs -> population class', metavar=None)

In [37]:
a = clap.parse_args(['build-reference', 'a', 'b' ,'-c', '5', '-s', 'd'])

In [38]:
vars(a)

{'reference': 'GRCh38',
 'bucket': None,
 'proc': 'build-reference',
 'reference-vcf': 'a',
 'population-tsv': 'b',
 'samples': 'd',
 'pop_col': 5,
 'k': 10,
 'af_min': 0.01,
 'hwe_p': 1e-06,
 'ld_r2': 0.1}