In [9]:
## COMMAND LINE
import sys

# simulate command line arguments
command_line = 'winpca.ipynb pca phyllotis.chr11.beagle.gz chr11:1-119662310 phy_tests -w 250000 -i 5000000 -v GL'
command_line = command_line.strip()
sys.argv = command_line.split(' ')

In [10]:
#!/usr/bin/env python3

"""
WinPCA. A package for windowed PC analysis.
"""

## IMPORT CONFIG
from modules import config

## IMPORT MODULES
from modules.cli import CLI
from modules.data import WPCAData
from modules.log import Log

## INSTANTIATE LOGGER
log = Log()



#!/usr/bin/env python3

"""
WinPCA. A package for windowed PC analysis.
"""

## IMPORT CONFIG
from modules import config

## IMPORT MODULES
from modules.cli import CLI
from modules.data import WPCAData
from modules.log import Log

## INSTANTIATE LOGGER
log = Log()



## MAIN

# def main():
#     '''
#     Execute WinPCA.
#     '''

## PARSE COMMAND LINE ARGUMENTS

# instantiate, call subparsers & parse
cli = CLI()
cli.pca()
cli.polarize()
cli.flip()
cli.chromplot()
cli.genomeplot()
cli.parse_args()
args_dct = cli.args_dct

# set mode
mode = args_dct['winpca']


## MODE: WINDOWED PCA

if mode == 'pca':

    # print info
    log.newline()
    log.info('Performing windowed PCA')
    log.newline()

    # determine file format
    if args_dct['variant_file_path'].endswith('.gz'):
        file_fmt = args_dct['variant_file_path'].split('.')[-2].upper()
    else:
        file_fmt = args_dct['variant_file_path'].split('.')[-1].upper()

    # update n_threads/var_fmt in CONFIG
    config.N_THREADS = args_dct['n_threads']
    config.VAR_FMT = args_dct['var_fmt']

    # load module & instantiate
    from modules.windowed_pca import WPCA
    wpca = WPCA(
        variant_file_path = args_dct['variant_file_path'],
        file_fmt=file_fmt,
        var_fmt=args_dct['var_fmt'],
        sample_lst=args_dct['sample_lst'],
        chrom=args_dct['chrom'],
        start=args_dct['start'],
        stop=args_dct['end'],
        w_size=args_dct['w_size'],
        w_step=args_dct['w_step'],
        min_var_per_w=args_dct['min_var_per_w'],
        skip_monomorphic=args_dct['skip_monomorphic'],
        gt_mean_impute=args_dct['gt_mean_impute'],
        vcf_pass_filter=args_dct['vcf_pass_filter'],
        min_maf=args_dct['min_maf'],
        n_threads=args_dct['n_threads'],
        )

    # run, parse and write output
    wpca.window_parser()
    data = WPCAData(args_dct['prefix'], wpca)


## ELSE READ IN EXISTING DATA:

if mode in ['polarize', 'flip', 'chromplot']:

    # read data
    data = WPCAData(args_dct['prefix'])


## MODE: POLARIZE

if mode in ['pca', 'polarize'] \
    and not args_dct['polarize'] == 'skip':

    # print info
    log.newline()
    log.info('Polarizing PC data')

    # load module & instantiate
    from modules.transform_data import Polarize
    polarize = Polarize()

    # adaptive
    if args_dct['polarize'] == 'auto':
        if args_dct['pol_pc'] == 'both':
            data.modify_data(
                'pc_1', polarize.adaptive, args_dct['n_prev_windows']
            )
            data.modify_data(
                'pc_2', polarize.adaptive, args_dct['n_prev_windows']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['pol_pc']), polarize.adaptive, \
                    args_dct['n_prev_windows']
            )

    # using guide samples
    if args_dct['polarize'] == 'guide_samples':
        if args_dct['pol_pc'] == 'both':
            data.modify_data(
                'pc_1', polarize.guide_samples, args_dct['guide_sample_lst']
            )
            data.modify_data(
                'pc_2', polarize.guide_samples, args_dct['guide_sample_lst']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['pol_pc']), polarize.guide_samples, \
                    args_dct['guide_sample_lst']
            )


## MODE: FLIP

if mode == 'flip':

    # print info
    log.newline()
    log.info('Flipping PC data')

    # load module & instantiate
    from modules.transform_data import Flip
    flip = Flip()

    # reflect (entire)
    if args_dct['reflect']:
        if args_dct['flip_pc'] == 'both':
            data.modify_data(
                'pc_1', flip.flip_chrom
            )
            data.modify_data(
                'pc_2', flip.flip_chrom
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['flip_pc']), flip.flip_chrom
            )
    # flip specified windows
    if args_dct['flip_windows']:
        if args_dct['flip_pc'] == 'both':
            data.modify_data(
                'pc_1', flip.flip_windows,
                args_dct['flip_window_lst']
            )
            data.modify_data(
                'pc_2', flip.flip_windows,
                args_dct['flip_window_lst']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['flip_pc']), flip.flip_windows,
                args_dct['flip_window_lst']
            )


## WRITE DATA TO FILES

# except plot modes
if mode in ['pca', 'polarize', 'flip']:
    data.to_files()


## MODE: CHROMPLOT

if mode  == 'chromplot':

    # print info
    log.newline()
    log.info('Creating chromosome plot')

    # infer stat_var
    if args_dct['plot_var'] == 'pc_1':
        stat_var = f'{args_dct["plot_var"]}_ve'
    if args_dct['plot_var'] == 'pc_2':
        stat_var = f'{args_dct["plot_var"]}_ve'
    if args_dct['plot_var'] == 'hetp':
        stat_var = 'n_var'

    # load module & instantiate
    from modules.plot import Plot
    plot = Plot(args_dct['plot_var'],
                stat_var=stat_var,
                prefix=args_dct['prefix'],
                data=data,
                chrom=args_dct['start'],
                start=args_dct['start'],
                end=args_dct['end'],
                metadata_path=args_dct['metadata_path'],
                color_by=args_dct['color_by'],
                hex_code_dct=args_dct['hex_code_dct'],
                interval=args_dct['interval'],
                chromplot_w=config.CHROMPLOT_W,
                chromplot_h=config.CHROMPLOT_H,
                plot_fmt_lst=args_dct['plot_fmt_lst'],
                numeric=args_dct['numeric'],
                reverse=args_dct['reverse'],
    )
    plot.chromplot()
    plot.savefig()


# MODE: GENOMEPLOT

if mode == 'genomeplot':

    # print info
    log.newline()
    log.info('Creating genome-wide plot')

    # load module & instantiate
    from modules.plot import Plot
    plot = Plot(args_dct['plot_var'],
                run_prefix=args_dct['run_prefix'],
                run_id_lst=args_dct['run_id_lst'],
                metadata_path=args_dct['metadata_path'],
                color_by=args_dct['color_by'],
                hex_code_dct=args_dct['hex_code_dct'],
                interval=args_dct['interval'],
                genomeplot_w=config.GENOMEPLOT_W,
                genomeplot_h=config.GENOMEPLOT_H,
                plot_fmt_lst=args_dct['plot_fmt_lst'],
                numeric=args_dct['numeric'],
                reverse=args_dct['reverse'],
    )
    plot.genomeplot()
    plot.savefig()


# END

# print info
log.newline()
log.info('Done')
log.newline()

# # # EXECUTE
# # if __name__ == "__main__":
# #     main()



[INFO] Performing windowed PCA.




BEAGLE
Copy variant_sample_lst
Copy 2 columns


[INFO] Processed 1/24 windows.
[INFO] Processed 2/24 windows.


[INFO] Processed 3/24 windows.
[INFO] Processed 4/24 windows.
[INFO] Processed 5/24 windows.
[INFO] Processed 6/24 windows.
[INFO] Processed 7/24 windows.
[INFO] Processed 8/24 windows.
[INFO] Processed 9/24 windows.
[INFO] Processed 10/24 windows.
[INFO] Processed 11/24 windows.
[INFO] Processed 12/24 windows.
[INFO] Processed 13/24 windows.
[INFO] Processed 14/24 windows.
[INFO] Processed 15/24 windows.
[INFO] Processed 16/24 windows.
[INFO] Processed 17/24 windows.
[INFO] Processed 18/24 windows.
[INFO] Processed 19/24 windows.
[INFO] Processed 20/24 windows.
[INFO] Processed 21/24 windows.
[INFO] Processed 22/24 windows.
[INFO] Processed 23/24 windows.
[INFO] Processed 24/24 windows.


[INFO] Processed all windows.


[INFO] Polarizing PC data.


[INFO] Done.




In [1]:
# args_dct['plot_var'] = 'pc_1'
# args_dct['metadata_path'] = 'phyllotis.metadata.tsv'
# args_dct['color_by'] = 'species'
# args_dct['hex_code_dct'] = None
# args_dct['interval'] = 1
# args_dct['plot_fmt_lst'] = ['PDF']
# args_dct['numeric'] = False
# args_dct['reverse'] = False

# # infer stat_var
# if args_dct['plot_var'] == 'pc_1':
#     stat_var = f'{args_dct["plot_var"]}_ve'
# if args_dct['plot_var'] == 'pc_2':
#     stat_var = f'{args_dct["plot_var"]}_ve'
# if args_dct['plot_var'] == 'hetp':
#     stat_var = 'n_var'

# # load module & instantiate
# from modules.plot import Plot
# plot = Plot(args_dct['plot_var'],
#             stat_var=stat_var,
#             prefix=args_dct['prefix'],
#             data=data,
#             chrom=args_dct['start'],
#             start=args_dct['start'],
#             end=args_dct['end'],
#             metadata_path=args_dct['metadata_path'],
#             color_by=args_dct['color_by'],
#             hex_code_dct=args_dct['hex_code_dct'],
#             interval=args_dct['interval'],
#             chromplot_w=config.CHROMPLOT_W,
#             chromplot_h=config.CHROMPLOT_H,
#             plot_fmt_lst=args_dct['plot_fmt_lst'],
#             numeric=args_dct['numeric'],
#             reverse=args_dct['reverse'],
# )
# plot.chromplot()
# plot.fig.show()

In [3]:
import pandas as pd

In [58]:
df_dct = pd.read_csv('/Users/moritzblumer/Downloads/12936_2021_3757_MOESM1_ESM.csv', index_col=0).to_dict(orient='index')

out_dct = {}

for i in set([x[:-1] for x in df_dct.keys()]):
    out_dct[i] = {}
    out_dct[i]['haplotype'] = '-'.join(sorted([df_dct[i + 'a']['Clusters'], df_dct[i + 'b']['Clusters']])).replace('C1-wt', 'wt-C1').replace('C2-wt', 'wt-C2').replace('C3-wt', 'wt-C3').replace('C4-wt', 'wt-C4').replace('C5-wt', 'wt-C5')

pd.DataFrame.from_dict(out_dct, orient='index').to_csv('~/Downloads/CYP6M2_haplotypes.tsv', sep='\t')


In [57]:
df_dct = pd.read_csv('/Users/moritzblumer/Downloads/supp_table_haplotype_group_comparison_panel.csv', index_col=0).to_dict(orient='index')

out_dct = {}

for i in set([x[:-1] for x in df_dct.keys()]):
    out_dct[i] = {}
    out_dct[i]['haplotype'] = '-'.join(sorted([df_dct[i + 'a']['super_hierarchy_haplotype'], df_dct[i + 'b']['super_hierarchy_haplotype']]))

pd.DataFrame.from_dict(out_dct, orient='index').to_csv('~/Downloads/VGSC_haplotypes.tsv', sep='\t')


In [55]:
set(pd.DataFrame.from_dict(out_dct, orient='index')['haplotype'])

{'F-F', 'F-S', 'F-WT', 'S-S', 'S-WT', 'WT-WT'}

In [41]:
import numpy as np
import allel

arr =  np.load('/Users/moritzblumer/Downloads/hierarchical_cluster_membership.npy')
string_arr = np.char.decode(arr, 'ascii')
len(string_arr)

2284

In [37]:
with open('/Users/moritzblumer/Downloads/hierarchical_cluster_membership.npy', 'rb') as f:
    content = f.read()
content

b"\x93NUMPY\x01\x00v\x00{'descr': '|S2', 'fortran_order': False, 'shape': (2284,), }                                                         \nF1F1F1F1F1F1F1\x00\x00F1\x00\x00F1F1F1\x00\x00F1F1F1F1F1\x00\x00F1F1F1F1F1F1F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00\x00\x00F1F1F1F1F1\x00\x00F1F1F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1F1F1F1F1F1F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1\x00\x00\x00\x00F1F1F1\x00\x00F1\x00\x00F1F1F1F1F1F1\x00\x00\x00\x00\x00\x00F1F1F1F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1F1F1F1F1\x00\x00F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1F1F1\x00\x00F1F1F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1F1F1F1F1F1F1\x00\x00F1\x00\x00\x00\x00F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1F1\x00\x00F1F1F1F

In [42]:
# load haplotypes
callset_haps = np.load('/Users/moritzblumer/Downloads/haps_phase2.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
n_variants = haps.shape[0]
n_haps = haps.shape[1]
n_variants, n_haps

(390588, 2284)

In [44]:
haps

Unnamed: 0,0,1,2,3,4,...,2279,2280,2281,2282,2283,Unnamed: 12
0,0,0,0,0,0,...,0,0,0,0,0,
1,0,0,0,0,0,...,0,0,0,0,0,
2,0,0,0,0,0,...,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
390585,0,0,0,0,0,...,0,0,0,0,0,
390586,0,0,0,0,0,...,0,0,0,0,0,
390587,0,0,0,0,0,...,0,0,0,0,0,
