In [1]:
#!/usr/bin/env python3
#
# Moritz Blumer, 2024

"""
WinPCA. A package for windowed PC analysis.
"""

## IMPORT PACKAGES
import sys
#import pandas as pd


## IMPORT MODULES
from modules import config
from modules.cli import CLI
# from modules.error_handling import ErrorHandling ## add

0.01


In [2]:
## DELETE CELL

# simulate command line arguments
# command_line = 'winpca.ipynb pca -n test --vcf test_dataset/input/sample.vcf.gz -s ind_1,ind_2,ind_3,ind_4,ind_5,ind_6 -r chr1:1-33000000 -w 1000000 -i 100000 -m 0.01 -p auto'
command_line = 'winpca.ipynb pca -n test --vcf test_dataset/input/sample.vcf.gz -s samples.tsv -p auto -r chr1:1-30000000 -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb polarize -n test -p auto -c 2'
#command_line = 'winpca.ipynb chromplot -n test -r chr1:1-30000000 -m test_dataset/input/metadata.tsv -i 10 -f PDF,html -g inversion_state -c ancestral:eb4034,inverted:2f35a8,heterozygous:197d34'
command_line = 'winpca.ipynb genomeplot -p genome_plot_test/ -r chr1,chr2,chr3,chr4,chr5 -m test_dataset/input/metadata.tsv -i 10 -f PDF,html -g inversion_state -c ancestral:eb4034,inverted:2f35a8,heterozygous:197d34'
# command_line = 'winpca.ipynb pca -h'
command_line = 'winpca.ipynb flip -n test --r'
sys.argv = command_line.split(' ')


## PARSE COMMAND LINE ARGUMENTS

# instantiate
cli = CLI()

# call subparsers
cli.pca()
cli.pcangsd()
cli.polarize()
cli.flip()
cli.chromplot()
cli.genomeplot()

# parse argments
cli.parse_args()
args_dct = cli.args_dct

# enter mode
mode = args_dct['winpca']
args_dct

Namespace(flip_pc='1', flip_windows=None, prefix='test', reflect=True, winpca='flip')


{'winpca': 'flip',
 'prefix': 'test',
 'flip_windows': None,
 'reflect': True,
 'flip_pc': '1',
 'skip_monomorphic': False,
 'min_var_per_w': 25,
 'n_prev_windows': 5,
 'pol_pc': 'both',
 'chrom_plot_w': 1200,
 'chrom_plot_h': 400}

In [3]:
# WINDOWED PCA FROM CALLED GENOTYPES

if mode == 'pca':

    # import relevant modules
    from modules.windowed_pca import gt_wpca
    from modules.data import wpca_data

    # instantiate windowed PCA
    w_pca = gt_wpca(
        variant_file_path = args_dct['variant_file_path'],
        sample_lst = args_dct['sample_lst'],
        chrom = args_dct['chrom'],
        start = args_dct['start'],
        stop = args_dct['end'],
        w_size = args_dct['w_size'],
        w_step = args_dct['w_step'],
        skip_monomorphic=config.skip_monomorphic,
        )
    
    # run
    w_pca.win_vcf_gt()

    # parse run data
    data = wpca_data(args_dct['prefix'], w_pca)


# WINDOWED PCA FROM CALLED GENOTYPES

# elif mode == 'pcangsd':

#     [...]

# EXISTING DATA:

else:
    from modules.data import wpca_data
    data = wpca_data(args_dct['prefix'])


[INFO] Reading data from prefix "test*".


In [5]:
# POLARIZE

# polarize
if mode in ['pca', 'pcangsd', 'polarize'] \
    and not args_dct['polarize'] == 'skip':
    
    from modules.transform_data import Polarize
    polarize = Polarize()

    # adaptive
    if args_dct['polarize'] == 'auto':
        if args_dct['pol_pc'] == 'both':
            data.modify_data(
                'pc_1', polarize.adaptive, args_dct['n_prev_windows']
            )
            data.modify_data(
                'pc_2', polarize.adaptive, args_dct['n_prev_windows']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['pol_pc']), polarize.adaptive, \
                    args_dct['n_prev_windows']
            )

    # using guide samples
    if args_dct['polarize'] == 'guide_samples':
        if args_dct['pol_pc'] == 'both':
            data.modify_data(
                'pc_1', polarize.guide_samples, args_dct['guide_sample_lst']
            )
            data.modify_data(
                'pc_2', polarize.guide_samples, args_dct['guide_sample_lst']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['pol_pc']), polarize.guide_samples, \
                    args_dct['guide_sample_lst']
            )
else:
    print('skip') ### DELETE



In [8]:
# FLIP

if mode == 'flip':
    from modules.transform_data import Flip
    flip = Flip()
    if args_dct['flip_pc'] == 'both':
        data.modify_data('pc_1', flip.flip_chrom)
        data.modify_data('pc_2', flip.flip_chrom)
    else:
        data.modify_data('pc_' + str(args_dct['flip_pc']), flip.flip_chrom)
else:
    print('skip') ### DELETE

skip


In [82]:
# # WRITE RESULTS

# # create output directory if prefix contains '/'
# if '/' in args_dct['prefix']:
#     if not os.path.exists('/'.join(args_dct['prefix'].split('/')[0:-1]) + '/'):
#         os.makedirs('/'.join(args_dct['prefix'].split('/')[0:-1]) + '/')

# # write results to files
# data.to_files()

In [39]:

from modules import Plot

# plot = Plot('pc_1', stat_var='pc_1_ve', prefix='test', data=data,
#             chrom=args_dct['start'], start=args_dct['start'], end=args_dct['end'], 
#             color_by=args_dct['color_by'], 
#             metadata_path=args_dct['metadata_path'], 
#             interval=args_dct['interval'], 
#             plot_fmt_lst=args_dct['plot_fmt_lst'],
# )

plot = Plot('pc_1', 
            run_prefix=args_dct['run_prefix'],
            run_id_lst=args_dct['run_id_lst'],
            color_by=args_dct['color_by'], 
            metadata_path=args_dct['metadata_path'], 
            interval=args_dct['interval'], 
            plot_fmt_lst=args_dct['plot_fmt_lst'],
)

plot.genomeplot()
plot.fig.show()


[INFO] Reading data from prefix "genome_plot_test/chr1*".

[INFO] Reading data from prefix "genome_plot_test/chr2*".

[INFO] Reading data from prefix "genome_plot_test/chr3*".

[INFO] Reading data from prefix "genome_plot_test/chr4*".

[INFO] Reading data from prefix "genome_plot_test/chr5*".
