In [3]:
#!/usr/bin/env python3
#
# Moritz Blumer, 2024

"""
WinPCA. A package for windowed PC analysis.
"""

## IMPORT MODULES
from modules import config                                                      ### DELETE
from modules.cli import CLI
from modules.windowed_pca import WPCA
from modules.data import WPCA_Data
from modules.transform_data import Polarize
from modules.transform_data import Flip

# from modules.error_handling import ErrorHandling ## add

##### DELETE #####
import shlex
import sys

# Simulate command line arguments (development only).
# Exactly one `command_line` assignment is active at a time; the alternatives
# are kept commented out so that a later assignment cannot silently override
# an earlier one.

# command_line = 'winpca.ipynb pca test_dataset/input/sample.vcf.gz chr1:1-30000000 test/vcf-gt -s samples.tsv -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca test_dataset/input/sample.vcf.gz chr1:1-30000000 test/vcf-pl -v PL -s samples.tsv -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca test_dataset/input/sample.gt.tsv.gz chr1:1-30000000 test/tsv-gt -s samples.tsv -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca test_dataset/input/sample.pl.tsv.gz chr1:1-30000000 test/tsv-pl -v PL -s samples.tsv -p auto -w 1000000 -i 100000 -m 0.01'
command_line = 'winpca.ipynb pca test_dataset/input/sample.gl.tsv.gz chr1:1-30000000 test/tsv-gl -v GL -s samples.tsv -p auto -w 1000000 -i 100000 -m 0.01'

# command_line = 'winpca.ipynb polarize -n test -p auto -c 2'
# command_line = 'winpca.ipynb chromplot -n test -r chr1:1-30000000 -m test_dataset/input/metadata.tsv -i 10 -f PDF,html -g inversion_state -c ancestral:eb4034,inverted:2f35a8,heterozygous:197d34'
# command_line = 'winpca.ipynb genomeplot -p genome_plot_test/ -r chr1,chr2,chr3,chr4,chr5 -m test_dataset/input/metadata.tsv -i 10 -f PDF,html -g inversion_state -c ancestral:eb4034,inverted:2f35a8,heterozygous:197d34'
# command_line = 'winpca.ipynb pca -h'
# command_line = 'winpca.ipynb flip -n test --r -w flip_windows.tsv'

# command_line = 'winpca.ipynb pca /Users/moritzblumer/Downloads/all.Bamakoset.2.recode.vcf.gz 2L:1-49364325 Bamako.2L.all -s Bamako.samples.lst -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca /Users/moritzblumer/Downloads/all.Bamakoset.2.recode.vcf.gz 2R:1-61545105 Bamako.2R.all -s Bamako.samples.lst -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca /Users/moritzblumer/Downloads/all.Bamakoset.2.recode.vcf.gz 3L:1-41963435 Bamako.3L.all -s Bamako.samples.lst -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca /Users/moritzblumer/Downloads/all.Bamakoset.2.recode.vcf.gz 3R:1-53200684 Bamako.3R.all -s Bamako.samples.lst -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca /Users/moritzblumer/Downloads/all.Bamakoset.2.recode.vcf.gz X:1-24393108 Bamako.X.all -s Bamako.samples.lst -p auto -w 1000000 -i 100000 -m 0.01'
# command_line = 'winpca.ipynb pca /Users/moritzblumer/Downloads/all.Bamakoset.2.recode.vcf.gz UNKN:1-42389979 Bamako.UNKN.all -s Bamako.samples.lst -p auto -w 1000000 -i 100000 -m 0.01'
# # command_line = 'winpca.ipynb chromplot Bamako.2L 2L:1-49364325 -f PDF,html'
# command_line = 'winpca.ipynb genomeplot Bamako. 2L.all,2R.all,3L.all,3R.all,X.all,UNKN.all -m Bamako.metadata.tsv -g taxon -f PDF,html'

# shlex.split tokenizes like a POSIX shell: runs of whitespace do not produce
# empty arguments and quoted values stay in one piece (note that backslashes
# are treated as escape characters).
sys.argv = shlex.split(command_line)
##### DELETE #####


## PARSE COMMAND LINE ARGUMENTS

# instantiate the CLI and call one set-up method per sub-command, in order
cli = CLI()
for subcommand in ('pca', 'polarize', 'flip', 'chromplot', 'genomeplot'):
    getattr(cli, subcommand)()

# parse and fetch the resulting argument dictionary
cli.parse_args()
args_dct = cli.args_dct

# the selected mode is stored under the 'winpca' key
mode = args_dct['winpca']


## WINDOWED PCA

if mode == 'pca':

    # determine file format: the upper-cased dot-separated token just before a
    # trailing '.gz', otherwise the last dot-separated token of the path
    path_tokens = args_dct['variant_file_path'].split('.')
    fmt_token_idx = -2 if args_dct['variant_file_path'].endswith('.gz') else -1
    file_fmt = path_tokens[fmt_token_idx].upper()

    # collect the keyword arguments for the windowed PCA
    # (the parsed 'end' coordinate is handed over as `stop`)
    wpca_kwargs = {
        'variant_file_path': args_dct['variant_file_path'],
        'file_fmt': file_fmt,
        'var_fmt': args_dct['var_fmt'],
        'sample_lst': args_dct['sample_lst'],
        'chrom': args_dct['chrom'],
        'start': args_dct['start'],
        'stop': args_dct['end'],
        'w_size': args_dct['w_size'],
        'w_step': args_dct['w_step'],
        'min_var_per_w': args_dct['min_var_per_w'],
        'skip_monomorphic': args_dct['skip_monomorphic'],
        'min_maf': args_dct['min_maf'],
        'n_threads': args_dct['n_threads'],
    }
    wpca = WPCA(**wpca_kwargs)

    # run the window parser, then wrap the WPCA object in the data container
    wpca.window_parser()
    data = WPCA_Data(args_dct['prefix'], wpca)


## ELSE READ IN EXISTING DATA:

else:
    # every other mode builds the data container from the prefix alone
    data = WPCA_Data(args_dct['prefix'])


## POLARIZE

if mode in ['pca', 'pcangsd', 'polarize'] and args_dct['polarize'] != 'skip':

    polarize = Polarize()

    # polarization mode -> (Polarize method name, args_dct key of the extra
    # argument forwarded to data.modify_data); 'auto' is the adaptive method,
    # 'guide_samples' polarizes using guide samples
    for pol_mode, method_name, extra_arg_key in (
        ('auto', 'adaptive', 'n_prev_windows'),
        ('guide_samples', 'guide_samples', 'guide_sample_lst'),
    ):
        if args_dct['polarize'] != pol_mode:
            continue

        # 'both' addresses pc_1 and pc_2, any other value a single 'pc_<value>'
        if args_dct['pol_pc'] == 'both':
            target_pcs = ['pc_1', 'pc_2']
        else:
            target_pcs = ['pc_' + str(args_dct['pol_pc'])]

        for target_pc in target_pcs:
            data.modify_data(
                target_pc,
                getattr(polarize, method_name),
                args_dct[extra_arg_key],
            )


## FLIP

if mode == 'flip':

    flip = Flip()

    # reflect (entire): apply flip.flip_chrom to the selected PC(s)
    if args_dct['reflect']:
        if args_dct['flip_pc'] == 'both':
            reflect_pcs = ['pc_1', 'pc_2']
        else:
            reflect_pcs = ['pc_' + str(args_dct['flip_pc'])]
        for reflect_pc in reflect_pcs:
            data.modify_data(reflect_pc, flip.flip_chrom)

    # flip specified windows: apply flip.flip_windows together with the window
    # list to the selected PC(s); evaluated independently of the reflect step
    if args_dct['flip_windows']:
        if args_dct['flip_pc'] == 'both':
            window_pcs = ['pc_1', 'pc_2']
        else:
            window_pcs = ['pc_' + str(args_dct['flip_pc'])]
        for window_pc in window_pcs:
            data.modify_data(
                window_pc, flip.flip_windows, args_dct['flip_window_lst']
            )


## WRITE DATA TO FILES

# Runs unconditionally, i.e. in every mode (including 'chromplot' and
# 'genomeplot'), on whichever `data` object was created above.
# NOTE(review): what is written is defined by WPCA_Data.to_files (not visible
# here) -- confirm that re-writing in the plotting modes is intended.
data.to_files()


## CHROMPLOT

if mode == 'chromplot':
    # imported inside the branch, so modules.plot is only loaded when plotting
    from modules.plot import Plot

    # plot pc_1 along the requested region; 'pc_1_ve' is passed as the
    # accompanying statistic variable and the figure size comes from config
    plot = Plot(
        'pc_1',
        stat_var='pc_1_ve',
        prefix=args_dct['prefix'],
        data=data,
        # fix: `chrom` previously received args_dct['start'] (the start
        # coordinate) instead of the chromosome name
        chrom=args_dct['chrom'],
        start=args_dct['start'],
        end=args_dct['end'],
        color_by=args_dct['color_by'],
        metadata_path=args_dct['metadata_path'],
        interval=args_dct['interval'],
        chromplot_w=config.CHROMPLOT_W,
        chromplot_h=config.CHROMPLOT_H,
        plot_fmt_lst=args_dct['plot_fmt_lst'],
    )
    plot.chromplot()
    plot.savefig()


# GENOMEPLOT

if mode == 'genomeplot':
    # imported inside the branch, so modules.plot is only loaded when plotting
    from modules.plot import Plot

    # keyword arguments for the genome-wide plot: run selection and styling
    # come from the parsed arguments, the figure size from config
    genomeplot_kwargs = dict(
        run_prefix=args_dct['run_prefix'],
        run_id_lst=args_dct['run_id_lst'],
        color_by=args_dct['color_by'],
        metadata_path=args_dct['metadata_path'],
        interval=args_dct['interval'],
        genomeplot_w=config.GENOMEPLOT_W,
        genomeplot_h=config.GENOMEPLOT_H,
        plot_fmt_lst=args_dct['plot_fmt_lst'],
    )

    plot = Plot('pc_1', **genomeplot_kwargs)
    plot.genomeplot()
    plot.savefig()

[INFO] Processed 1 of 291 windows
[INFO] Processed 2 of 291 windows
[INFO] Processed 3 of 291 windows
[INFO] Processed 4 of 291 windows
[INFO] Processed 5 of 291 windows
[INFO] Processed 6 of 291 windows
[INFO] Processed 7 of 291 windows
[INFO] Processed 8 of 291 windows
[INFO] Processed 9 of 291 windows
[INFO] Processed 10 of 291 windows
[INFO] Processed 11 of 291 windows
[INFO] Processed 12 of 291 windows
[INFO] Processed 13 of 291 windows
[INFO] Processed 14 of 291 windows
[INFO] Processed 15 of 291 windows
[INFO] Processed 16 of 291 windows
[INFO] Processed 17 of 291 windows
[INFO] Processed 18 of 291 windows
[INFO] Processed 19 of 291 windows
[INFO] Processed 20 of 291 windows
[INFO] Processed 21 of 291 windows
[INFO] Processed 22 of 291 windows
[INFO] Processed 23 of 291 windows
[INFO] Processed 24 of 291 windows
[INFO] Processed 25 of 291 windows
[INFO] Processed 26 of 291 windows
[INFO] Processed 27 of 291 windows
[INFO] Processed 28 of 291 windows
[INFO] Processed 29 of 291 wi

In [3]:
# Scratch overrides: rebind `mode` and overwrite plotting-related entries of
# the already parsed `args_dct` (both must exist in the kernel from the main cell).
mode = 'chromplot'
args_dct.update({
    'color_by': 'inversion_state',
    'metadata_path': 'test_dataset/input/metadata.tsv',
    'interval': None,
    'plot_fmt_lst': ['pdf', 'html'],
})


# WRITE RESULTS

# write results to files (uses the `data` object left over from the main cell)
data.to_files()

In [2]:
# 3

# `plot` is only bound inside the `mode == 'chromplot'` / `mode == 'genomeplot'`
# branches of the main cell, so this expression raises NameError unless one of
# those branches has run in the current kernel.
plot.fig

NameError: name 'plot' is not defined

In [9]:
# Evaluate the `fig` attribute of the current `plot` object as the cell result
# (requires `plot` to have been created by a plotting branch of the main cell).
plot.fig

In [4]:
# Scratch check: prints 'b' when the list is empty.
# An empty list is falsy, so the truthiness test replaces the comparison
# against a freshly built `[]`.
a = []
if not a:
    print('b')

b


In [7]:
import numpy as np

# Remove the columns at index 2, 5, 8, ... (every third column, starting at
# index 2) along axis 1.
# NOTE: `gls` must already exist in the kernel and is rebound to the reduced
# array, so running this cell again removes further columns.
# NOTE(review): presumably `gls` holds three values per sample -- confirm.
every_third_column = slice(2, None, 3)
gls = np.delete(gls, every_third_column, axis=1)

np.delete method: 0.00775 seconds
boolean mask method: 0.00764 seconds


In [None]:
# NOTE(review): unfinished scratch cell -- not valid Python as it stands:
#   * `sample_idx_lst =` has no right-hand side,
#   * `self` is used outside of any class/method visible here,
#   * `format` is not defined in the visible cells (the builtin `format` has
#     no `.index`).
# The comprehension splits each selected entry of `gt_fields` on ':' picks the
# sub-field at position `format.index(self.var_fmt)` and splits that on ','.
# Presumably a draft for extracting per-sample likelihood values -- TODO confirm.
gt_fields = ['0/0:0,15,172', '0/0:0,18,186', '0/0:0,51,255']
sample_idx_lst = 
gls = [
    gt_fields[idx].split(':')[

        format.index(self.var_fmt)].split(',') \
        

        for idx in sample_idx_lst
]
