In [5]:
## COMMAND LINE
import sys

# simulate command line arguments

command_line = 'winpca.ipynb pca anopheles.vcf.gz 2L:1-10000000 anopheles_miss -i 10000'
command_line = 'winpca.ipynb chromplot anopheles_miss 2L:1-10000000'
command_line = 'winpca.ipynb pca anopheles.vcf.gz 2L:1-10000000 anopheles_nomiss -i 10000'
command_line = 'winpca.ipynb chromplot anopheles_nomiss 2L:1-10000000'

sys.argv = command_line.split(' ')





In [6]:
#!/usr/bin/env python3

"""
WinPCA. A package for windowed PC analysis.
"""

## IMPORT CONFIG
from modules import config

## IMPORT MODULES
from modules.cli import CLI
from modules.data import WPCAData
from modules.log import Log

## INSTANTIATE LOGGER
log = Log()



## MAIN

# def main():
#     '''
#     Execute WinPCA.
#     '''

## PARSE COMMAND LINE ARGUMENTS

# instantiate, call subparsers & parse
cli = CLI()
cli.pca()
cli.polarize()
cli.flip()
cli.chromplot()
cli.genomeplot()
cli.parse_args()
args_dct = cli.args_dct

# set mode
mode = args_dct['winpca']


## MODE: WINDOWED PCA

if mode == 'pca':

    # print info
    log.newline()
    log.info('Performing windowed PCA')
    log.newline()

    # determine file format
    if args_dct['variant_file_path'].endswith('.gz'):
        file_fmt = args_dct['variant_file_path'].split('.')[-2].upper()
    else:
        file_fmt = args_dct['variant_file_path'].split('.')[-1].upper()

    # update n_threads/var_fmt in CONFIG
    config.N_THREADS = args_dct['n_threads']
    config.VAR_FMT = args_dct['var_fmt']

    # load module & instantiate
    from modules.windowed_pca import WPCA
    wpca = WPCA(
        variant_file_path = args_dct['variant_file_path'],
        file_fmt=file_fmt,
        var_fmt=args_dct['var_fmt'],
        sample_lst=args_dct['sample_lst'],
        chrom=args_dct['chrom'],
        start=args_dct['start'],
        stop=args_dct['end'],
        w_size=args_dct['w_size'],
        w_step=args_dct['w_step'],
        min_var_per_w=args_dct['min_var_per_w'],
        skip_monomorphic=args_dct['skip_monomorphic'],
        vcf_pass_filter=args_dct['vcf_pass_filter'],
        min_maf=args_dct['min_maf'],
        n_threads=args_dct['n_threads'],
        )

    # run, parse and write output
    wpca.window_parser()
    data = WPCAData(args_dct['prefix'], wpca)


## ELSE READ IN EXISTING DATA:

if mode in ['polarize', 'flip', 'chromplot']:

    # read data
    data = WPCAData(args_dct['prefix'])


## MODE: POLARIZE

if mode in ['pca', 'polarize'] \
    and not args_dct['polarize'] == 'skip':

    # print info
    log.newline()
    log.info('Polarizing PC data')

    # load module & instantiate
    from modules.transform_data import Polarize
    polarize = Polarize()

    # adaptive
    if args_dct['polarize'] == 'auto':
        if args_dct['pol_pc'] == 'both':
            data.modify_data(
                'pc_1', polarize.adaptive, args_dct['n_prev_windows']
            )
            data.modify_data(
                'pc_2', polarize.adaptive, args_dct['n_prev_windows']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['pol_pc']), polarize.adaptive, \
                    args_dct['n_prev_windows']
            )

    # using guide samples
    if args_dct['polarize'] == 'guide_samples':
        if args_dct['pol_pc'] == 'both':
            data.modify_data(
                'pc_1', polarize.guide_samples, args_dct['guide_sample_lst']
            )
            data.modify_data(
                'pc_2', polarize.guide_samples, args_dct['guide_sample_lst']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['pol_pc']), polarize.guide_samples, \
                    args_dct['guide_sample_lst']
            )


## MODE: FLIP

if mode == 'flip':

    # print info
    log.newline()
    log.info('Flipping PC data')

    # load module & instantiate
    from modules.transform_data import Flip
    flip = Flip()

    # reflect (entire)
    if args_dct['reflect']:
        if args_dct['flip_pc'] == 'both':
            data.modify_data(
                'pc_1', flip.flip_chrom
            )
            data.modify_data(
                'pc_2', flip.flip_chrom
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['flip_pc']), flip.flip_chrom
            )
    # flip specified windows
    if args_dct['flip_windows']:
        if args_dct['flip_pc'] == 'both':
            data.modify_data(
                'pc_1', flip.flip_windows,
                args_dct['flip_window_lst']
            )
            data.modify_data(
                'pc_2', flip.flip_windows,
                args_dct['flip_window_lst']
            )
        else:
            data.modify_data(
                'pc_' + str(args_dct['flip_pc']), flip.flip_windows,
                args_dct['flip_window_lst']
            )


## WRITE DATA TO FILES

# except plot modes
if mode in ['pca', 'polarize', 'flip']:
    data.to_files()


## MODE: CHROMPLOT

if mode  == 'chromplot':

    # print info
    log.newline()
    log.info('Creating chromosome plot')

    # infer stat_var
    if args_dct['plot_var'] == 'pc_1':
        stat_var = f'{args_dct["plot_var"]}_ve'
    if args_dct['plot_var'] == 'pc_2':
        stat_var = f'{args_dct["plot_var"]}_ve'
    if args_dct['plot_var'] == 'hetp':
        stat_var = 'n_var'

    # load module & instantiate
    from modules.plot import Plot
    plot = Plot(args_dct['plot_var'],
                stat_var=stat_var,
                prefix=args_dct['prefix'],
                data=data,
                chrom=args_dct['start'],
                start=args_dct['start'],
                end=args_dct['end'],
                metadata_path=args_dct['metadata_path'],
                color_by=args_dct['color_by'],
                hex_code_dct=args_dct['hex_code_dct'],
                interval=args_dct['interval'],
                chromplot_w=config.CHROMPLOT_W,
                chromplot_h=config.CHROMPLOT_H,
                plot_fmt_lst=args_dct['plot_fmt_lst'],
    )
    plot.chromplot()
    plot.savefig()


# MODE: GENOMEPLOT

if mode == 'genomeplot':

    # print info
    log.newline()
    log.info('Creating genome-wide plot')

    # load module & instantiate
    from modules.plot import Plot
    plot = Plot(args_dct['plot_var'],
                run_prefix=args_dct['run_prefix'],
                run_id_lst=args_dct['run_id_lst'],
                metadata_path=args_dct['metadata_path'],
                color_by=args_dct['color_by'],
                hex_code_dct=args_dct['hex_code_dct'],
                interval=args_dct['interval'],
                genomeplot_w=config.GENOMEPLOT_W,
                genomeplot_h=config.GENOMEPLOT_H,
                plot_fmt_lst=args_dct['plot_fmt_lst'],
    )
    plot.genomeplot()
    plot.savefig()


# END

# print info
log.newline()
log.info('Done')
log.newline()


# # EXECUTE
# if __name__ == "__main__":
#     main()




[INFO] Reading data from prefix "anopheles_nomiss*".


[INFO] Creating chromosome plot.


[INFO] Done.




In [9]:
plot.fig

In [7]:
plot.fig

In [12]:
import plotly.express as px
import numpy as np
import allel

test_arr = np.array(
    [
        [0, 2, 0],
        [0, 2, 2],
        [0, 2, 2],
        [0, 2, 2],
    ]
)

pca = allel.pca(
    test_arr,
    n_components=2,
    copy=True,
    scaler='patterson',
    ploidy=2,
)

# Create order annotations (0, 1, 2, 3,...)
orders = np.arange(len(pca[0]))

# Create scatter plot with hover annotations
fig = px.scatter(x=pca[0][:, 0], y=pca[0][:, 1], hover_name=orders)

# Show the plot
fig.show()

In [24]:
import plotly.express as px
import numpy as np
import allel

test_arr = np.array(
    [
        [0.5, 2, 0],
        [0, 2, 2],
        [2, 2, 2],
        [0, 2, 2],
    ]
)

pca = allel.pca(
    test_arr,
    n_components=2,
    copy=True,
    scaler='patterson',
    ploidy=2,
)

# Create order annotations (0, 1, 2, 3,...)
orders = np.arange(len(pca[0]))

# Create scatter plot with hover annotations
fig = px.scatter(x=pca[0][:, 0], y=pca[0][:, 1], hover_name=orders)

# Show the plot
fig.show()

ValueError: array must not contain infs or NaNs

In [12]:
import numpy as np
from scipy import stats

# Original genotype array with None
gt_array = np.array([
    [None, 1, 2, 0],
    [0, 1, 2, 1],
    [None, 1, 2, 0],
], dtype=object)

# Function to impute missing values with the mode for each row
# Iterate over each row (site)
for i in range(gt_array.shape[0]):
    row = gt_array[i]
    
    # Filter out None values to compute the mode of the non-None values
    non_none_values = [x for x in row if x is not None]
    
    # Compute the mode of the non-None values
    if non_none_values:
        mode_val = stats.mode(non_none_values)[0][0]
    
        # Replace None with the mode
        row = [mode_val if x is None else x for x in row]
        gt_array[i] = row

gt_array





array([[0, 1, 2, 0],
       [0, 1, 2, 1],
       [0, 1, 2, 0]], dtype=object)

In [14]:
stats.mode(non_none_values)[0][0]





0

In [29]:
test_arr = np.array(
    [
        [0, 1, 2, 0],
        [1, 2, 1, 0],
        [np.nan, 1, 2, np.nan],
        [0, 1, 2, 1],
        [0, 1, 2, 1],
        [0, 1, 2, 1],
    ]
)
print(test_arr)

# Step 1: Create a mask for rows that do NOT contain NaN
mask = ~np.isnan(test_arr).any(axis=1)

# Step 2: Use the mask to filter rows and drop those containing NaN
filtered_array = test_arr[mask]

print(filtered_array)


[[ 0.  1.  2.  0.]
 [ 1.  2.  1.  0.]
 [nan  1.  2. nan]
 [ 0.  1.  2.  1.]
 [ 0.  1.  2.  1.]
 [ 0.  1.  2.  1.]]
[[0. 1. 2. 0.]
 [1. 2. 1. 0.]
 [0. 1. 2. 1.]
 [0. 1. 2. 1.]
 [0. 1. 2. 1.]]


In [31]:
test_arr.shape[0]

6

In [35]:


w_gt_arr = np.array(
    [
        [0, 1, 2, 0],
        [1, 2, 1, 0],
        [np.nan, 1, 2, np.nan],
        [0, 1, 2, 1],
        [0, 1, 2, 1],
        [0, 1, 2, 1],
    ]
)
print(w_gt_arr)
def gt_min_maf_filter(w_gt_arr):
    '''
    Drop SNPs with minor allele frequency below specified value.
    '''

    # allele count
    n_alleles = 2 * w_gt_arr.shape[1]

    # calculate allel frequencies and multiple with -1 if AF > 0.5 (because
    # input data may not be polarized by major/minor allel)
    afs = np.nansum(w_gt_arr, axis=1) / n_alleles
    afs[afs > 0.5] = 1 - afs[afs > 0.5]

    # keep only sites where AF >= min_maf
    w_gt_arr = w_gt_arr[afs >= min_maf]

    return w_gt_arr


gt_min_maf_filter(w_gt_arr)


[[ 0.  1.  2.  0.]
 [ 1.  2.  1.  0.]
 [nan  1.  2. nan]
 [ 0.  1.  2.  1.]
 [ 0.  1.  2.  1.]
 [ 0.  1.  2.  1.]]


array([[0., 1., 2., 0.],
       [1., 2., 1., 0.],
       [0., 1., 2., 1.],
       [0., 1., 2., 1.],
       [0., 1., 2., 1.]])

In [56]:
min_maf=0.2
w_gt_arr = np.array(
    [
        [0, 1, 2, 0],
        [1, 2, 1, 0],
        [np.nan, 1, 2, np.nan],
        [2, 2, 2, 1],
        [0, 1, 2, 1],
        [0, 1, 2, 1],
    ]
)
print(w_gt_arr)


print('\n')
print(af_arr)
print('\n')
print(w_gt_arr)

[[ 0.  1.  2.  0.]
 [ 1.  2.  1.  0.]
 [nan  1.  2. nan]
 [ 2.  2.  2.  1.]
 [ 0.  1.  2.  1.]
 [ 0.  1.  2.  1.]]


[0.375 0.5   0.25  0.125 0.5   0.5  ]


[[ 0.  1.  2.  0.]
 [ 1.  2.  1.  0.]
 [nan  1.  2. nan]
 [ 0.  1.  2.  1.]
 [ 0.  1.  2.  1.]]


In [54]:
1/8

0.125

In [40]:
np.sum([True, False])

1