<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from utils.config import *
# Look up the local data folder for the 'hgdp' key via get_local_data_folder
# (presumably provided by the wildcard import of utils.config - confirm).
local_data_folder = get_local_data_folder('hgdp')
# Bare expression so the notebook displays the resolved folder.
local_data_folder

'C:/Data/HUJI/hgdp/'

In [2]:
# todo - think about the infra to get to each file, and naming conventions

# Process 012 files

In [3]:
#imports and consts
import os.path
import logging
import pandas as pd
import math

# Suffixes appended to the common input path prefix; TODO_NAME builds the two
# input paths as f'{path_to_012_files}{SUFFIX_012}' and f'{path_to_012_files}{SUFFIX_POS}'.
SUFFIX_POS = '.012.pos'
SUFFIX_012 = '.012'
# File-name templates filled in with str.format:
# - per-slice distances file (slice index + number of sites in the slice)
# - per-slice site ids file
# - per-run log file (chromosome name + MAF value passed to _set_logger)
OUTPUT_PATTERN_DIST_FILE = 'dist_slice_{slice_index}_{num_of_sites}_sites.tsv'
OUTPUT_PATTERN_SITES_FILE = 'sites_ids_slice_{slice_index}.tsv'
OUTPUT_PATTERN_LOG_FILE = 'Process012_{chr_name}_MAF_{maf}.log'

# Module-level logger shared by _set_logger (adds the file handler) and _log (emits records).
logger = logging.getLogger(__name__)

def _set_logger(chr_name, maf):
    """Attach the per-run log file to the module logger.

    The log file name is built from OUTPUT_PATTERN_LOG_FILE and is opened in
    append mode relative to the current working directory.

    Args:
        chr_name: chromosome name used in the log file name.
        maf: value used for the ``MAF_`` part of the log file name.
    """
    log_file = OUTPUT_PATTERN_LOG_FILE.format(chr_name=chr_name, maf=maf)
    # Without an explicit level the logger inherits WARNING from the root logger,
    # so every INFO record sent through _log() was dropped before reaching the
    # handler - that is why the log file used to stay empty.
    logger.setLevel(logging.INFO)
    # FileHandler stores the absolute path in baseFilename; re-running the cell for
    # the same chromosome/MAF must not stack a second handler (duplicate lines).
    target = os.path.abspath(log_file)
    for existing in logger.handlers:
        if isinstance(existing, logging.FileHandler) and existing.baseFilename == target:
            return
    fhandler = logging.FileHandler(filename=log_file, mode='a')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fhandler.setFormatter(formatter)
    logger.addHandler(fhandler)

def _log(msg, level=logging.INFO):
    """Print ``msg`` to stdout and emit it on the module logger at ``level``.

    Args:
        msg: the message to print and log.
        level: logging level for the logger record (defaults to logging.INFO).
    """
    print(msg)
    logger.log(level, msg)

In [4]:
# func params
# Common path prefix of the two input files; TODO_NAME appends SUFFIX_012 / SUFFIX_POS to it.
# Raw strings: in a regular literal '\D', '\H', '\h', '\m' and '\o' are invalid escape
# sequences (a warning on current Python versions). The resulting values are unchanged.
path_to_012_files = r'C:\Data\HUJI\hgdp\mock\out'
base_output_path = r'C:\Data\HUJI\hgdp\mock'
# if the minor allele frequency of a site is not in this range, an error is logged
min_minor_freq_expected = 0.49
max_minor_freq_expected = 0.5
# if we have 91 sites, we may have a file with 1 site in it (can be improved if we want..)
# TODO - validate with Gili
desired_num_of_sites_per_output = 10
# TODO - validate with Gili
# intended rule: if less than this fraction of the genotypes of a site are valid (not -1),
# the site is not included in the calc.
# NOTE(review): _check_guardrails currently only logs an error for such sites - confirm the intent.
min_valid_sites_precentage = 0.1

In [5]:
def _validate_file_exist(p):
    """Fail fast when a required input file is missing.

    Args:
        p: path that must point to an existing regular file.

    Raises:
        FileNotFoundError: if ``p`` is not an existing file. The error is logged
            first so it also reaches the run log.
    """
    if not os.path.isfile(p):
        # Previously this only logged (with a typo) and let the run continue until
        # a later open()/read_csv failed on the same missing file.
        msg = f"Can't find file {p}"
        _log(msg, logging.ERROR)
        raise FileNotFoundError(msg)

In [6]:
def _get_chr_name(path_pos):
    with open(path_pos, 'r') as f:
        l = f.readline()
        char_name = l.split('\t')[0]
        return char_name

In [7]:
def _prepare_output_folder(base_output_path, chr_name):
    """Create (if needed) and return the per-chromosome output directory.

    The returned string is ``<base_output_path>/<chr_name>/``; the trailing
    slash matters because callers build file paths by plain concatenation.
    """
    output_dir = '{}/{}/'.format(base_output_path, chr_name)
    _log(f'output_dir is {output_dir}')
    # exist_ok: re-running for the same chromosome reuses the existing folder
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


In [8]:
def _build_pairwise_db(n, v):
    return [[v]*(n-i) for i in range (1,n)]

In [9]:
def _slice_calc_pairwise_distances_with_guardrails(slice_index, num_of_slices, slice_df, min_valid_sites_precentage, min_minor_freq_expected, max_minor_freq_expected):
    """Accumulate pairwise counts and distances over every site of one slice.

    ``slice_df`` has one row per individual and one column per site. Two
    triangular tables (lists of lists, used for performance) are created and
    then updated in place, site by site.

    Returns:
        (slice_pairwise_counts, slice_pairwise_dist)
    """
    num_individuals = len(slice_df)
    slice_pairwise_counts = _build_pairwise_db(num_individuals, 0)
    slice_pairwise_dist = _build_pairwise_db(num_individuals, 0.0)
    # each column contributes its pairwise distances to the slice totals
    for site_index, _site_id in enumerate(slice_df.columns):
        _site_calc_pairwise_distances_with_guardrails(
            slice_index,
            num_of_slices,
            site_index,
            slice_df,
            min_valid_sites_precentage,
            min_minor_freq_expected,
            max_minor_freq_expected,
            slice_pairwise_counts,
            slice_pairwise_dist,
        )
    return slice_pairwise_counts, slice_pairwise_dist

In [10]:
def _site_calc_pairwise_distances(slice_index, num_of_slices, site_index, genotypes, num_individuals, ref_freq, non_ref_freq, slice_pairwise_counts, slice_pairwise_dist):
    """Add one site's contribution to the slice-level pairwise tables (in place).

    For every pair of individuals ``i1 < i2`` whose genotypes at this site are
    both valid (not -1), the pair's count is incremented and the pair's distance
    is increased by ``_calc_dist`` of the two genotypes. The pair is stored at
    ``[i1][i2 - i1 - 1]`` in both tables.
    """
    for i1 in range(num_individuals - 1):
        genotype_a = genotypes[i1]
        # progress message for every 100th individual, emitted whether or not it is valid
        if i1 % 100 == 0:
            _log(f'Slice: {slice_index}/{num_of_slices}. Site: {site_index}. Done with individual {i1}/{len(genotypes)}')
        # an invalid first individual contributes nothing to any of its pairs
        if genotype_a != -1:
            for column, i2 in enumerate(range(i1 + 1, num_individuals)):
                genotype_b = genotypes[i2]
                if genotype_b != -1:
                    # both entries are valid: count the pair, then add its distance
                    slice_pairwise_counts[i1][column] += 1
                    slice_pairwise_dist[i1][column] += _calc_dist(genotype_a, genotype_b, ref_freq, non_ref_freq)

def _site_calc_pairwise_distances_with_guardrails(slice_index, num_of_slices, site_index, slice_df, min_valid_sites_precentage, min_minor_freq_expected, max_minor_freq_expected, slice_pairwise_counts, slice_pairwise_dist):
    """Compute allele frequencies for one site, run the guardrails, then accumulate distances.

    Args:
        slice_index, num_of_slices: only used for progress messages.
        site_index: positional index of the site (column) inside ``slice_df``.
        slice_df: DataFrame with one row per individual and one column per site;
            genotypes are 0/1/2 (number of non-reference alleles) or -1 (missing).
        min_valid_sites_precentage, min_minor_freq_expected, max_minor_freq_expected:
            thresholds forwarded to ``_check_guardrails``.
        slice_pairwise_counts, slice_pairwise_dist: triangular tables updated in place.
    """
    genotypes = slice_df.iloc[:, site_index].values
    # get counts
    num_individuals = len(genotypes)
    # count the amount of not -1 in alleles (numpy reductions instead of Python-level sum/len)
    num_valid_genotypes = int((genotypes != -1).sum())
    if num_valid_genotypes == 0:
        # Every genotype is missing: allele frequencies are undefined (this used to
        # raise ZeroDivisionError) and no pair could contribute anything anyway.
        _log(f'ERROR: Site index: {site_index} has no valid genotypes, site is skipped.', logging.ERROR)
        return
    non_ref_count = genotypes[genotypes > 0].sum()
    ref_count = 2*num_valid_genotypes - non_ref_count
    # each valid individual carries two alleles
    non_ref_freq = float(non_ref_count)/(2*num_valid_genotypes)
    ref_freq = float(ref_count)/(2*num_valid_genotypes)
    _log(f'Site index: {site_index}, non ref allele frequency: {non_ref_freq}')
    _log(f'Site index: {site_index}, ref allele frequency: {ref_freq}')
    # guardrails
    assert abs(ref_freq+non_ref_freq-1) < 1e-04
    # NOTE(review): the guardrails only log; a site that violates them is still
    # included in the calculation below - confirm whether it should be excluded.
    _check_guardrails(num_individuals, num_valid_genotypes, ref_count, non_ref_count, min_valid_sites_precentage, min_minor_freq_expected, max_minor_freq_expected)

    # now when we have all freq and counts, we can start the pairwise comparison
    # takes 0.5 seconds for one site. This means ~4 days if we are not separating the task (which we will)
    # 178 sites took around 2 minutes.
    # Note that if we split this by chromosome, the big ones may have around 1000 in a given freq range,
    # which can take ~8 minutes. 10k will be around 83 minutes - also cool.
    _site_calc_pairwise_distances(slice_index, num_of_slices, site_index, genotypes, num_individuals, ref_freq, non_ref_freq, slice_pairwise_counts, slice_pairwise_dist)


In [11]:
def _test(v1_val, v2_val):
    print(v1_val, v2_val, f'(1-f_a){(v1_val*v2_val)} + (1-f_r){(2-v1_val)*(2-v2_val)}')
for v1 in range(3):
    for v2 in range(v1,3):
        _test(v1, v2)

0 0 (1-f_a)0 + (1-f_r)4
0 1 (1-f_a)0 + (1-f_r)2
0 2 (1-f_a)0 + (1-f_r)0
1 1 (1-f_a)1 + (1-f_r)1
1 2 (1-f_a)2 + (1-f_r)0
2 2 (1-f_a)4 + (1-f_r)0


In [12]:
def _calc_dist(i1_val, i2_val, ref_freq, non_ref_freq):
    # from VCFtools manual:
    # "Genotypes are represented as 0, 1 and 2, where the number represent that number of non-reference alleles"
    # So - v1_val and v2_val are the amount of non-ref alleles.
    # The distance function is:
    # 1/4[(1-f_a)(Iac+Iad) + (1-f_b)(Ibc+Ibd)]
    # Now, for each combination of v1_val and v2_val, we can compute the distance.
        
    # 0,0 - v1_val=0 and v2_val=0:
    #    1/4[(1-f_ref)(1+1) + (1-f_ref)(1+1)] = 1/4[(1-f_ref)(4)] = (1-f_ref)
    
    # 0,1 - v1_val=0 and v2_val=1:
    #    1/4[(1-f_ref)(1+0) + (1-f_ref)(1+0)] = 1/4[(1-f_ref)(2)] = 1/2(1-f_ref)
    
    # 0,2 - v1_val=0 and v2_val=2:
    #    1/4[(1-f_ref)(0+0) + (1-f_ref)(0+0)] = 0
    
    # 1,1 - v1_val=1 and v2_val=2:
    #    1/4[(1-f_non_ref)(1+0) + (1-f_ref)(1+0)] = 1/4[(1-f_non_ref)+(1-f_ref)]
    
    # 1,2 - v1_val=1 and v2_val=2:
    #    1/4[(1-f_non_ref)(1+1) + (1-f_ref)(0+0)] = 1/4[(1-f_non_ref)(2)] = 1/2(1-f_non_ref)
    
    # 2,2 - v1_val=2 and v2_val=2:
    #    1/4[(1-f_non_ref)(1+1) + (1-f_non_ref)(1+1)] = 1/4[(1-f_non_ref)(4)] = (1-f_non_ref)
    
    # Also, note that it is symetric:

    # 1,0 - v1_val=1 and v2_val=0:
    #    1/4[(1-f_ref)(1+1) + (1-f_non_ref)(0+0)] = 1/4[(1-f_ref)(2)] = 1/2(1-f_ref)
    
    # 2,1 - v1_val=2 and v2_val=1:
    #    1/4[(1-f_non_ref)(1+0) + (1-f_non_ref)(0+1)] = 1/4[(1-f_non_ref)(2)] = 1/2(1-f_non_ref)

    # 2,0 - v1_val=2 and v2_val=0:
    #    1/4[(1-f_non_ref)(0+0) + (1-f_non_ref)(0+0)] = 0
    
    # formula to use: 
    #   (1-f_non_ref)(v1_val*v2_val) + (1-f_ref)((2-v1_val)*(2-v2_val))/4
    return (1-non_ref_freq)*(i1_val*i2_val) + (1-ref_freq)*((2-i1_val)*(2-i2_val))/4
    

In [13]:
def _check_guardrails(num_individuals, num_valid_genotypes, ref_count, non_ref_count, min_valid_sites_precentage, min_minor_freq_expected, max_minor_freq_expected):
    """Check one site against the data-quality guardrails and log every violation.

    Guardrail #1: the fraction of valid (not -1) genotypes must be at least
    ``min_valid_sites_precentage``.
    Guardrail #2: the minor allele frequency must lie within
    [``min_minor_freq_expected``, ``max_minor_freq_expected``].

    Args:
        num_individuals: total number of genotypes at the site.
        num_valid_genotypes: number of genotypes that are not -1.
        ref_count, non_ref_count: allele counts over the valid genotypes.

    Returns:
        True when every guardrail holds, False otherwise. Violations are only
        logged, never raised.
    """
    _log(f'Check guardrails')
    passed = True
    # guardrail #1 - min_valid_sites_precentage
    # (no individuals at all counts as 0% valid instead of dividing by zero)
    percentage_valid_sites = float(num_valid_genotypes)/num_individuals if num_individuals else 0.0
    _log(f'Precentage of valid sites: {percentage_valid_sites}')
    if percentage_valid_sites < min_valid_sites_precentage:
        _log(f'ERROR: % of valid sites is {percentage_valid_sites}, lower than allowd: {min_valid_sites_precentage}.', logging.ERROR)
        passed = False
    # guardrail #2 - min_minor_freq_expected and max_minor_freq_expected
    if num_valid_genotypes == 0:
        # the frequency is undefined without valid genotypes
        _log('ERROR: no valid genotypes, minor allele frequency is undefined.', logging.ERROR)
        passed = False
    else:
        minor_count = min(ref_count, non_ref_count)
        minor_freq = float(minor_count)/(2*num_valid_genotypes)
        _log(f'Minor allele frequency: {minor_freq}')
        if minor_freq < min_minor_freq_expected:
            _log(f'ERROR: minor frequency is too low - {minor_freq}, allowd: {min_minor_freq_expected}.', logging.ERROR)
            passed = False
        if minor_freq > max_minor_freq_expected:
            _log(f'ERROR: minor frequency is too high - {minor_freq}, allowd: {max_minor_freq_expected}.', logging.ERROR)
            passed = False
    # "Passed guardrails" used to be logged unconditionally, even right after an ERROR line.
    if passed:
        _log(f'Passed guardrails')
    else:
        _log('Failed guardrails', logging.ERROR)
    return passed
    

In [14]:
def _write_pairwise_distances(output_dist_file, slice_pairwise_counts, slice_pairwise_dist, num_of_sites):
    #TODO
    print('todo')
    
def _write_output_sites_file(output_sites_file, slice_df):
    # TODO - output per site the calculated number of missing values and MAF
    sites_ids = slice_df.columns.values
    with open(output_sites_file, 'w') as f:
        for site_id in sites_ids:
            f.write(site_id.replace('_','\t') + '\n')


# Func begin

In [15]:
def TODO_NAME(path_to_012_files, base_output_path, desired_num_of_sites_per_output, min_minor_freq_expected, max_minor_freq_expected, min_valid_sites_precentage=0.1):
    """Split a 012 genotype matrix into slices of sites and process each slice.

    Inputs are ``{path_to_012_files}.012`` (one row per individual, first column
    is the individual id) and ``{path_to_012_files}.012.pos`` (one line per site).

    For each slice two files are produced under ``{base_output_path}/{chr_name}/``:

    1. The distances file (OUTPUT_PATTERN_DIST_FILE), written by
       ``_write_pairwise_distances``. Intended format: tab separated pairwise
       distances - the sum over the valid sites used. Each entry is ``<sum>`` when
       all sites of the slice were valid for both individuals, otherwise
       ``<sum>,<number of valid sites>``. In most cases all entries are valid, so
       the count is stored only when it is needed. Dummy example for 3
       individuals over 5 sites, where individual 3 has 2 invalid entries
       (-1 -1 1 0 2)::

           0.9 0.7,3
           0.8,3

       The ``,3`` says only 3 sites were valid for the pairs (1, 3) and (2, 3).
    2. The sites file (OUTPUT_PATTERN_SITES_FILE), written by
       ``_write_output_sites_file``: one line per site of the slice.

    Args:
        path_to_012_files: common path prefix of the two input files.
        base_output_path: folder under which the per-chromosome folder is created.
        desired_num_of_sites_per_output: number of sites per slice (the last slice
            may be smaller).
        min_minor_freq_expected, max_minor_freq_expected: expected minor allele
            frequency range, forwarded to the guardrails.
        min_valid_sites_precentage: minimal fraction of valid genotypes per site,
            forwarded to the guardrails. It used to be read from a notebook-global
            variable; the default matches the value configured there.

    Raises:
        ValueError: if ``desired_num_of_sites_per_output`` is smaller than 1.
    """
    if desired_num_of_sites_per_output < 1:
        raise ValueError(f'desired_num_of_sites_per_output must be at least 1, got {desired_num_of_sites_per_output}')

    # Validate inputs
    path_012 = f'{path_to_012_files}{SUFFIX_012}'
    path_pos = f'{path_to_012_files}{SUFFIX_POS}'
    chr_name = _get_chr_name(path_pos)

    _set_logger(chr_name, min_minor_freq_expected)

    _validate_file_exist(path_012)
    _validate_file_exist(path_pos)

    # prepare output folder
    _log(f'chr_name is {chr_name}')
    output_dir = _prepare_output_folder(base_output_path, chr_name)

    # load the data
    # get sites positions to use as column names
    with open(path_pos, 'r') as f:
        sites_pos = [l.replace('\t','_').replace('\n','') for l in f.readlines()]
    all_df = pd.read_csv(path_012, sep='\t', names = ['ind_id'] + sites_pos)
    # drop first column - individual ids
    all_df = all_df.drop(columns=['ind_id'])
    num_sites = len(all_df.columns)
    # ceil, not floor+1: when num_sites is an exact multiple of the slice size the
    # old computation produced one extra, empty slice (and empty output files).
    total_slices = math.ceil(num_sites / desired_num_of_sites_per_output)
    # kept as "index of the last slice" so the progress messages read e.g. 'slice 0/1'
    num_of_slices = total_slices - 1
    size_of_last_slice = num_sites - num_of_slices*desired_num_of_sites_per_output if total_slices else 0
    _log(f'Total num of sites: {num_sites}')
    _log(f'Desired size of slice: {desired_num_of_sites_per_output}')
    _log(f'Number of slices: {total_slices}')
    _log(f'Size of last slice: {size_of_last_slice}')

    for slice_index in range(total_slices):
        _log(f'{chr_name}: slice {slice_index}/{num_of_slices}')

        min_col_index = slice_index*desired_num_of_sites_per_output
        # in case we have less than desired_num_of_sites_per_output, pandas will only take those we have!
        max_col_index = min_col_index + desired_num_of_sites_per_output
        slice_df = all_df.iloc[:,min_col_index:max_col_index]
        num_of_sites = len(slice_df.columns)
        _log(f'\n#############\n')
        _log(f'Slice index: {slice_index}. Slice size: {num_of_sites} ')
        output_dist_file = output_dir + OUTPUT_PATTERN_DIST_FILE.format(slice_index=slice_index, num_of_sites=num_of_sites)
        output_sites_file = output_dir + OUTPUT_PATTERN_SITES_FILE.format(slice_index=slice_index)
        # todo - maybe write to this file also the number of -1, 0, 1 and 2?
        _write_output_sites_file(output_sites_file, slice_df)
        _log(f'Output files: {output_dist_file} {output_sites_file}')
        slice_pairwise_counts, slice_pairwise_dist = _slice_calc_pairwise_distances_with_guardrails(slice_index, num_of_slices, slice_df, min_valid_sites_precentage, min_minor_freq_expected, max_minor_freq_expected)
        _write_pairwise_distances(output_dist_file, slice_pairwise_counts, slice_pairwise_dist, num_of_sites)

In [16]:
# Run the processing on the configured input/output paths.
# Slice size and the expected MAF range are given here as literals.
TODO_NAME(
    path_to_012_files=path_to_012_files,
    base_output_path=base_output_path,
    desired_num_of_sites_per_output=100,
    min_minor_freq_expected=0.49,
    max_minor_freq_expected=0.5,
)

chr_name is chr9
output_dir is C:\Data\HUJI\hgdp\mock/chr9/
Total num of sites: 178
Desired size of slice: 100
Number of slices: 2
Size of last slice: 78
chr9: slice 0/1

#############

Slice index: 0. Slice size: 100 
Output files: C:\Data\HUJI\hgdp\mock/chr9/dist_slice_0_100_sites.tsv C:\Data\HUJI\hgdp\mock/chr9/sites_ids_slice_0.tsv
Site index: 0, non ref allele frequency: 0.5
Site index: 0, ref allele frequency: 0.5
Check guardrails
Precentage of valid sites: 0.4951560818083961
Minor allele frequency: 0.5
Passed guardrails
Slice: 0/1. Site: 0. Done with individual 0/929
Slice: 0/1. Site: 0. Done with individual 100/929
Slice: 0/1. Site: 0. Done with individual 200/929
Slice: 0/1. Site: 0. Done with individual 300/929
Slice: 0/1. Site: 0. Done with individual 400/929
Slice: 0/1. Site: 0. Done with individual 500/929
Slice: 0/1. Site: 0. Done with individual 600/929
Slice: 0/1. Site: 0. Done with individual 700/929
Slice: 0/1. Site: 0. Done with individual 800/929
Slice: 0/1. Site: 0

Slice: 0/1. Site: 11. Done with individual 200/929
Slice: 0/1. Site: 11. Done with individual 300/929
Slice: 0/1. Site: 11. Done with individual 400/929
Slice: 0/1. Site: 11. Done with individual 500/929
Slice: 0/1. Site: 11. Done with individual 600/929
Slice: 0/1. Site: 11. Done with individual 700/929
Slice: 0/1. Site: 11. Done with individual 800/929
Slice: 0/1. Site: 11. Done with individual 900/929
Site index: 12, non ref allele frequency: 0.4919786096256685
Site index: 12, ref allele frequency: 0.5080213903743316
Check guardrails
Precentage of valid sites: 0.20129171151776104
Minor allele frequency: 0.4919786096256685
Passed guardrails
Slice: 0/1. Site: 12. Done with individual 0/929
Slice: 0/1. Site: 12. Done with individual 100/929
Slice: 0/1. Site: 12. Done with individual 200/929
Slice: 0/1. Site: 12. Done with individual 300/929
Slice: 0/1. Site: 12. Done with individual 400/929
Slice: 0/1. Site: 12. Done with individual 500/929
Slice: 0/1. Site: 12. Done with individual 60

Site index: 23, non ref allele frequency: 0.49099099099099097
Site index: 23, ref allele frequency: 0.509009009009009
Check guardrails
Precentage of valid sites: 0.11948331539289558
Minor allele frequency: 0.49099099099099097
Passed guardrails
Slice: 0/1. Site: 23. Done with individual 0/929
Slice: 0/1. Site: 23. Done with individual 100/929
Slice: 0/1. Site: 23. Done with individual 200/929
Slice: 0/1. Site: 23. Done with individual 300/929
Slice: 0/1. Site: 23. Done with individual 400/929
Slice: 0/1. Site: 23. Done with individual 500/929
Slice: 0/1. Site: 23. Done with individual 600/929
Slice: 0/1. Site: 23. Done with individual 700/929
Slice: 0/1. Site: 23. Done with individual 800/929
Slice: 0/1. Site: 23. Done with individual 900/929
Site index: 24, non ref allele frequency: 0.4976190476190476
Site index: 24, ref allele frequency: 0.5023809523809524
Check guardrails
Precentage of valid sites: 0.22604951560818085
Minor allele frequency: 0.4976190476190476
Passed guardrails
Slice

Slice: 0/1. Site: 34. Done with individual 300/929
Slice: 0/1. Site: 34. Done with individual 400/929
Slice: 0/1. Site: 34. Done with individual 500/929
Slice: 0/1. Site: 34. Done with individual 600/929
Slice: 0/1. Site: 34. Done with individual 700/929
Slice: 0/1. Site: 34. Done with individual 800/929
Slice: 0/1. Site: 34. Done with individual 900/929
Site index: 35, non ref allele frequency: 0.49012158054711247
Site index: 35, ref allele frequency: 0.5098784194528876
Check guardrails
Precentage of valid sites: 0.7082884822389667
Minor allele frequency: 0.49012158054711247
Passed guardrails
Slice: 0/1. Site: 35. Done with individual 0/929
Slice: 0/1. Site: 35. Done with individual 100/929
Slice: 0/1. Site: 35. Done with individual 200/929
Slice: 0/1. Site: 35. Done with individual 300/929
Slice: 0/1. Site: 35. Done with individual 400/929
Slice: 0/1. Site: 35. Done with individual 500/929
Slice: 0/1. Site: 35. Done with individual 600/929
Slice: 0/1. Site: 35. Done with individual 7

Slice: 0/1. Site: 45. Done with individual 700/929
Slice: 0/1. Site: 45. Done with individual 800/929
Slice: 0/1. Site: 45. Done with individual 900/929
Site index: 46, non ref allele frequency: 0.4929742388758782
Site index: 46, ref allele frequency: 0.5070257611241218
Check guardrails
Precentage of valid sites: 0.4596340150699677
Minor allele frequency: 0.4929742388758782
Passed guardrails
Slice: 0/1. Site: 46. Done with individual 0/929
Slice: 0/1. Site: 46. Done with individual 100/929
Slice: 0/1. Site: 46. Done with individual 200/929
Slice: 0/1. Site: 46. Done with individual 300/929
Slice: 0/1. Site: 46. Done with individual 400/929
Slice: 0/1. Site: 46. Done with individual 500/929
Slice: 0/1. Site: 46. Done with individual 600/929
Slice: 0/1. Site: 46. Done with individual 700/929
Slice: 0/1. Site: 46. Done with individual 800/929
Slice: 0/1. Site: 46. Done with individual 900/929
Site index: 47, non ref allele frequency: 0.4991103202846975
Site index: 47, ref allele frequency

Slice: 0/1. Site: 57. Done with individual 200/929
Slice: 0/1. Site: 57. Done with individual 300/929
Slice: 0/1. Site: 57. Done with individual 400/929
Slice: 0/1. Site: 57. Done with individual 500/929
Slice: 0/1. Site: 57. Done with individual 600/929
Slice: 0/1. Site: 57. Done with individual 700/929
Slice: 0/1. Site: 57. Done with individual 800/929
Slice: 0/1. Site: 57. Done with individual 900/929
Site index: 58, non ref allele frequency: 0.502127659574468
Site index: 58, ref allele frequency: 0.4978723404255319
Check guardrails
Precentage of valid sites: 0.5059203444564048
Minor allele frequency: 0.4978723404255319
Passed guardrails
Slice: 0/1. Site: 58. Done with individual 0/929
Slice: 0/1. Site: 58. Done with individual 100/929
Slice: 0/1. Site: 58. Done with individual 200/929
Slice: 0/1. Site: 58. Done with individual 300/929
Slice: 0/1. Site: 58. Done with individual 400/929
Slice: 0/1. Site: 58. Done with individual 500/929
Slice: 0/1. Site: 58. Done with individual 600/

Slice: 0/1. Site: 68. Done with individual 400/929
Slice: 0/1. Site: 68. Done with individual 500/929
Slice: 0/1. Site: 68. Done with individual 600/929
Slice: 0/1. Site: 68. Done with individual 700/929
Slice: 0/1. Site: 68. Done with individual 800/929
Slice: 0/1. Site: 68. Done with individual 900/929
Site index: 69, non ref allele frequency: 0.49377593360995853
Site index: 69, ref allele frequency: 0.5062240663900415
Check guardrails
Precentage of valid sites: 0.25941872981700753
Minor allele frequency: 0.49377593360995853
Passed guardrails
Slice: 0/1. Site: 69. Done with individual 0/929
Slice: 0/1. Site: 69. Done with individual 100/929
Slice: 0/1. Site: 69. Done with individual 200/929
Slice: 0/1. Site: 69. Done with individual 300/929
Slice: 0/1. Site: 69. Done with individual 400/929
Slice: 0/1. Site: 69. Done with individual 500/929
Slice: 0/1. Site: 69. Done with individual 600/929
Slice: 0/1. Site: 69. Done with individual 700/929
Slice: 0/1. Site: 69. Done with individual 

Slice: 0/1. Site: 80. Done with individual 100/929
Slice: 0/1. Site: 80. Done with individual 200/929
Slice: 0/1. Site: 80. Done with individual 300/929
Slice: 0/1. Site: 80. Done with individual 400/929
Slice: 0/1. Site: 80. Done with individual 500/929
Slice: 0/1. Site: 80. Done with individual 600/929
Slice: 0/1. Site: 80. Done with individual 700/929
Slice: 0/1. Site: 80. Done with individual 800/929
Slice: 0/1. Site: 80. Done with individual 900/929
Site index: 81, non ref allele frequency: 0.5043668122270742
Site index: 81, ref allele frequency: 0.49563318777292575
Check guardrails
Precentage of valid sites: 0.2465016146393972
Minor allele frequency: 0.49563318777292575
Passed guardrails
Slice: 0/1. Site: 81. Done with individual 0/929
Slice: 0/1. Site: 81. Done with individual 100/929
Slice: 0/1. Site: 81. Done with individual 200/929
Slice: 0/1. Site: 81. Done with individual 300/929
Slice: 0/1. Site: 81. Done with individual 400/929
Slice: 0/1. Site: 81. Done with individual 5

Slice: 0/1. Site: 92. Done with individual 100/929
Slice: 0/1. Site: 92. Done with individual 200/929
Slice: 0/1. Site: 92. Done with individual 300/929
Slice: 0/1. Site: 92. Done with individual 400/929
Slice: 0/1. Site: 92. Done with individual 500/929
Slice: 0/1. Site: 92. Done with individual 600/929
Slice: 0/1. Site: 92. Done with individual 700/929
Slice: 0/1. Site: 92. Done with individual 800/929
Slice: 0/1. Site: 92. Done with individual 900/929
Site index: 93, non ref allele frequency: 0.5041551246537396
Site index: 93, ref allele frequency: 0.49584487534626037
Check guardrails
Precentage of valid sites: 0.38858988159311086
Minor allele frequency: 0.49584487534626037
Passed guardrails
Slice: 0/1. Site: 93. Done with individual 0/929
Slice: 0/1. Site: 93. Done with individual 100/929
Slice: 0/1. Site: 93. Done with individual 200/929
Slice: 0/1. Site: 93. Done with individual 300/929
Slice: 0/1. Site: 93. Done with individual 400/929
Slice: 0/1. Site: 93. Done with individual 

Slice: 1/1. Site: 3. Done with individual 200/929
Slice: 1/1. Site: 3. Done with individual 300/929
Slice: 1/1. Site: 3. Done with individual 400/929
Slice: 1/1. Site: 3. Done with individual 500/929
Slice: 1/1. Site: 3. Done with individual 600/929
Slice: 1/1. Site: 3. Done with individual 700/929
Slice: 1/1. Site: 3. Done with individual 800/929
Slice: 1/1. Site: 3. Done with individual 900/929
Site index: 4, non ref allele frequency: 0.5094850948509485
Site index: 4, ref allele frequency: 0.4905149051490515
Check guardrails
Precentage of valid sites: 0.39720129171151775
Minor allele frequency: 0.4905149051490515
Passed guardrails
Slice: 1/1. Site: 4. Done with individual 0/929
Slice: 1/1. Site: 4. Done with individual 100/929
Slice: 1/1. Site: 4. Done with individual 200/929
Slice: 1/1. Site: 4. Done with individual 300/929
Slice: 1/1. Site: 4. Done with individual 400/929
Slice: 1/1. Site: 4. Done with individual 500/929
Slice: 1/1. Site: 4. Done with individual 600/929
Slice: 1/1.

Slice: 1/1. Site: 14. Done with individual 400/929
Slice: 1/1. Site: 14. Done with individual 500/929
Slice: 1/1. Site: 14. Done with individual 600/929
Slice: 1/1. Site: 14. Done with individual 700/929
Slice: 1/1. Site: 14. Done with individual 800/929
Slice: 1/1. Site: 14. Done with individual 900/929
Site index: 15, non ref allele frequency: 0.5078125
Site index: 15, ref allele frequency: 0.4921875
Check guardrails
Precentage of valid sites: 0.6889128094725512
Minor allele frequency: 0.4921875
Passed guardrails
Slice: 1/1. Site: 15. Done with individual 0/929
Slice: 1/1. Site: 15. Done with individual 100/929
Slice: 1/1. Site: 15. Done with individual 200/929
Slice: 1/1. Site: 15. Done with individual 300/929
Slice: 1/1. Site: 15. Done with individual 400/929
Slice: 1/1. Site: 15. Done with individual 500/929
Slice: 1/1. Site: 15. Done with individual 600/929
Slice: 1/1. Site: 15. Done with individual 700/929
Slice: 1/1. Site: 15. Done with individual 800/929
Slice: 1/1. Site: 15. 

Slice: 1/1. Site: 25. Done with individual 600/929
Slice: 1/1. Site: 25. Done with individual 700/929
Slice: 1/1. Site: 25. Done with individual 800/929
Slice: 1/1. Site: 25. Done with individual 900/929
Site index: 26, non ref allele frequency: 0.49683544303797467
Site index: 26, ref allele frequency: 0.5031645569620253
Check guardrails
Precentage of valid sites: 0.17007534983853606
Minor allele frequency: 0.49683544303797467
Passed guardrails
Slice: 1/1. Site: 26. Done with individual 0/929
Slice: 1/1. Site: 26. Done with individual 100/929
Slice: 1/1. Site: 26. Done with individual 200/929
Slice: 1/1. Site: 26. Done with individual 300/929
Slice: 1/1. Site: 26. Done with individual 400/929
Slice: 1/1. Site: 26. Done with individual 500/929
Slice: 1/1. Site: 26. Done with individual 600/929
Slice: 1/1. Site: 26. Done with individual 700/929
Slice: 1/1. Site: 26. Done with individual 800/929
Slice: 1/1. Site: 26. Done with individual 900/929
Site index: 27, non ref allele frequency: 0

Slice: 1/1. Site: 38. Done with individual 100/929
Slice: 1/1. Site: 38. Done with individual 200/929
Slice: 1/1. Site: 38. Done with individual 300/929
Slice: 1/1. Site: 38. Done with individual 400/929
Slice: 1/1. Site: 38. Done with individual 500/929
Slice: 1/1. Site: 38. Done with individual 600/929
Slice: 1/1. Site: 38. Done with individual 700/929
Slice: 1/1. Site: 38. Done with individual 800/929
Slice: 1/1. Site: 38. Done with individual 900/929
Site index: 39, non ref allele frequency: 0.5059523809523809
Site index: 39, ref allele frequency: 0.49404761904761907
Check guardrails
Precentage of valid sites: 0.36167922497308935
Minor allele frequency: 0.49404761904761907
Passed guardrails
Slice: 1/1. Site: 39. Done with individual 0/929
Slice: 1/1. Site: 39. Done with individual 100/929
Slice: 1/1. Site: 39. Done with individual 200/929
Slice: 1/1. Site: 39. Done with individual 300/929
Slice: 1/1. Site: 39. Done with individual 400/929
Slice: 1/1. Site: 39. Done with individual 

Slice: 1/1. Site: 49. Done with individual 300/929
Slice: 1/1. Site: 49. Done with individual 400/929
Slice: 1/1. Site: 49. Done with individual 500/929
Slice: 1/1. Site: 49. Done with individual 600/929
Slice: 1/1. Site: 49. Done with individual 700/929
Slice: 1/1. Site: 49. Done with individual 800/929
Slice: 1/1. Site: 49. Done with individual 900/929
Site index: 50, non ref allele frequency: 0.49700598802395207
Site index: 50, ref allele frequency: 0.5029940119760479
Check guardrails
Precentage of valid sites: 0.7190527448869752
Minor allele frequency: 0.49700598802395207
Passed guardrails
Slice: 1/1. Site: 50. Done with individual 0/929
Slice: 1/1. Site: 50. Done with individual 100/929
Slice: 1/1. Site: 50. Done with individual 200/929
Slice: 1/1. Site: 50. Done with individual 300/929
Slice: 1/1. Site: 50. Done with individual 400/929
Slice: 1/1. Site: 50. Done with individual 500/929
Slice: 1/1. Site: 50. Done with individual 600/929
Slice: 1/1. Site: 50. Done with individual 7

Slice: 1/1. Site: 60. Done with individual 200/929
Slice: 1/1. Site: 60. Done with individual 300/929
Slice: 1/1. Site: 60. Done with individual 400/929
Slice: 1/1. Site: 60. Done with individual 500/929
Slice: 1/1. Site: 60. Done with individual 600/929
Slice: 1/1. Site: 60. Done with individual 700/929
Slice: 1/1. Site: 60. Done with individual 800/929
Slice: 1/1. Site: 60. Done with individual 900/929
Site index: 61, non ref allele frequency: 0.49117647058823527
Site index: 61, ref allele frequency: 0.5088235294117647
Check guardrails
Precentage of valid sites: 0.7319698600645855
Minor allele frequency: 0.49117647058823527
Passed guardrails
Slice: 1/1. Site: 61. Done with individual 0/929
Slice: 1/1. Site: 61. Done with individual 100/929
Slice: 1/1. Site: 61. Done with individual 200/929
Slice: 1/1. Site: 61. Done with individual 300/929
Slice: 1/1. Site: 61. Done with individual 400/929
Slice: 1/1. Site: 61. Done with individual 500/929
Slice: 1/1. Site: 61. Done with individual 6

Slice: 1/1. Site: 71. Done with individual 300/929
Slice: 1/1. Site: 71. Done with individual 400/929
Slice: 1/1. Site: 71. Done with individual 500/929
Slice: 1/1. Site: 71. Done with individual 600/929
Slice: 1/1. Site: 71. Done with individual 700/929
Slice: 1/1. Site: 71. Done with individual 800/929
Slice: 1/1. Site: 71. Done with individual 900/929
Site index: 72, non ref allele frequency: 0.491869918699187
Site index: 72, ref allele frequency: 0.508130081300813
Check guardrails
Precentage of valid sites: 0.5296017222820237
Minor allele frequency: 0.491869918699187
Passed guardrails
Slice: 1/1. Site: 72. Done with individual 0/929
Slice: 1/1. Site: 72. Done with individual 100/929
Slice: 1/1. Site: 72. Done with individual 200/929
Slice: 1/1. Site: 72. Done with individual 300/929
Slice: 1/1. Site: 72. Done with individual 400/929
Slice: 1/1. Site: 72. Done with individual 500/929
Slice: 1/1. Site: 72. Done with individual 600/929
Slice: 1/1. Site: 72. Done with individual 700/92

In [19]:
# Scratch check: show the argument list obtained by whitespace-splitting a vcftools command line.
s = 'vcftools --gzvcf vcf_file --out output_folder --max-alleles 2 --min-alleles 2 --freq2'
print (s.split())

['vcftools', '--gzvcf', 'vcf_file', '--out', 'output_folder', '--max-alleles', '2', '--min-alleles', '2', '--freq2']


In [23]:
# Scratch check: print the labels 'chr0' .. 'chr20' (range(21) starts at 0 and stops at 20).
# NOTE(review): if these are meant to be chromosome names, confirm the intended range.
for i in range(21):
    print('chr'+str(i))

chr0
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
