# Subsetting Data

The aim of this notebook is to subset data, using various filters, for analysis.

In this specific example, I will be subsetting data from chromosome 3L to produce a pool of two populations, BFGam and BFCol, from which I will generate two seperate sets for 0 and 4 fold sites.

## Load modules and data

First, we will run the universal setup notebook to ensure all necessary modules are loaded.

In [1]:
run setup.ipynb

Next, we will load in our variant callset, and set our chromosome to 3L.

In [2]:
callset = zarr.open_group("../phase2.AR1/variation/main/zarr2/ag1000g.phase2.ar1/")
chrom = "3L"
allele_counts = phase2_ar1.allele_counts
acs = phase2_ar1.accessibility[chrom]['is_accessible'][:]
pos = phase2_ar1.callset[chrom]['variants']['POS'][:]
alt = phase2_ar1.callset[chrom]['variants']['ALT'][:]


callset[chrom]['calldata'].tree()

Lets try making some allele counts using genotype data - if i can remember!

In [3]:
genotype_all = allel.GenotypeDaskArray(callset["3L/calldata/genotype"])
genotype_all

Unnamed: 0,0,1,2,3,4,...,1137,1138,1139,1140,1141,Unnamed: 12
0,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
1,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
2,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
...,...,...,...,...,...,...,...,...,...,...,...,...
18167053,./.,0/0,./.,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
18167054,./.,0/0,./.,./.,./.,...,0/0,./.,./.,0/0,./.,
18167055,./.,./.,./.,./.,./.,...,./.,./.,./.,0/0,./.,


In [None]:
ac = genotype_all.count_alleles()
ac

Lets build some masks for the two populations we're studying, BFcol and BFgam. First, we need to import the sample metadata. We only need to ox_code, which acts as an index, and the population columns.

In [10]:
df_samples = pandas.read_csv('../phase2.AR1/samples/samples.meta.txt', sep='\t', usecols=['ox_code', 'population'])
df_samples.head()

Unnamed: 0,ox_code,population
0,AA0040-C,GHcol
1,AA0041-C,GHcol
2,AA0042-C,GHcol
3,AA0043-C,GHcol
4,AA0044-C,GHcol


In [None]:
Lets

In [13]:
samples = phase2_ar1.sample_ids
samples

['AA0040-C',
 'AA0041-C',
 'AA0042-C',
 'AA0043-C',
 'AA0044-C',
 'AA0048-C',
 'AA0049-C',
 'AA0050-C',
 'AA0051-C',
 'AA0052-C',
 'AA0053-C',
 'AA0054-C',
 'AA0055-C',
 'AA0056-C',
 'AA0060-C',
 'AA0061-C',
 'AA0063-C',
 'AA0064-C',
 'AA0066-C',
 'AA0067-C',
 'AA0068-C',
 'AA0072-C',
 'AA0073-C',
 'AA0074-C',
 'AA0075-C',
 'AA0076-C',
 'AA0077-C',
 'AA0080-C',
 'AA0084-C',
 'AA0085-C',
 'AA0086-C',
 'AA0087-C',
 'AA0088-C',
 'AA0089-C',
 'AA0090-C',
 'AA0091-C',
 'AA0096-C',
 'AA0097-C',
 'AA0098-C',
 'AA0099-C',
 'AA0100-C',
 'AA0101-C',
 'AA0102-C',
 'AA0103-C',
 'AA0104-C',
 'AA0107-C',
 'AA0108-C',
 'AA0109-C',
 'AA0110-C',
 'AA0111-C',
 'AA0113-C',
 'AA0114-C',
 'AA0115-C',
 'AA0116-C',
 'AA0122-C',
 'AA0123-C',
 'AA0124-C',
 'AA0125-C',
 'AA0127-C',
 'AA0132-C',
 'AA0133-C',
 'AA0134-C',
 'AA0135-C',
 'AA0136-C',
 'AA0139-C',
 'AA0140-C',
 'AA0141-C',
 'AB0085-C',
 'AB0087-C',
 'AB0088-C',
 'AB0089-C',
 'AB0090-C',
 'AB0091-C',
 'AB0092-C',
 'AB0094-C',
 'AB0095-C',
 'AB0097-C',

In [14]:
np.all(samples == df_samples['ox_code'].values)

True

In [15]:
samples_list = list(samples)
samples_callset_index = [samples_list.index(s) for s in df_samples['ox_code']]
df_samples['callset_index'] = samples_callset_index
df_samples.head()

Unnamed: 0,ox_code,population,callset_index
0,AA0040-C,GHcol,0
1,AA0041-C,GHcol,1
2,AA0042-C,GHcol,2
3,AA0043-C,GHcol,3
4,AA0044-C,GHcol,4


In [16]:
pops = ['BFcol', 'BFgam']
for x in pops:
    if x is 'BFcol':
        loc_samples_col = df_samples[df_samples.population == x].callset_index.values
    else: 
        loc_samples_gam = df_samples[df_samples.population == x].callset_index.values
        
loc_samples_unsorted = np.concatenate([loc_samples_col, loc_samples_gam])
loc_samples_BF = np.sort(loc_samples_unsorted)

In [17]:
loc_samples_BF

array([ 67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
       158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
       171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
       184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
       210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
       223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233])

In [18]:
len(loc_samples_BF)

167

In [None]:
gt_BF = genotype_all.compress(loc_samples_BF).compute()
gt_BF

Now, let choose our subset.

In [12]:
BF_samples = df_samples[df_samples.population == {'BFcol', 'BFgam'}].callset_index.values
BF_samples

AttributeError: 'DataFrame' object has no attribute 'callset_index'

In [None]:
def new_filter_tot():
    #biallelic for all populations
    BFg_bia = (BFg_ac[:,2]<=0)
    BFc_bia = (BFc_ac[:,2]<=0)
    #KE_bia = (KE_ac[:,2]<=0)
    bia_tot = BFg_bia & BFc_bia #& KE_bia
    
    #All individuals present
    BFg_full = (BFg_b[:,0]==t4)
    BFc_full = (BFc_b[:,0]==t5)
    KE_full = (KE_b[:,0]==t14)
    full_tot = BFg_full & BFc_full #& KE_full
                
    #Valid Outgroup Sites
    #sum_a = np.sum(a_ac, axis=1)>= 1
    #filter_a = sum_a & np.invert((a_ac[:,0]>0) & (a_alt[:,0]=='N'))
    #sum_q = np.sum(q_ac, axis=1)>= 1
    #filter_q = sum_q & np.invert((q_ac[:,0]>0) & (q_alt[:,0] =='N'))
    #sum_mer = np.sum(mer_ac, axis=1)>= 1
    #filter_mer = sum_mer & np.invert((mer_ac[:,0]>0) & (mer_alt[:,0] =='N'))                           
    #sum_mel = np.sum(mel_ac, axis=1)>= 1
    #filter_mel = sum_mel & np.invert((mel_ac[:,0]>0) & (mel_alt[:,0] =='N'))
    #filter_c = (c_alt[:,0]!='N')
    #filter_e = (e_alt[:,0]!='N')
    #outgroup_full = filter_a & filter_q & filter_mer & filter_mel & filter_c & filter_e
    
    #Accessibility 
    acs = phase2_ar1.accessibility[seqid]['is_accessible'][:]
    
    #Total Filter
    filter_new_tot = acs & bia_tot & full_tot & outgroup_full
    
    #codon degeneracy - 0-fold and 4-fold filters
    fourfold = codon_degeneracy==4
    zerofold = codon_degeneracy==1
    filter_new_0 = filter_new_tot & zerofold 
    filter_new_4 = filter_new_tot & fourfold
    
    return filter_new_tot, filter_new_0, filter_new_4

new_filter_tot, new_filter_0, new_filter_4 = new_filter_tot()

In [9]:
doubletons = ac.is_doubleton
doubletons

<bound method AlleleCountsDaskArray.is_doubleton of <AlleleCountsDaskArray shape=(18167056, 4) dtype=int32>>