# Subsetting Data

The aim of this notebook is to subset data, using various filters, for analysis.

In this specific example, I will be subsetting data from chromosome 3L to produce a pool of two populations, BFgam and BFcol, from which I will generate two separate sets for 0-fold and 4-fold sites.

## Load modules and data

First, we will run the universal setup notebook to ensure all necessary modules are loaded.

In [1]:
run setup.ipynb

Next, we will load in our variant callset, and set our chromosome to 3L.

In [2]:
# Open the variant callset as a zarr group and choose the chromosome to work on.
callset = zarr.open_group("../phase2.AR1/variation/main/zarr2/ag1000g.phase2.ar1/")
chrom = "3L"
# Allele counts, accessibility mask, variant positions and ALT alleles are all read
# through `phase2_ar1`, which is not defined in this notebook (presumably it comes
# from setup.ipynb — confirm).
# NOTE(review): `callset` above and `phase2_ar1.callset` are two separate handles;
# presumably they point at the same data — confirm.
allele_counts = phase2_ar1.allele_counts
acs = phase2_ar1.accessibility[chrom]['is_accessible'][:]
pos = phase2_ar1.callset[chrom]['variants']['POS'][:]
alt = phase2_ar1.callset[chrom]['variants']['ALT'][:]


# Display the layout of the per-call data stored for this chromosome.
callset[chrom]['calldata'].tree()

# Alternative kept for reference: open the calldata group directly by path.
#gt = zarr.open_group('../phase2.AR1/variation/main/zarr2/ag1000g.phase2.ar1/3L/calldata/')
#gt.tree()

In [10]:
# Peek at the precomputed allele counts for BFgam on the selected chromosome.
# Indexing through `chrom` (instead of a hard-coded '3L/...' path) keeps this cell
# in step with the chromosome chosen when the data was loaded.
x = allele_counts[chrom]['BFgam'][:]
x

array([[  0,   2,   0,   0],
       [  0,   2,   0,   0],
       [  0,   2,   0,   0],
       ...,
       [136,   0,   0,   0],
       [ 46,   0,   0,   0],
       [ 18,   0,   0,   0]], dtype=uint32)

Let's try making some allele counts using genotype data - if I can remember!

In [32]:
# Wrap the callset's genotype calls for this chromosome in allel.GenotypeChunkedArray;
# the bare name on the last line displays a preview of the array.
genotype_all = allel.GenotypeChunkedArray(callset[chrom]['calldata']['genotype'])
genotype_all

Unnamed: 0,0,1,2,3,4,...,1137,1138,1139,1140,1141,Unnamed: 12
0,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
1,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
2,./.,./.,./.,./.,./.,...,./.,./.,./.,./.,./.,
...,...,...,...,...,...,...,...,...,...,...,...,...
18167053,./.,0/0,./.,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
18167054,./.,0/0,./.,./.,./.,...,0/0,./.,./.,0/0,./.,
18167055,./.,./.,./.,./.,./.,...,./.,./.,./.,0/0,./.,


Now, we will load in the necessary files to generate our filters. This consists of the sample metadata, to enable a selection of populations, and the codon_degeneracy table generated by build_codon_degeneracy.ipynb.

In [3]:
# Sample metadata: tab-separated file, indexed by the `ox_code` column.
# Because of that index, `df_samples[...].index` yields ox_code labels, not row positions.
df_samples = pandas.read_csv('../phase2.AR1/samples/samples.meta.txt', sep='\t', index_col='ox_code')
df_samples.head()

Unnamed: 0_level_0,src_code,population,country,location,site,contributor,contact,year,m_s,sex,n_sequences,mean_coverage,ebi_sample_acc,latitude,longitude
ox_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95033368,30.99,ERS311878,5.60858,-1.54926
AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95843804,31.7,ERS311886,5.60858,-1.54926
AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,107420666,35.65,ERS311894,4.91217,-1.77397
AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,95993752,29.46,ERS311902,4.91217,-1.77397
AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,103044262,33.67,ERS311910,4.91217,-1.77397


In [4]:
# Load the per-position codon degeneracy array for the selected chromosome from the
# zipped zarr store written by build_codon_degeneracy.ipynb.
# NOTE(review): the displayed output contains -1 values; presumably these mark
# positions without a degeneracy class — confirm against the builder notebook.
codon_degeneracy = zarr.open_group("../outputs/build_blocks/codon_degeneracy.zarr.zip")[chrom][:]
codon_degeneracy

array([-1, -1, -1, ..., -1, -1, -1], dtype=int8)

## Generating allele counts

To be able to filter, we first need allele count data to filter on.

In [5]:
# Reference sequence for the selected chromosome, stored as an (n, 1) column of
# single-byte strings.
# NOTE(review): assumes the slice below returns a 1-D array with one base per
# element — confirm against the setup notebook.
genome = phase2_ar1.genome_agamp3[chrom][:]
genome_a = np.zeros((len(genome),1), dtype = '|S1')
# One vectorised column assignment replaces a Python-level loop over every base,
# which is very slow on a whole chromosome.
genome_a[:, 0] = genome
genome = genome_a

# Upper-case the lower-case bases. Only a, c, g, t and n are mapped; any other
# character is left untouched.
ref = genome[:,0].astype('U')
for base in 'acgtn':
    ref[ref == base] = base.upper()


# Bring together all the alleles per variant: column 0 is the reference base at
# the variant position (pos - 1 is used as the index into `ref`), followed by the
# ALT columns.
refv = ref[pos-1]
refv = np.reshape(refv, (-1,1))
alts_mat = np.concatenate((refv,alt),axis=1)

In [6]:
from copy import deepcopy

In [7]:
# Debug check: prints the repr of the imported deepcopy function.
print(deepcopy)

<function deepcopy at 0x7f2890fe6d08>


In [8]:
def build_mc(pop):
    """Sort the alleles of every variant by their count in one population.

    Reads the module-level ``allele_counts``, ``chrom`` and ``alts_mat``.

    Parameters
    ----------
    pop : str
        Population key looked up as ``allele_counts[chrom][pop]``.

    Returns
    -------
    altsa : ndarray
        Alleles of each variant reordered by descending count; alleles with a
        zero count are blanked to ``''``.
    aca : ndarray of int
        Counts in the same order as ``altsa``; ``-1`` marks alleles that are
        either absent from ``alts_mat`` (empty string) or have a zero count.
    b : ndarray of int
        Per-variant sum of the remaining (non ``-1``) counts.
    totinds : scalar
        Maximum of the first count column (column 0 of the allele counts).
    """
    ac = allel.AlleleCountsArray(allele_counts[chrom][pop])
    totinds = np.max(ac[:, 0])

    # Mark slots that do not correspond to a real allele with -1.
    # `alts_mat` is only read here, so no defensive copy of it is needed.
    ac1 = ac.astype(int)
    ac1[alts_mat == ''] = -1

    # Column order that sorts each row by descending count (the -1 slots go last).
    indices = np.flip(ac1.argsort(), axis=1)
    # A column vector of row numbers broadcasts against `indices`, which avoids
    # materialising a full row-index table. Fancy indexing returns new arrays,
    # so the in-place edits below never touch `alts_mat` or `ac1`.
    rows = np.arange(len(ac)).reshape(-1, 1)
    altsa = alts_mat[rows, indices]
    aca = ac1[rows, indices]

    # Drop alleles that occur zero times.
    altsa[aca == 0] = ''
    aca[aca == 0] = -1

    # Total observed count per variant, ignoring the -1 placeholders.
    b = np.where(aca == -1, 0, aca).sum(axis=1)

    return altsa, aca, b, totinds

In [9]:
# Build the sorted allele/count tables per population. Only BFgam and BFcol are
# active; the other populations are kept commented out for reference.
# Each call returns (sorted alleles, sorted counts, per-site totals, max count).
#
# Open question: seem to be missing GNcol, which on the map has 1 female individual
# but here seems to have 4. Either a map mistake, different data, or three of the
# individuals couldn't be identified by gender? Seems improbable...
#p1a, p1b, p1c, t1 = build_mc('GM')
#p2a, p2b, p2c, t2 = build_mc('GW')
#p3a, p3b, p3c, t3 = build_mc('GNgam')
p4a, p4b, p4c, t4 = build_mc('BFgam')
p5a, p5b, p5c, t5 = build_mc('BFcol')
#p6a, p6b, p6c, t6 = build_mc('CIcol')
#p7a, p7b, p7c, t7 = build_mc('GHgam')
#p8a, p8b, p8c, t8 = build_mc('GHcol')
#p9a, p9b, p9c, t9 = build_mc('GQgam')
#p10a, p10b, p10c, t10 = build_mc('GAgam')
#p11a, p11b, p11c, t11 = build_mc('CMgam')
#p12a, p12b, p12c, t12 = build_mc('AOcol')
#p13a, p13b, p13c, t13 = build_mc('UGgam')
#p14a, p14b, p14c, t14 = build_mc('KE')
#p15a, p15b, p15c, t15 = build_mc('FRgam')


In [20]:
def build_extended_mc(altsa, aca, b, totinds):
    """Expand per-variant allele tables to one row per genome position.

    Reads the module-level ``genome``, ``pos`` and ``ref``. Positions that are
    not listed in ``pos`` are filled with defaults: the reference base with a
    count of ``totinds`` in column 0, and zeros / empty strings elsewhere.

    Parameters
    ----------
    altsa, aca, b, totinds
        The four values returned by ``build_mc`` for one population; the first
        four columns of ``altsa`` and ``aca`` are used.

    Returns
    -------
    ext_ac : ndarray of int, shape (len(genome), 4)
        Allele counts per genome position.
    ext_alt : ndarray of str, shape (len(genome), 4)
        Alleles per genome position.
    ext_b : ndarray of int, shape (len(genome), 1)
        Total count per genome position.
    """
    n_sites = len(genome)
    # `pos - 1` is used throughout as the row index of each variant in the
    # genome-length arrays.
    variant_rows = pos - 1

    # Counts: default to "everything is the reference allele".
    ext_ac = np.zeros((n_sites, 4), dtype=int)
    ext_ac[:, 0] = totinds
    for col in range(4):
        ext_ac[variant_rows, col] = aca[:, col]

    # Alleles: default to the reference base in column 0.
    ext_alt = np.zeros((n_sites, 4), dtype='U')
    ext_alt[:, 0] = ref
    for col in range(4):
        ext_alt[variant_rows, col] = altsa[:, col]

    # Totals: default to the full count, overwritten at variant positions.
    ext_b = np.ones((n_sites, 1), dtype=int) * (totinds)
    ext_b[variant_rows] = np.reshape(b, (-1, 1))

    # `ext_alt` is already a unicode array; copy=False lets NumPy skip a second
    # genome-sized copy when no conversion is needed.
    return ext_ac, ext_alt.astype('U', copy=False), ext_b

In [17]:
# Debug check on `t4`, which is assigned by the build_mc('BFgam') call earlier in
# the notebook. The saved NameError shows this cell was run before that call.
print(t4.shape)

NameError: name 't4' is not defined

In [22]:
# Expand the per-variant tables for BFgam and BFcol to genome-length arrays
# (counts, alleles, totals). The saved output shows this cell raised MemoryError.
BFg_ac, BFg_alt, BFg_b = build_extended_mc(p4a, p4b, p4c, t4)
BFc_ac, BFc_alt, BFc_b = build_extended_mc(p5a, p5b, p5c, t5)
# Kenya is kept commented out; it needs the matching build_mc('KE') call enabled first.
#KE_ac, KE_alt, KE_b = build_extended_mc(p14a, p14b, p14c, t14)

MemoryError: 

## Population subset

First, we will isolate a group of two populations from the data to study from Burkina Faso - BFGam and BFCol. To do this, we need to call on the sample metadata.

Firstly, let's define which populations we'll be using via their codes in the sample metadata, and count how many individuals belong to each at this stage.

In [35]:
# Population codes as they appear in the `population` column of the sample metadata.
pop1 = 'BFgam'
pop2 = 'BFcol'
# Number of metadata rows (samples) belonging to each population.
n_samples_pop1 = np.count_nonzero(df_samples.population == pop1)
n_samples_pop2 = np.count_nonzero(df_samples.population == pop2)
print(pop1, n_samples_pop1, pop2, n_samples_pop2)

BFgam 92 BFcol 75


In [38]:
#pop = pop1 & pop2
#n_samples_pop = np.count_nonzero(df_samples.population == pop)
#print(n_samples_pop)
#pop

In [36]:
# Map each population name to the integer positions of its samples.
# scikit-allel's count_alleles_subpops expects sample indices (ints); the
# DataFrame index holds ox_code labels because the metadata was read with
# index_col='ox_code', so row positions are taken from the boolean mask instead.
# NOTE(review): assumes the metadata rows are in the same order as the samples
# axis of the genotype array — confirm against the callset's sample list.
subpops = {
    pop1: np.flatnonzero((df_samples.population == pop1).values).tolist(),
    pop2: np.flatnonzero((df_samples.population == pop2).values).tolist(),
}

In [42]:
# Count alleles per subpopulation directly from the genotype calls.
# NOTE(review): this rebinds `acs`, which the data-loading cell set to the
# accessibility mask; anything run after this cell sees the allele counts instead.
# NOTE(review): count_alleles_subpops is documented as taking sample indices;
# confirm that `subpops` holds integer positions rather than ox_code labels.
# The saved output shows this call was interrupted by hand (KeyboardInterrupt).
acs = genotype_all.count_alleles_subpops(subpops)
acs

KeyboardInterrupt: 

## Filtering for degeneracy

Here is Emma's filter-generating function. I am currently in the process of adapting it.

In [None]:
def new_filter_tot():
    """Build boolean site filters over the genome-length BFgam/BFcol tables.

    Reads the module-level ``BFg_ac``, ``BFc_ac``, ``BFg_b``, ``BFc_b``, ``t4``,
    ``t5``, ``phase2_ar1``, ``chrom`` and ``codon_degeneracy``.

    Returns
    -------
    filter_new_tot : boolean array
        Sites that are accessible, have a third-allele count <= 0 in both
        populations, and whose total count equals the population maximum
        (``t4`` / ``t5``) in both populations.
    filter_new_0 : boolean array
        ``filter_new_tot`` restricted to sites where ``codon_degeneracy == 1``.
    filter_new_4 : boolean array
        ``filter_new_tot`` restricted to sites where ``codon_degeneracy == 4``.
    """
    # At most two alleles observed: the third-ranked count is zero or the -1 placeholder.
    # (Kenya term kept for reference: & (KE_ac[:,2]<=0))
    biallelic = (BFg_ac[:, 2] <= 0) & (BFc_ac[:, 2] <= 0)

    # Every chromosome called: per-site total equals the population maximum.
    # (Kenya term kept for reference: & (KE_b[:,0]==t14))
    fully_called = (BFg_b[:, 0] == t4) & (BFc_b[:, 0] == t5)

    # Outgroup validity filter, not yet adapted (kept from the original function):
    #sum_a = np.sum(a_ac, axis=1)>= 1
    #filter_a = sum_a & np.invert((a_ac[:,0]>0) & (a_alt[:,0]=='N'))
    #sum_q = np.sum(q_ac, axis=1)>= 1
    #filter_q = sum_q & np.invert((q_ac[:,0]>0) & (q_alt[:,0] =='N'))
    #sum_mer = np.sum(mer_ac, axis=1)>= 1
    #filter_mer = sum_mer & np.invert((mer_ac[:,0]>0) & (mer_alt[:,0] =='N'))
    #sum_mel = np.sum(mel_ac, axis=1)>= 1
    #filter_mel = sum_mel & np.invert((mel_ac[:,0]>0) & (mel_alt[:,0] =='N'))
    #filter_c = (c_alt[:,0]!='N')
    #filter_e = (e_alt[:,0]!='N')
    #outgroup_full = filter_a & filter_q & filter_mer & filter_mel & filter_c & filter_e

    # Accessibility mask, re-read locally so the result does not depend on the
    # module-level `acs` name.
    accessible = phase2_ar1.accessibility[chrom]['is_accessible'][:]

    # Combined filter (outgroup term still disabled: & outgroup_full).
    filter_new_tot = accessible & biallelic & fully_called

    # Degeneracy classes. NOTE(review): 0-fold sites are selected with the code 1;
    # confirm this matches the coding used by build_codon_degeneracy.ipynb.
    filter_new_0 = filter_new_tot & (codon_degeneracy == 1)
    filter_new_4 = filter_new_tot & (codon_degeneracy == 4)

    return filter_new_tot, filter_new_0, filter_new_4

In [None]:
# Boolean mask of positions whose codon degeneracy code is 4; displayed as a quick check.
fourfold = codon_degeneracy==4
fourfold