### Extract _Gammarus_ sp. haplotypes from metabarcoding data.

Import functions.

In [1]:
import metaBEAT_global_misc_functions as mb

Extract OTU ids for relevant species.

In [2]:
OTU_table = mb.load_BIOM('../2-metaBEAT/clip-trim-30_merge_forw-only_c0.97m3_blast_min0.85_GLOBAL-latest/filtered.biom')

dictionary = mb.find_target_OTUs_by_taxonomy(OTU_table, target='Gammarus_fossarum', level='species')

OTUs = {}
OTUs['Gammarus_fossarum'] = dictionary.keys()

dictionary = mb.find_target_OTUs_by_taxonomy(OTU_table, target='Gammarus_pulex', level='species')

OTUs['Gammarus_pulex'] = dictionary.keys()

print ""
print OTUs


OTUs_as_list = []
for sp in OTUs:
    OTUs_as_list.extend(OTUs[sp])

print "\nOTU list:"
print OTUs_as_list

SEARCH TERM: 'Gammarus_fossarum'
Found taxonomy metadata with OTUs - ok!
Screening at taxonomic level: 'species'


Identified 1 OTU(s) assigned to 'Gammarus_fossarum'.
SEARCH TERM: 'Gammarus_pulex'
Found taxonomy metadata with OTUs - ok!
Screening at taxonomic level: 'species'


Identified 1 OTU(s) assigned to 'Gammarus_pulex'.

{'Gammarus_pulex': [u'INV049|1_2107_17799_10892_1_ex'], 'Gammarus_fossarum': [u'INV010|1_2109_14978_12581_1_ex']}

OTU list:
[u'INV049|1_2107_17799_10892_1_ex', u'INV010|1_2109_14978_12581_1_ex']


Identify samples contributing to each of the OTUs.

In [3]:
per_OTU_samples={}

samples = OTU_table.ids(axis='sample')

for OTU in OTUs_as_list:
#    print OTU
    per_OTU_samples[OTU] = []
    obs = OTU_table.data(OTU, axis='observation')
    for i in range(len(obs)):
        if int(obs[i]) > 0:
#            print "\t%s" %samples[i]
            per_OTU_samples[OTU].append(samples[i])

for OTU in per_OTU_samples:
    print OTU,len(per_OTU_samples[OTU])

INV010|1_2109_14978_12581_1_ex 43
INV049|1_2107_17799_10892_1_ex 28


Identify centroids for the relevant OTUs.

In [4]:
per_OTU_centroids={}

for OTU in per_OTU_samples:
    print OTU,len(per_OTU_samples[OTU])
print "#######\n"

uc=open('../2-metaBEAT/clip-trim-30_merge_forw-only_c0.97m3_blast_min0.85_GLOBAL-latest/GLOBAL/global.uc', 'r')

for line in uc:
    if line.startswith('H'):
        if line.strip().split("\t")[9] in OTUs_as_list:
#            print "hit: %s\t%s" %(line.strip().split("\t")[9],line.strip().split("\t")[8])
            if not per_OTU_centroids.has_key(line.strip().split("\t")[9]):
                per_OTU_centroids[line.strip().split("\t")[9]]=[line.strip().split("\t")[9]]
                
            if line.strip().split("\t")[8].split("|")[0] in per_OTU_samples[line.strip().split("\t")[9]]:
                per_OTU_centroids[line.strip().split("\t")[9]].append(line.strip().split("\t")[8])

uc.close()

#print per_OTU_centroids

for OTU in per_OTU_centroids:
    print OTU,str(len(per_OTU_centroids[OTU]))
    for c in sorted(per_OTU_centroids[OTU]):
        print "\t"+c
#print per_OTU_centroids

INV010|1_2109_14978_12581_1_ex 43
INV049|1_2107_17799_10892_1_ex 28
#######

INV010|1_2109_14978_12581_1_ex 48
	BLANK-1|1_2101_9027_9556_1_ex
	INV005|1_1106_18377_11598_1_ex
	INV010|1_2109_14978_12581_1_ex
	INV027|1_1107_16209_9821_1_ex
	INV028|1_2114_10440_22302_1_ex
	INV029|1_1101_28409_12373_1_ex
	INV030|1_1106_14472_24032_1_ex
	INV031D|1_1103_3131_11859_1_ex
	INV033|1_1104_13337_5897_1_ex
	INV034D|1_1106_13703_15463_1_ex
	INV035|1_2110_28465_13888_1_ex
	INV036|1_1112_25865_20502_1_ex
	INV037|1_1105_10909_24517_1_ex
	INV038|1_1106_12622_16967_1_ex
	INV039|1_1102_27662_21209_1_ex
	INV040|1_1107_17072_25379_1_ex
	INV041|1_1104_21482_9292_1_ex
	INV041|1_2103_8694_15742_1_ex
	INV041|1_2112_26301_17798_1_ex
	INV042|1_2107_16725_11828_1_ex
	INV049|1_1103_24362_13935_1_ex
	INV053|1_2110_9028_9223_1_ex
	INV055|1_1103_14425_5743_1_ex
	INV056|1_1107_16230_7411_1_ex
	INV057|1_1101_11216_15815_1_ex
	INV059|1_1107_8049_11919_1_ex
	INV059|1_2114_14976_14354_1_ex
	INV062|1_1104_24665_13340_1_ex
	I

Specify a unique haplotype id for each OTU.

In [5]:
OTUs_synonyms = []
for sp in OTUs:
    count=0
#    OTUs_as_list.extend(OTUs[sp])
    
    for otu in OTUs[sp]:
        OTUs_synonyms.append(sp+'_'+str(count)+'_MB')
        count+=1
    
print OTUs_as_list
print OTUs_synonyms

[u'INV049|1_2107_17799_10892_1_ex', u'INV010|1_2109_14978_12581_1_ex']
['Gammarus_pulex_0_MB', 'Gammarus_fossarum_0_MB']


Re-derive the samples contributing to each of the OTUs from the centroid lists.

In [6]:
per_OTU_samples={}
            
for otu in per_OTU_centroids:
    per_OTU_samples[otu] = []
    for centroid in per_OTU_centroids[otu]:
        per_OTU_samples[otu].append(centroid.split("|")[0])
    

    per_OTU_samples[otu]=list(set(per_OTU_samples[otu]))
    print "\n"+otu,len(per_OTU_samples[otu]),sorted(per_OTU_samples[otu])



INV010|1_2109_14978_12581_1_ex 43 ['BLANK-1', 'INV005', 'INV010', 'INV027', 'INV028', 'INV029', 'INV030', 'INV031D', 'INV033', 'INV034D', 'INV035', 'INV036', 'INV037', 'INV038', 'INV039', 'INV040', 'INV041', 'INV042', 'INV049', 'INV053', 'INV055', 'INV056', 'INV057', 'INV059', 'INV062', 'INV063', 'SOI005', 'SOI029', 'SOI035', 'SOI037', 'SOI038', 'SOI039', 'SOI040', 'SOI057', 'SOI062', 'WAT029', 'WAT035', 'WAT036', 'WAT037', 'WAT038', 'WAT039', 'WAT040', 'WAT041']

INV049|1_2107_17799_10892_1_ex 28 ['BLANK-1', 'INV011', 'INV013', 'INV015', 'INV016', 'INV017', 'INV018', 'INV019', 'INV021', 'INV023', 'INV025', 'INV026', 'INV042', 'INV043', 'INV044', 'INV046', 'INV048', 'INV049', 'INV050', 'INV051', 'INV052', 'INV054', 'INV056', 'INV058', 'INV060', 'INV061', 'INV063', 'INV064']


Create global fasta file containing all reads for each OTU and also create separate fasta files containing all sequences contributing to the same OTU per sample.

In [7]:

from Bio import SeqIO

for i in range(len(OTUs_as_list)):
    otu=OTUs_as_list[i]
    syn=OTUs_synonyms[i]
    print "OTU: %s -> %s.fa" %(otu,syn)
    read_ids_per_OTU=[]
    seqs_per_OTU=[]
    for sample in per_OTU_samples[otu]:
        seqs_per_sample=[]
#        print "Sample: %s" %sample
        read_ids=[]
        #collect relevant centroids
        centroids=[]
        for c in per_OTU_centroids[otu]:
#            print c
            if c.startswith(sample+'|'):
                for c_ind in c.split("|")[1:]:
                    centroids.append(c_ind)
                
#        print "Cenroids: %s" %centroids
        
        #extract read ids from uc file
        read_ids.extend(centroids[:])
        uc=open('../2-metaBEAT/clip-trim-30_merge_forw-only_c0.97m3_blast_min0.85_GLOBAL-latest/'+sample+'/'+sample+'.uc', 'r')
        for line in uc:
            if line.startswith('H'):
                if line.strip().split("\t")[9] in centroids:
#                    print line
                    read_ids.append(line.strip().split("\t")[8])
        uc.close()            
#        print "READ IDS: %i" %len(read_ids)
        
        #extract reads and write to seqrec object
        
        fasta=open('../2-metaBEAT/clip-trim-30_merge_forw-only_c0.97m3_blast_min0.85_GLOBAL-latest/'+sample+'/'+sample+'_trimmed.fasta', 'r')
        for r in SeqIO.parse(fasta, 'fasta'):
            if r.id in read_ids:
                r.id = sample+'|'+r.id
                r.description = r.id
                seqs_per_sample.append(r)
                
            
        fasta.close()
        
        out=open(sample+'_'+syn+'.fasta', 'w')
        SeqIO.write(seqs_per_sample, out, 'fasta')
        out.close()
        
        seqs_per_OTU.extend(seqs_per_sample[:])
        
        
    #Write out global fasta per OTU, containing all reads across all samples
    out=open(syn+'.fa', 'w')
    SeqIO.write(seqs_per_OTU, out, 'fasta')
    out.close()

OTU: INV049|1_2107_17799_10892_1_ex -> Gammarus_pulex_0_MB.fa
OTU: INV010|1_2109_14978_12581_1_ex -> Gammarus_fossarum_0_MB.fa


For each OTU, cluster all reads at 100% identity per sample and extract the most abundant haplotype.

Identify most abundant haplotype per sample.

In [8]:
import glob
import os

for i in range(len(OTUs_synonyms)):
    print OTUs_synonyms[i]
    hts_per_OTU=[]
    seqs_per_OTU=[]
    for f in glob.glob('*_'+OTUs_synonyms[i]+".fasta"):
        print "\t"+f,
        #cluster at 100% similarity full length
        mb.vsearch_cluster_full_length(infile=f, cluster_match=float(1), threads=5, sampleID=f)

        hts_per_OTU.append(mb.find_most_abundant_seq_from_uc(uc=f+'.uc'))
        print " - longest: %s - cleanup" %hts_per_OTU[-1]
        os.remove(f+'.uc')
        os.remove(f+'_centroids.fasta')
        os.remove(f)

    print "extracting hts for %s -> %s" %(OTUs_synonyms[i],OTUs_synonyms[i]+'_hts.fasta')
    for r in SeqIO.parse(OTUs_synonyms[i]+'.fa', 'fasta'):
        if r.id in hts_per_OTU:
            seqs_per_OTU.append(r)
                
    fasta.close()
    print "final cleanup .. ",
    os.remove(OTUs_synonyms[i]+'.fa')
    print "DONE!\n"
        
    out=open(OTUs_synonyms[i]+'_hts.fasta', 'w')
    SeqIO.write(seqs_per_OTU, out, 'fasta')
    out.close()

Gammarus_pulex_0_MB
	INV017_Gammarus_pulex_0_MB.fasta  - longest: INV017|1_1101_7484_24995_1_ex - cleanup
	INV060_Gammarus_pulex_0_MB.fasta  - longest: INV060|1_1102_18269_3301_1_ex - cleanup
	INV061_Gammarus_pulex_0_MB.fasta  - longest: INV061|1_1107_22150_26420_1_ex - cleanup
	INV054_Gammarus_pulex_0_MB.fasta  - longest: INV054|1_1106_16595_20226_1_ex - cleanup
	INV026_Gammarus_pulex_0_MB.fasta  - longest: INV026|1_1112_20443_19711_1_ex - cleanup
	INV019_Gammarus_pulex_0_MB.fasta  - longest: INV019|1_1110_2257_17196_1_ex - cleanup
	INV056_Gammarus_pulex_0_MB.fasta  - longest: INV056|1_1101_18907_8328_1_ex - cleanup
	INV018_Gammarus_pulex_0_MB.fasta  - longest: INV018|1_1102_12328_25835_1_ex - cleanup
	INV015_Gammarus_pulex_0_MB.fasta  - longest: INV015|1_1102_12580_11576_1_ex - cleanup
	INV043_Gammarus_pulex_0_MB.fasta  - longest: INV043|1_1104_29375_14583_1_ex - cleanup
	INV023_Gammarus_pulex_0_MB.fasta  - longest: INV023|1_1114_5345_10453_1_ex - cleanup
	INV050_Gammarus_pulex_0_MB.

Cluster all haplotypes at 97 % to identify single representative sequence.

___G. fossarum___

In [9]:
!cat Gammarus_fossarum_* > Gammarus_fossarum_hts.fasta

In [10]:
# Cluster the G. fossarum haplotypes at 97% identity.
mb.vsearch_cluster_full_length(
    infile='Gammarus_fossarum_hts.fasta',
    sampleID='Gammarus_fossarum_hts',
    cluster_match=0.97,
    threads=5,
)

___G. pulex___

In [11]:
!cat Gammarus_pulex_0_MB_hts.fasta > Gammarus_pulex_hts.fasta

In [12]:
# Cluster the G. pulex haplotypes at 97% identity.
mb.vsearch_cluster_full_length(
    infile='Gammarus_pulex_hts.fasta',
    sampleID='Gammarus_pulex_hts',
    cluster_match=0.97,
    threads=5,
)

Collapse haplotypes to single file and rename.

In [13]:
from Bio import SeqIO
import glob

seqs = []
for sp in OTUs:
    print sp
    for r in SeqIO.parse(open(sp+'_hts_centroids.fasta'), 'fasta'):
        r.description = sp+'|'+r.id
        r.id = r.description
        seqs.append(r)
        
out=open('Gammarus_sp_from_metaBEAT.fasta', 'w')
SeqIO.write(seqs, out, 'fasta')
out.close()

Gammarus_pulex
Gammarus_fossarum
