### Part 0: Import and Download NCBI Taxa Database

Reference: http://etetoolkit.org/docs/latest/tutorial/tutorial_ncbitaxonomy.html

In [1]:
import numpy as np
import pandas as pd
from ete3 import NCBITaxa

In [2]:
# Download NCBI Taxa Database
# Create the ete3 NCBITaxa handle used by every taxonomy lookup in the cells below
# (get_lineage, get_rank, get_taxid_translator).
# NOTE(review): per the ETE tutorial referenced above, the first NCBITaxa() call is
# expected to download and build a local taxonomy database if none exists — confirm.
ncbi = NCBITaxa()
# Disabled call; presumably refreshes the local database from NCBI when uncommented — confirm.
#ncbi.update_taxonomy_database()

### Part 1: Read Nanopore Data

In [50]:
#################################################
# Select raw data
# (1) 'exit_status' == 'Classification successful'
# (2) 'barcode' == BC01, BC02, BC03, BC04
#
# For each barcode, the successfully classified reads are written to
# ./190807result/raw_sucess_<barcode>.csv and the number of reads is printed.

raw = pd.read_csv('./nanopore_preprocess_data/190807_classification_16s_barcode-v1.csv', encoding = 'utf-8')

# The success filter does not depend on the barcode, so evaluate it only once.
is_classified = raw['exit_status'] == 'Classification successful'

raw_sucess_by_barcode = {}
for barcode in ['BC01', 'BC02', 'BC03', 'BC04']:
    reads = raw[is_classified & (raw['barcode'] == barcode)]

    reads.to_csv('./190807result/raw_sucess_{}.csv'.format(barcode), encoding='utf-8')
    print('{} read count:'.format(barcode), len(reads))

    raw_sucess_by_barcode[barcode] = reads

# Per-barcode names read by the later cells (original spelling kept on purpose
# so those cells keep working).
raw_sucess_BC01 = raw_sucess_by_barcode['BC01']
raw_sucess_BC02 = raw_sucess_by_barcode['BC02']
raw_sucess_BC03 = raw_sucess_by_barcode['BC03']
raw_sucess_BC04 = raw_sucess_by_barcode['BC04']



BC01 read count: 73
BC02 read count: 138
BC03 read count: 30
BC04 read count: 6893


In [6]:
#############################################
# Reads per taxid for BC01, BC02, BC03, BC04
#
# For each barcode, count the non-null read_id values per taxid and write the
# resulting one-column table (column 'read_count', indexed by taxid) to
# ./190807result/taxid_list_<barcode>.csv.

taxid_list_by_barcode = {}
for barcode, reads in [('bc01', raw_sucess_BC01),
                       ('bc02', raw_sucess_BC02),
                       ('bc03', raw_sucess_BC03),
                       ('bc04', raw_sucess_BC04)]:

    read_counts = reads.groupby('taxid')['read_id'].count().to_frame('read_count')
    read_counts.to_csv('./190807result/taxid_list_{}.csv'.format(barcode), encoding='utf-8')

    taxid_list_by_barcode[barcode] = read_counts

# Per-barcode names read by the later cells.
taxid_list_bc01 = taxid_list_by_barcode['bc01']
taxid_list_bc02 = taxid_list_by_barcode['bc02']
taxid_list_bc03 = taxid_list_by_barcode['bc03']
taxid_list_bc04 = taxid_list_by_barcode['bc04']

# Only the last expression of a cell is rendered, so preview BC04 here.
taxid_list_bc04.head()

Unnamed: 0_level_0,read_count
taxid,Unnamed: 1_level_1
2,84
403,1
543,346
545,1
546,23


### Part 2: 16S Analysis

### No selection on accuracy (all successfully classified reads)

In [51]:
##########################################
# Lineage tables for BC01, BC02, BC03, BC04
##########################################

def build_lineage_table(taxids):
    """Return a DataFrame mapping each taxid to the ancestors in its NCBI lineage.

    Parameters
    ----------
    taxids : list
        Taxids to look up with the module-level ``ncbi`` handle.

    Returns
    -------
    pandas.DataFrame
        One row per input taxid, in input order. The first column is 'taxid';
        every other column is a rank label returned by ``ncbi.get_rank`` and
        holds the ancestor's taxid as a string. Ranks that do not occur in a
        taxid's lineage are left missing for that row.

    Notes
    -----
    If two nodes of one lineage report the same rank label, the one that comes
    later in the list returned by ``ncbi.get_lineage`` overwrites the earlier one.
    """
    columns = ['taxid']          # column order = order in which ranks are first seen
    records = []

    for taxid in taxids:
        lineage = ncbi.get_lineage(taxid)
        # Single lookup for the whole lineage instead of one query per node.
        rank_of = ncbi.get_rank(lineage)

        record = {'taxid': taxid}
        for node in lineage:
            rank = rank_of[node]
            if rank not in columns:
                columns.append(rank)
            record[rank] = str(node)
        records.append(record)

    return pd.DataFrame(records, columns=columns)


##############
# BC01 lineage

taxid01 = list(taxid_list_bc01.index)
read_count01 = list(taxid_list_bc01['read_count'])
total_count01 = len(raw_sucess_BC01)
id_lineage01 = build_lineage_table(taxid01)


##############
# BC02 lineage

taxid02 = list(taxid_list_bc02.index)
read_count02 = list(taxid_list_bc02['read_count'])
total_count02 = len(raw_sucess_BC02)
id_lineage02 = build_lineage_table(taxid02)


##############
# BC03 lineage

taxid03 = list(taxid_list_bc03.index)
read_count03 = list(taxid_list_bc03['read_count'])
total_count03 = len(raw_sucess_BC03)
id_lineage03 = build_lineage_table(taxid03)


##############
# BC04 lineage

taxid04 = list(taxid_list_bc04.index)
read_count04 = list(taxid_list_bc04['read_count'])
total_count04 = len(raw_sucess_BC04)
id_lineage04 = build_lineage_table(taxid04)


# Only the last expression of a cell is rendered, so preview BC04 here.
id_lineage04.head()




Unnamed: 0,taxid,no rank,superkingdom,phylum,class,order,family,genus,species,species group,subspecies,subphylum
0,2,131567,2,,,,,,,,,
1,403,131567,2,1224.0,1236.0,135618.0,403.0,,,,,
2,543,131567,2,1224.0,1236.0,91347.0,543.0,,,,,
3,545,131567,2,1224.0,1236.0,91347.0,543.0,544.0,545.0,,,
4,546,131567,2,1224.0,1236.0,91347.0,543.0,544.0,546.0,1344959.0,,


In [29]:
#####################################################
# superkingdom: Read count for BC01, BC02, BC03, BC04
#
# For each barcode: sum the reads per superkingdom-level ancestor, attach the
# scientific name, sort by read count (descending), add the percentage of all
# successfully classified reads of that barcode, and write the table to
# ./190807result/<barcode>_count_superkingdom_noSelect.csv.

for barcode, taxids, lineage, read_counts, total_count in [
        ('BC01', taxid01, id_lineage01, read_count01, total_count01),
        ('BC02', taxid02, id_lineage02, read_count02, total_count02),
        ('BC03', taxid03, id_lineage03, read_count03, total_count03),
        ('BC04', taxid04, id_lineage04, read_count04, total_count04)]:

    superkingdom = pd.DataFrame(list(zip(taxids, lineage['superkingdom'], read_counts)),
                                columns=['taxid', 'superkingdom', 'read_count'])

    # Taxids without a superkingdom-level ancestor have a missing key and are
    # dropped by groupby, so the percentages need not add up to 100.
    count_superkingdom = superkingdom.groupby('superkingdom')['read_count'].sum().to_frame()

    # Resolve all names with a single lookup instead of one query per taxon.
    rank_ids = list(count_superkingdom.index)
    id_to_name = ncbi.get_taxid_translator(rank_ids) if rank_ids else {}
    count_superkingdom.insert(0, 'name', [id_to_name[int(rank_id)] for rank_id in rank_ids])

    count_superkingdom = count_superkingdom.sort_values(by=['read_count'], ascending=False)
    count_superkingdom['percentage'] = (count_superkingdom['read_count'] / total_count)*100
    count_superkingdom.to_csv('./190807result/{}_count_superkingdom_noSelect.csv'.format(barcode),
                              encoding='utf-8')

# Only the table of the last barcode (BC04) is rendered.
count_superkingdom

Unnamed: 0_level_0,name,read_count,percentage
superkingdom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Bacteria,6892,99.985493


In [33]:
###############################################
# phylum: Read count for BC01, BC02, BC03, BC04
#
# For each barcode: sum the reads per phylum-level ancestor, attach the
# scientific name, sort by read count (descending), add the percentage of all
# successfully classified reads of that barcode, and write the table to
# ./190807result/<barcode>_count_phylum_noSelect.csv.

for barcode, taxids, lineage, read_counts, total_count in [
        ('BC01', taxid01, id_lineage01, read_count01, total_count01),
        ('BC02', taxid02, id_lineage02, read_count02, total_count02),
        ('BC03', taxid03, id_lineage03, read_count03, total_count03),
        ('BC04', taxid04, id_lineage04, read_count04, total_count04)]:

    phylum = pd.DataFrame(list(zip(taxids, lineage['phylum'], read_counts)),
                          columns=['taxid', 'phylum', 'read_count'])

    # Taxids without a phylum-level ancestor have a missing key and are
    # dropped by groupby, so the percentages need not add up to 100.
    count_phylum = phylum.groupby('phylum')['read_count'].sum().to_frame()

    # Resolve all names with a single lookup instead of one query per taxon.
    rank_ids = list(count_phylum.index)
    id_to_name = ncbi.get_taxid_translator(rank_ids) if rank_ids else {}
    count_phylum.insert(0, 'name', [id_to_name[int(rank_id)] for rank_id in rank_ids])

    count_phylum = count_phylum.sort_values(by=['read_count'], ascending=False)
    count_phylum['percentage'] = (count_phylum['read_count'] / total_count)*100
    count_phylum.to_csv('./190807result/{}_count_phylum_noSelect.csv'.format(barcode),
                        encoding='utf-8')

# Only the table of the last barcode (BC04) is rendered.
count_phylum


Unnamed: 0_level_0,name,read_count,percentage
phylum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
976,Bacteroidetes,3549,51.487016
1239,Firmicutes,1357,19.686639
1224,Proteobacteria,985,14.289859
32066,Fusobacteria,533,7.732482
74201,Verrucomicrobia,369,5.353257
201174,Actinobacteria,1,0.014507


In [38]:
##############################################
# class: Read count for BC01, BC02, BC03, BC04
#
# For each barcode: sum the reads per class-level ancestor, attach the
# scientific name, sort by read count (descending), add the percentage of all
# successfully classified reads of that barcode, and write the table to
# ./190807result/<barcode>_count_class_noSelect.csv.

for barcode, taxids, lineage, read_counts, total_count in [
        ('BC01', taxid01, id_lineage01, read_count01, total_count01),
        ('BC02', taxid02, id_lineage02, read_count02, total_count02),
        ('BC03', taxid03, id_lineage03, read_count03, total_count03),
        ('BC04', taxid04, id_lineage04, read_count04, total_count04)]:

    # Trailing underscore because `class` is a reserved word.
    class_ = pd.DataFrame(list(zip(taxids, lineage['class'], read_counts)),
                          columns=['taxid', 'class', 'read_count'])

    # Taxids without a class-level ancestor have a missing key and are
    # dropped by groupby, so the percentages need not add up to 100.
    count_class = class_.groupby('class')['read_count'].sum().to_frame()

    # Resolve all names with a single lookup instead of one query per taxon.
    rank_ids = list(count_class.index)
    id_to_name = ncbi.get_taxid_translator(rank_ids) if rank_ids else {}
    count_class.insert(0, 'name', [id_to_name[int(rank_id)] for rank_id in rank_ids])

    count_class = count_class.sort_values(by=['read_count'], ascending=False)
    count_class['percentage'] = (count_class['read_count'] / total_count)*100
    count_class.to_csv('./190807result/{}_count_class_noSelect.csv'.format(barcode),
                       encoding='utf-8')

# Only the table of the last barcode (BC04) is rendered.
count_class

Unnamed: 0_level_0,name,read_count,percentage
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200643,Bacteroidia,3532,51.240389
1236,Gammaproteobacteria,855,12.403888
186801,Clostridia,722,10.474394
909932,Negativicutes,582,8.443348
203490,Fusobacteriia,533,7.732482
203494,Verrucomicrobiae,367,5.324242
28216,Betaproteobacteria,122,1.769912
526524,Erysipelotrichia,7,0.101552
91061,Bacilli,6,0.087045
28221,Deltaproteobacteria,5,0.072537


In [42]:
##############################################
# order: Read count for BC01, BC02, BC03, BC04
#
# For each barcode: sum the reads per order-level ancestor, attach the
# scientific name, sort by read count (descending), add the percentage of all
# successfully classified reads of that barcode, and write the table to
# ./190807result/<barcode>_count_order_noSelect.csv.

for barcode, taxids, lineage, read_counts, total_count in [
        ('BC01', taxid01, id_lineage01, read_count01, total_count01),
        ('BC02', taxid02, id_lineage02, read_count02, total_count02),
        ('BC03', taxid03, id_lineage03, read_count03, total_count03),
        ('BC04', taxid04, id_lineage04, read_count04, total_count04)]:

    order = pd.DataFrame(list(zip(taxids, lineage['order'], read_counts)),
                         columns=['taxid', 'order', 'read_count'])

    # Taxids without an order-level ancestor have a missing key and are
    # dropped by groupby, so the percentages need not add up to 100.
    count_order = order.groupby('order')['read_count'].sum().to_frame()

    # Resolve all names with a single lookup instead of one query per taxon.
    rank_ids = list(count_order.index)
    id_to_name = ncbi.get_taxid_translator(rank_ids) if rank_ids else {}
    count_order.insert(0, 'name', [id_to_name[int(rank_id)] for rank_id in rank_ids])

    count_order = count_order.sort_values(by=['read_count'], ascending=False)
    count_order['percentage'] = (count_order['read_count'] / total_count)*100
    count_order.to_csv('./190807result/{}_count_order_noSelect.csv'.format(barcode),
                       encoding='utf-8')

# Only the table of the last barcode (BC04) is rendered.
count_order

Unnamed: 0_level_0,name,read_count,percentage
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
171549,Bacteroidales,3528,51.182359
91347,Enterobacterales,821,11.910634
186802,Clostridiales,722,10.474394
203491,Fusobacteriales,533,7.732482
1843488,Acidaminococcales,390,5.657914
48461,Verrucomicrobiales,367,5.324242
80840,Burkholderiales,119,1.726389
526525,Erysipelotrichales,7,0.101552
213115,Desulfovibrionales,5,0.072537
186826,Lactobacillales,5,0.072537


In [44]:
###############################################
# family: Read count for BC01, BC02, BC03, BC04
#
# For each barcode: sum the reads per family-level ancestor, attach the
# scientific name, sort by read count (descending), add the percentage of all
# successfully classified reads of that barcode, and write the table to
# ./190807result/<barcode>_count_family_noSelect.csv.

for barcode, taxids, lineage, read_counts, total_count in [
        ('BC01', taxid01, id_lineage01, read_count01, total_count01),
        ('BC02', taxid02, id_lineage02, read_count02, total_count02),
        ('BC03', taxid03, id_lineage03, read_count03, total_count03),
        ('BC04', taxid04, id_lineage04, read_count04, total_count04)]:

    family = pd.DataFrame(list(zip(taxids, lineage['family'], read_counts)),
                          columns=['taxid', 'family', 'read_count'])

    # Taxids without a family-level ancestor have a missing key and are
    # dropped by groupby, so the percentages need not add up to 100.
    count_family = family.groupby('family')['read_count'].sum().to_frame()

    # Resolve all names with a single lookup instead of one query per taxon.
    rank_ids = list(count_family.index)
    id_to_name = ncbi.get_taxid_translator(rank_ids) if rank_ids else {}
    count_family.insert(0, 'name', [id_to_name[int(rank_id)] for rank_id in rank_ids])

    count_family = count_family.sort_values(by=['read_count'], ascending=False)
    count_family['percentage'] = (count_family['read_count'] / total_count)*100
    count_family.to_csv('./190807result/{}_count_family_noSelect.csv'.format(barcode),
                        encoding='utf-8')

# Only the table of the last barcode (BC04) is rendered.
count_family

Unnamed: 0_level_0,name,read_count,percentage
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
815,Bacteroidaceae,2427,35.209633
171552,Prevotellaceae,808,11.722037
543,Enterobacteriaceae,645,9.357319
186803,Lachnospiraceae,578,8.385318
203492,Fusobacteriaceae,533,7.732482
909930,Acidaminococcaceae,390,5.657914
1647988,Akkermansiaceae,338,4.903525
2005525,Tannerellaceae,179,2.596837
995019,Sutterellaceae,95,1.37821
541000,Ruminococcaceae,58,0.841433


In [46]:
##############################################
# genus: Read count for BC01, BC02, BC03, BC04
#
# For each barcode: sum the reads per genus-level ancestor, attach the
# scientific name, sort by read count (descending), add the percentage of all
# successfully classified reads of that barcode, and write the table to
# ./190807result/<barcode>_count_genus_noSelect.csv.

for barcode, taxids, lineage, read_counts, total_count in [
        ('BC01', taxid01, id_lineage01, read_count01, total_count01),
        ('BC02', taxid02, id_lineage02, read_count02, total_count02),
        ('BC03', taxid03, id_lineage03, read_count03, total_count03),
        ('BC04', taxid04, id_lineage04, read_count04, total_count04)]:

    genus = pd.DataFrame(list(zip(taxids, lineage['genus'], read_counts)),
                         columns=['taxid', 'genus', 'read_count'])

    # Taxids without a genus-level ancestor have a missing key and are
    # dropped by groupby, so the percentages need not add up to 100.
    count_genus = genus.groupby('genus')['read_count'].sum().to_frame()

    # Resolve all names with a single lookup instead of one query per taxon.
    rank_ids = list(count_genus.index)
    id_to_name = ncbi.get_taxid_translator(rank_ids) if rank_ids else {}
    count_genus.insert(0, 'name', [id_to_name[int(rank_id)] for rank_id in rank_ids])

    count_genus = count_genus.sort_values(by=['read_count'], ascending=False)
    count_genus['percentage'] = (count_genus['read_count'] / total_count)*100
    count_genus.to_csv('./190807result/{}_count_genus_noSelect.csv'.format(barcode),
                       encoding='utf-8')

# Only the table of the last barcode (BC04) is rendered.
count_genus

Unnamed: 0_level_0,name,read_count,percentage
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
816,Bacteroides,2415,35.035543
838,Prevotella,683,9.908603
848,Fusobacterium,524,7.601915
1506553,Lachnoclostridium,407,5.904541
239934,Akkermansia,338,4.903525
570,Klebsiella,182,2.640360
375288,Parabacteroides,176,2.553315
40544,Sutterella,73,1.059045
2039240,Anaerotignum,70,1.015523
544,Citrobacter,47,0.681851


In [48]:
################################################
# species: Read count for BC01, BC02, BC03, BC04
#
# For each barcode: sum the reads per species-level ancestor, attach the
# scientific name, sort by read count (descending), add the percentage of all
# successfully classified reads of that barcode, and write the table to
# ./190807result/<barcode>_count_species_noSelect.csv.

for barcode, taxids, lineage, read_counts, total_count in [
        ('BC01', taxid01, id_lineage01, read_count01, total_count01),
        ('BC02', taxid02, id_lineage02, read_count02, total_count02),
        ('BC03', taxid03, id_lineage03, read_count03, total_count03),
        ('BC04', taxid04, id_lineage04, read_count04, total_count04)]:

    species = pd.DataFrame(list(zip(taxids, lineage['species'], read_counts)),
                           columns=['taxid', 'species', 'read_count'])

    # Taxids without a species-level ancestor have a missing key and are
    # dropped by groupby, so the percentages need not add up to 100.
    count_species = species.groupby('species')['read_count'].sum().to_frame()

    # Resolve all names with a single lookup instead of one query per taxon.
    rank_ids = list(count_species.index)
    id_to_name = ncbi.get_taxid_translator(rank_ids) if rank_ids else {}
    count_species.insert(0, 'name', [id_to_name[int(rank_id)] for rank_id in rank_ids])

    count_species = count_species.sort_values(by=['read_count'], ascending=False)
    count_species['percentage'] = (count_species['read_count'] / total_count)*100
    count_species.to_csv('./190807result/{}_count_species_noSelect.csv'.format(barcode),
                         encoding='utf-8')

# Only the table of the last barcode (BC04) is rendered.
count_species


Unnamed: 0_level_0,name,read_count,percentage
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
310297,Bacteroides plebeius,854,12.389381
363265,Prevotella stercorea,677,9.821558
821,Bacteroides vulgatus,504,7.311766
856,Fusobacterium varium,486,7.050631
46506,Bacteroides stercoris,443,6.426810
239935,Akkermansia muciniphila,335,4.860003
208479,[Clostridium] bolteae,278,4.033077
47678,Bacteroides caccae,200,2.901494
573,Klebsiella pneumoniae,152,2.205136
823,Parabacteroides distasonis,109,1.581314
