### Part 0: Import and Download NCBI Taxa Database

Reference: http://etetoolkit.org/docs/latest/tutorial/tutorial_ncbitaxonomy.html

In [234]:
import numpy as np
import pandas as pd
from ete3 import NCBITaxa

In [235]:
# Download NCBI Taxa Database
# `ncbi` is the taxonomy handle used by every lookup in this notebook
# (get_lineage, get_rank, get_taxid_translator).
ncbi = NCBITaxa()
# NOTE(review): this refreshes the local taxonomy database every time the cell
# runs, so taxids/names may change between runs and the cell is slow - confirm
# that an unconditional update is intended.
ncbi.update_taxonomy_database()

### Part 1: Read Nanopore Data

In [236]:
#################################################
# Select the raw reads used by the rest of the notebook:
# (1) 'exit_status' == 'Classification successful'
# (2) 'barcode' == 'BC04'

raw = pd.read_csv('./nanopore_preprocess_data/207946_classification_16s_barcode-v1.csv', encoding='utf-8')

is_classified = raw['exit_status'] == 'Classification successful'
is_bc04 = raw['barcode'] == 'BC04'

# Keep rows that satisfy both criteria, highest accuracy first.
raw_sucess_BC04 = raw.loc[is_classified & is_bc04].sort_values(by='accuracy', ascending=False)

print(len(raw_sucess_BC04))



736862


In [237]:
################################################
# Reads with accuracy >= 90: read_count per taxid

data_accuracy90 = raw_sucess_BC04.loc[raw_sucess_BC04['accuracy'] >= 90]
print(len(data_accuracy90))

# One row per taxid; read_count is the number of non-null read_id values.
taxid_list90 = (
    data_accuracy90.groupby('taxid')['read_id']
    .count()
    .to_frame('read_count')
)

taxid_list90.to_csv('./accuracy_read/taxid_list90.csv', encoding='utf-8')
taxid_list90.head()


195245


Unnamed: 0_level_0,read_count
taxid,Unnamed: 1_level_1
2,410
356,1
403,7
543,8596
545,40


In [238]:
################################################
# Reads with accuracy >= 85: read_count per taxid

data_accuracy85 = raw_sucess_BC04.loc[raw_sucess_BC04['accuracy'] >= 85]
print(len(data_accuracy85))

# One row per taxid; read_count is the number of non-null read_id values.
taxid_list85 = (
    data_accuracy85.groupby('taxid')['read_id']
    .count()
    .to_frame('read_count')
)

taxid_list85.to_csv('./accuracy_read/taxid_list85.csv', encoding='utf-8')
taxid_list85.head()

583049


Unnamed: 0_level_0,read_count
taxid,Unnamed: 1_level_1
2,2214
356,3
403,9
471,1
543,25821


In [239]:
################################################
# Reads with accuracy >= 80: read_count per taxid

data_accuracy80 = raw_sucess_BC04.loc[raw_sucess_BC04['accuracy'] >= 80]
print(len(data_accuracy80))

# One row per taxid; read_count is the number of non-null read_id values.
taxid_list80 = (
    data_accuracy80.groupby('taxid')['read_id']
    .count()
    .to_frame('read_count')
)

taxid_list80.to_csv('./accuracy_read/taxid_list80.csv', encoding='utf-8')
taxid_list80.head()


725907


Unnamed: 0_level_0,read_count
taxid,Unnamed: 1_level_1
2,5756
89,1
126,1
154,1
292,1


### Part 2: 16S Analysis

### No accuracy filter

In [240]:
# taxid / read-count table for the unfiltered analysis
# (columns 'taxids' and 'Freq').
data = pd.read_csv('./nanopore_preprocess_data/taxid_list.csv', encoding='utf-8')

taxid = data['taxids'].tolist()
read_count = data['Freq'].tolist()

# Denominator for the percentage columns computed in the per-rank cells.
total_count = len(raw_sucess_BC04)

###############
# taxid -> scientific name, for every taxid in the list
name = ncbi.get_taxid_translator(taxid)

###############
# taxid -> rank, for the first taxid only
rank = ncbi.get_rank([taxid[0]])



In [241]:
##########################################################
# Seed the lineage table with one row per classified taxid;
# the per-rank columns are filled in afterwards.

id_lineage = pd.DataFrame({'taxid': taxid})
id_lineage.head()

Unnamed: 0,taxid
0,2
1,89
2,126
3,154
4,292


In [242]:
###############################################
# Fill id_lineage with one column per taxonomic rank: for every classified
# taxid, store (as a string) the ancestor taxid found at each rank of its
# NCBI lineage.

for i in range(len(taxid)):
    lineage = ncbi.get_lineage(taxid[i])

    # One rank query for the whole lineage instead of one query per node.
    ranks = ncbi.get_rank(lineage)

    for node in lineage:
        # Nodes sharing a rank label write to the same column, so the last
        # one visited overwrites the earlier ones.
        id_lineage.at[i, ranks[node]] = str(node)
        
        



In [243]:
# Preview the first 10 rows of the per-rank lineage table.
id_lineage.head(10)

Unnamed: 0,taxid,no rank,superkingdom,phylum,class,order,genus,species,family,species group,subspecies,subphylum,species subgroup,suborder
0,2,131567,2,,,,,,,,,,,
1,89,224471,2,1224.0,28216.0,80840.0,88.0,89.0,,,,,,
2,126,1783257,2,203682.0,203683.0,112.0,,,126.0,,,,,
3,154,131567,2,203691.0,203692.0,136.0,146.0,154.0,137.0,,,,,
4,292,131567,2,1224.0,28216.0,80840.0,32008.0,292.0,119060.0,87882.0,,,,
5,304,131567,2,1224.0,28216.0,80840.0,335058.0,304.0,80864.0,,,,,
6,337,131567,2,1224.0,28216.0,80840.0,32008.0,337.0,119060.0,,,,,
7,356,131567,2,1224.0,28211.0,356.0,,,,,,,,
8,403,131567,2,1224.0,1236.0,135618.0,,,403.0,,,,,
9,471,131567,2,1224.0,1236.0,72274.0,469.0,471.0,468.0,909768.0,,,,


In [244]:
###########################
# superkingdom: read counts
# Sum the reads of all taxids that share the same superkingdom ancestor.

superkingdom = pd.DataFrame(
    {
        'taxid': taxid,
        'superkingdom': id_lineage['superkingdom'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'superkingdom', 'read_count'],
)

count_superkingdom = superkingdom.groupby('superkingdom')['read_count'].sum().to_frame()

# Scientific name of each superkingdom taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_superkingdom.index.values
]
count_superkingdom.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_superkingdom = count_superkingdom.sort_values('read_count', ascending=False)
count_superkingdom['percentage'] = count_superkingdom['read_count'] / total_count * 100

count_superkingdom.to_csv('./result/count_superkingdom_noSelect.csv', encoding='utf-8')
count_superkingdom


Unnamed: 0_level_0,name,read_count,percentage
superkingdom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Bacteria,736859,99.999593


In [245]:
#####################
# phylum: read counts
# Sum the reads of all taxids that share the same phylum ancestor.

phylum = pd.DataFrame(
    {
        'taxid': taxid,
        'phylum': id_lineage['phylum'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'phylum', 'read_count'],
)

count_phylum = phylum.groupby('phylum')['read_count'].sum().to_frame()

# Scientific name of each phylum taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_phylum.index.values
]
count_phylum.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_phylum = count_phylum.sort_values('read_count', ascending=False)
count_phylum['percentage'] = count_phylum['read_count'] / total_count * 100

count_phylum.to_csv('./result/count_phylum_noSelect.csv', encoding='utf-8')
count_phylum


Unnamed: 0_level_0,name,read_count,percentage
phylum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
976,Bacteroidetes,391769,53.167214
1239,Firmicutes,136609,18.539292
1224,Proteobacteria,93084,12.632487
74201,Verrucomicrobia,54557,7.403964
32066,Fusobacteria,54427,7.386322
201174,Actinobacteria,24,0.003257
1117,Cyanobacteria,2,0.000271
203691,Spirochaetes,2,0.000271
544448,Tenericutes,2,0.000271
203682,Planctomycetes,1,0.000136


In [246]:
####################
# class: read counts
# Sum the reads of all taxids that share the same class ancestor.

class_ = pd.DataFrame(
    {
        'taxid': taxid,
        'class': id_lineage['class'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'class', 'read_count'],
)

count_class = class_.groupby('class')['read_count'].sum().to_frame()

# Scientific name of each class taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_class.index.values
]
count_class.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_class = count_class.sort_values('read_count', ascending=False)
count_class['percentage'] = count_class['read_count'] / total_count * 100

count_class.to_csv('./result/count_class_noSelect.csv', encoding='utf-8')
count_class


Unnamed: 0_level_0,name,read_count,percentage
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200643,Bacteroidia,391545,53.136815
1236,Gammaproteobacteria,78028,10.589228
909932,Negativicutes,72178,9.795321
186801,Clostridia,61881,8.397909
203494,Verrucomicrobiae,54535,7.400979
203490,Fusobacteriia,54427,7.386322
28216,Betaproteobacteria,14355,1.948126
91061,Bacilli,1040,0.141139
28221,Deltaproteobacteria,587,0.079662
526524,Erysipelotrichia,228,0.030942


In [247]:
####################
# order: read counts
# Sum the reads of all taxids that share the same order ancestor.

order = pd.DataFrame(
    {
        'taxid': taxid,
        'order': id_lineage['order'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'order', 'read_count'],
)

count_order = order.groupby('order')['read_count'].sum().to_frame()

# Scientific name of each order taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_order.index.values
]
count_order.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_order = count_order.sort_values('read_count', ascending=False)
count_order['percentage'] = count_order['read_count'] / total_count * 100

count_order.to_csv('./result/count_order_noSelect.csv', encoding='utf-8')
count_order


Unnamed: 0_level_0,name,read_count,percentage
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
171549,Bacteroidales,391461,53.125416
91347,Enterobacterales,75774,10.283337
186802,Clostridiales,61873,8.396823
48461,Verrucomicrobiales,54535,7.400979
203491,Fusobacteriales,54427,7.386322
1843488,Acidaminococcales,46896,6.364285
80840,Burkholderiales,14064,1.908634
186826,Lactobacillales,990,0.134354
213115,Desulfovibrionales,580,0.078712
135624,Aeromonadales,230,0.031213


In [248]:
#####################
# family: read counts
# Sum the reads of all taxids that share the same family ancestor.

family = pd.DataFrame(
    {
        'taxid': taxid,
        'family': id_lineage['family'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'family', 'read_count'],
)

count_family = family.groupby('family')['read_count'].sum().to_frame()

# Scientific name of each family taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_family.index.values
]
count_family.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_family = count_family.sort_values('read_count', ascending=False)
count_family['percentage'] = count_family['read_count'] / total_count * 100

count_family.to_csv('./result/count_family_noSelect.csv', encoding='utf-8')
count_family

Unnamed: 0_level_0,name,read_count,percentage
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
815,Bacteroidaceae,280023,38.002095
171552,Prevotellaceae,82791,11.235618
203492,Fusobacteriaceae,54421,7.385508
1647988,Akkermansiaceae,53798,7.300960
543,Enterobacteriaceae,53722,7.290646
186803,Lachnospiraceae,49422,6.707090
909930,Acidaminococcaceae,46896,6.364285
2005525,Tannerellaceae,18954,2.572259
995019,Sutterellaceae,11953,1.622149
541000,Ruminococcaceae,5836,0.792007


In [249]:
####################
# genus: read counts
# Sum the reads of all taxids that share the same genus ancestor.

genus = pd.DataFrame(
    {
        'taxid': taxid,
        'genus': id_lineage['genus'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'genus', 'read_count'],
)

count_genus = genus.groupby('genus')['read_count'].sum().to_frame()

# Scientific name of each genus taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_genus.index.values
]
count_genus.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_genus = count_genus.sort_values('read_count', ascending=False)
count_genus['percentage'] = count_genus['read_count'] / total_count * 100

count_genus.to_csv('./result/count_genus_noSelect.csv', encoding='utf-8')
count_genus



Unnamed: 0_level_0,name,read_count,percentage
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
816,Bacteroides,279409,37.918769
838,Prevotella,72074,9.781207
848,Fusobacterium,54050,7.335159
239934,Akkermansia,53798,7.300960
1506553,Lachnoclostridium,32599,4.424031
375288,Parabacteroides,18682,2.535346
570,Klebsiella,13141,1.783373
40544,Sutterella,9562,1.297665
2039240,Anaerotignum,7156,0.971145
544,Citrobacter,4227,0.573649


In [250]:
######################
# species: read counts
# Sum the reads of all taxids that share the same species ancestor.

species = pd.DataFrame(
    {
        'taxid': taxid,
        'species': id_lineage['species'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'species', 'read_count'],
)

count_species = species.groupby('species')['read_count'].sum().to_frame()

# Scientific name of each species taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_species.index.values
]
count_species.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_species = count_species.sort_values('read_count', ascending=False)
count_species['percentage'] = count_species['read_count'] / total_count * 100

count_species.to_csv('./result/count_species_noSelect.csv', encoding='utf-8')
count_species



Unnamed: 0_level_0,name,read_count,percentage
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
310297,Bacteroides plebeius,105613,14.332806
363265,Prevotella stercorea,71400,9.689738
821,Bacteroides vulgatus,55928,7.590024
46506,Bacteroides stercoris,55393,7.517418
239935,Akkermansia muciniphila,53781,7.298653
856,Fusobacterium varium,45323,6.150812
47678,Bacteroides caccae,25110,3.407694
208479,[Clostridium] bolteae,23250,3.155272
573,Klebsiella pneumoniae,11649,1.580893
823,Parabacteroides distasonis,11397,1.546694


### Accuracy >= 90%

In [251]:
# Reload the per-taxid read counts written to disk by the accuracy >= 90
# cell in Part 1 (columns: taxid, read_count).
data = pd.read_csv('./accuracy_read/taxid_list90.csv', encoding = 'utf-8')

data.head()

Unnamed: 0,taxid,read_count
0,2,410
1,356,1
2,403,7
3,543,8596
4,545,40


In [252]:
# Parallel lists: taxid[k] was assigned read_count[k] reads.
taxid = data['taxid'].tolist()
read_count = data['read_count'].tolist()

# Denominator for the percentage columns: all reads with accuracy >= 90.
total_count = len(data_accuracy90)

###############
# taxid -> scientific name, for every taxid in the list
name = ncbi.get_taxid_translator(taxid)

###############
# taxid -> rank, for the first taxid only
rank = ncbi.get_rank([taxid[0]])

In [253]:
##########################################################
# Seed the lineage table with one row per classified taxid;
# the per-rank columns are filled in afterwards.

id_lineage = pd.DataFrame({'taxid': taxid})
id_lineage.head()

Unnamed: 0,taxid
0,2
1,356
2,403
3,543
4,545


In [254]:
###############################################
# Fill id_lineage with one column per taxonomic rank: for every classified
# taxid, store (as a string) the ancestor taxid found at each rank of its
# NCBI lineage.

for i in range(len(taxid)):
    lineage = ncbi.get_lineage(taxid[i])

    # One rank query for the whole lineage instead of one query per node.
    ranks = ncbi.get_rank(lineage)

    for node in lineage:
        # Nodes sharing a rank label write to the same column, so the last
        # one visited overwrites the earlier ones.
        id_lineage.at[i, ranks[node]] = str(node)

In [255]:
# Preview the first 10 rows of the per-rank lineage table.
id_lineage.head(10)

Unnamed: 0,taxid,no rank,superkingdom,phylum,class,order,family,genus,species,species group,subspecies,subphylum
0,2,131567,2,,,,,,,,,
1,356,131567,2,1224.0,28211.0,356.0,,,,,,
2,403,131567,2,1224.0,1236.0,135618.0,403.0,,,,,
3,543,131567,2,1224.0,1236.0,91347.0,543.0,,,,,
4,545,131567,2,1224.0,1236.0,91347.0,543.0,544.0,545.0,,,
5,546,131567,2,1224.0,1236.0,91347.0,543.0,544.0,546.0,1344959.0,,
6,548,131567,2,1224.0,1236.0,91347.0,543.0,570.0,548.0,,,
7,549,131567,2,1224.0,1236.0,91347.0,1903409.0,53335.0,549.0,1654067.0,,
8,550,131567,2,1224.0,1236.0,91347.0,543.0,547.0,550.0,354276.0,,
9,552,131567,2,1224.0,1236.0,91347.0,1903409.0,551.0,552.0,,,


In [256]:
###########################
# superkingdom: read counts
# Sum the reads of all taxids that share the same superkingdom ancestor.

superkingdom = pd.DataFrame(
    {
        'taxid': taxid,
        'superkingdom': id_lineage['superkingdom'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'superkingdom', 'read_count'],
)

count_superkingdom = superkingdom.groupby('superkingdom')['read_count'].sum().to_frame()

# Scientific name of each superkingdom taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_superkingdom.index.values
]
count_superkingdom.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_superkingdom = count_superkingdom.sort_values('read_count', ascending=False)
count_superkingdom['percentage'] = count_superkingdom['read_count'] / total_count * 100

count_superkingdom.to_csv('./result/count_superkingdom_accuracy90.csv', encoding='utf-8')
count_superkingdom


Unnamed: 0_level_0,name,read_count,percentage
superkingdom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Bacteria,195245,100.0


In [257]:
#####################
# phylum: read counts
# Sum the reads of all taxids that share the same phylum ancestor.

phylum = pd.DataFrame(
    {
        'taxid': taxid,
        'phylum': id_lineage['phylum'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'phylum', 'read_count'],
)

count_phylum = phylum.groupby('phylum')['read_count'].sum().to_frame()

# Scientific name of each phylum taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_phylum.index.values
]
count_phylum.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_phylum = count_phylum.sort_values('read_count', ascending=False)
count_phylum['percentage'] = count_phylum['read_count'] / total_count * 100

count_phylum.to_csv('./result/count_phylum_accuracy90.csv', encoding='utf-8')
count_phylum


Unnamed: 0_level_0,name,read_count,percentage
phylum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
976,Bacteroidetes,101449,51.959845
1239,Firmicutes,34207,17.520039
1224,Proteobacteria,23117,11.839996
32066,Fusobacteria,18702,9.578734
74201,Verrucomicrobia,17333,8.877564
201174,Actinobacteria,11,0.005634
544448,Tenericutes,1,0.000512
65842,Fibrobacteres,1,0.000512


In [258]:
####################
# class: read counts
# Sum the reads of all taxids that share the same class ancestor.

class_ = pd.DataFrame(
    {
        'taxid': taxid,
        'class': id_lineage['class'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'class', 'read_count'],
)

count_class = class_.groupby('class')['read_count'].sum().to_frame()

# Scientific name of each class taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_class.index.values
]
count_class.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_class = count_class.sort_values('read_count', ascending=False)
count_class['percentage'] = count_class['read_count'] / total_count * 100

count_class.to_csv('./result/count_class_accuracy90.csv', encoding='utf-8')
count_class


Unnamed: 0_level_0,name,read_count,percentage
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200643,Bacteroidia,101407,51.938334
909932,Negativicutes,20500,10.499629
1236,Gammaproteobacteria,19515,9.995134
203490,Fusobacteriia,18702,9.578734
203494,Verrucomicrobiae,17330,8.876028
186801,Clostridia,13207,6.764322
28216,Betaproteobacteria,3582,1.834618
91061,Bacilli,291,0.149044
526524,Erysipelotrichia,59,0.030218
117747,Sphingobacteriia,11,0.005634


In [259]:
####################
# order: read counts
# Sum the reads of all taxids that share the same order ancestor.

order = pd.DataFrame(
    {
        'taxid': taxid,
        'order': id_lineage['order'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'order', 'read_count'],
)

count_order = order.groupby('order')['read_count'].sum().to_frame()

# Scientific name of each order taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_order.index.values
]
count_order.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_order = count_order.sort_values('read_count', ascending=False)
count_order['percentage'] = count_order['read_count'] / total_count * 100

count_order.to_csv('./result/count_order_accuracy90.csv', encoding='utf-8')
count_order

Unnamed: 0_level_0,name,read_count,percentage
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
171549,Bacteroidales,101387,51.92809
91347,Enterobacterales,19306,9.888089
203491,Fusobacteriales,18702,9.578734
48461,Verrucomicrobiales,17330,8.876028
1843488,Acidaminococcales,13268,6.795565
186802,Clostridiales,13205,6.763297
80840,Burkholderiales,3573,1.830008
186826,Lactobacillales,281,0.143922
526525,Erysipelotrichales,59,0.030218
135624,Aeromonadales,49,0.025097


In [260]:
#####################
# family: read counts
# Sum the reads of all taxids that share the same family ancestor, attach
# the family's scientific name and its share of total_count, then save and
# display the table.

family = pd.DataFrame(list(zip(taxid, id_lineage['family'], read_count)),
                      columns=['taxid', 'family', 'read_count'])

# One row per family taxid with the summed read count.
count_family = pd.DataFrame(family.groupby('family')['read_count'].sum())

# Scientific name of each family taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_family.index.values
]
count_family.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_family = count_family.sort_values(by=['read_count'], ascending=False)
count_family['percentage'] = (count_family['read_count'] / total_count) * 100

# File name follows the count_<rank>_accuracy90.csv pattern used for the
# other ranks (previously misspelled "accracy").
count_family.to_csv('./result/count_family_accuracy90.csv', encoding='utf-8')
count_family

Unnamed: 0_level_0,name,read_count,percentage
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
815,Bacteroidaceae,72176,36.966888
171552,Prevotellaceae,23511,12.041794
203492,Fusobacteriaceae,18700,9.577710
1647988,Akkermansiaceae,17147,8.782299
543,Enterobacteriaceae,13486,6.907219
909930,Acidaminococcaceae,13268,6.795565
186803,Lachnospiraceae,11220,5.746626
2005525,Tannerellaceae,4620,2.366258
995019,Sutterellaceae,3339,1.710159
541000,Ruminococcaceae,1185,0.606930


In [261]:
####################
# genus: read counts
# Sum the reads of all taxids that share the same genus ancestor.

genus = pd.DataFrame(
    {
        'taxid': taxid,
        'genus': id_lineage['genus'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'genus', 'read_count'],
)

count_genus = genus.groupby('genus')['read_count'].sum().to_frame()

# Scientific name of each genus taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_genus.index.values
]
count_genus.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_genus = count_genus.sort_values('read_count', ascending=False)
count_genus['percentage'] = count_genus['read_count'] / total_count * 100

count_genus.to_csv('./result/count_genus_accuracy90.csv', encoding='utf-8')
count_genus

Unnamed: 0_level_0,name,read_count,percentage
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
816,Bacteroides,72101,36.928474
838,Prevotella,21442,10.982099
848,Fusobacterium,18609,9.531102
239934,Akkermansia,17147,8.782299
1506553,Lachnoclostridium,7465,3.823401
375288,Parabacteroides,4468,2.288407
570,Klebsiella,3145,1.610797
40544,Sutterella,3036,1.554969
2039240,Anaerotignum,2075,1.062767
544,Citrobacter,882,0.451740


In [262]:
######################
# species: read counts
# Sum the reads of all taxids that share the same species ancestor.

species = pd.DataFrame(
    {
        'taxid': taxid,
        'species': id_lineage['species'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'species', 'read_count'],
)

count_species = species.groupby('species')['read_count'].sum().to_frame()

# Scientific name of each species taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_species.index.values
]
count_species.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_species = count_species.sort_values('read_count', ascending=False)
count_species['percentage'] = count_species['read_count'] / total_count * 100

count_species.to_csv('./result/count_species_accuracy90.csv', encoding='utf-8')
count_species

Unnamed: 0_level_0,name,read_count,percentage
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
310297,Bacteroides plebeius,25324,12.970371
363265,Prevotella stercorea,21267,10.892468
821,Bacteroides vulgatus,18203,9.323158
239935,Akkermansia muciniphila,17144,8.780763
856,Fusobacterium varium,16212,8.303414
46506,Bacteroides stercoris,11683,5.983764
47678,Bacteroides caccae,7274,3.725576
208479,[Clostridium] bolteae,5944,3.044380
40545,Sutterella wadsworthensis,2995,1.533970
573,Klebsiella pneumoniae,2840,1.454583


### Accuracy >= 85%

In [263]:
# Reload the per-taxid read counts written to disk by the accuracy >= 85
# cell in Part 1 (columns: taxid, read_count).
data = pd.read_csv('./accuracy_read/taxid_list85.csv', encoding = 'utf-8')

data.head()

Unnamed: 0,taxid,read_count
0,2,2214
1,356,3
2,403,9
3,471,1
4,543,25821


In [264]:
# Parallel lists: taxid[k] was assigned read_count[k] reads.
taxid = data['taxid'].tolist()
read_count = data['read_count'].tolist()

# Denominator for the percentage columns: all reads with accuracy >= 85.
total_count = len(data_accuracy85)

###############
# taxid -> scientific name, for every taxid in the list
name = ncbi.get_taxid_translator(taxid)

###############
# taxid -> rank, for the first taxid only
rank = ncbi.get_rank([taxid[0]])

In [265]:
##########################################################
# Seed the lineage table with one row per classified taxid;
# the per-rank columns are filled in afterwards.

id_lineage = pd.DataFrame({'taxid': taxid})
id_lineage.head()

Unnamed: 0,taxid
0,2
1,356
2,403
3,471
4,543


In [266]:
###############################################
# Fill id_lineage with one column per taxonomic rank: for every classified
# taxid, store (as a string) the ancestor taxid found at each rank of its
# NCBI lineage.

for i in range(len(taxid)):
    lineage = ncbi.get_lineage(taxid[i])

    # One rank query for the whole lineage instead of one query per node.
    ranks = ncbi.get_rank(lineage)

    for node in lineage:
        # Nodes sharing a rank label write to the same column, so the last
        # one visited overwrites the earlier ones.
        id_lineage.at[i, ranks[node]] = str(node)

In [267]:
# Preview the first 10 rows of the per-rank lineage table.
id_lineage.head(10)

Unnamed: 0,taxid,no rank,superkingdom,phylum,class,order,family,genus,species group,species,subspecies,subphylum
0,2,131567,2,,,,,,,,,
1,356,131567,2,1224.0,28211.0,356.0,,,,,,
2,403,131567,2,1224.0,1236.0,135618.0,403.0,,,,,
3,471,131567,2,1224.0,1236.0,72274.0,468.0,469.0,909768.0,471.0,,
4,543,131567,2,1224.0,1236.0,91347.0,543.0,,,,,
5,545,131567,2,1224.0,1236.0,91347.0,543.0,544.0,,545.0,,
6,546,131567,2,1224.0,1236.0,91347.0,543.0,544.0,1344959.0,546.0,,
7,548,131567,2,1224.0,1236.0,91347.0,543.0,570.0,,548.0,,
8,549,131567,2,1224.0,1236.0,91347.0,1903409.0,53335.0,1654067.0,549.0,,
9,550,131567,2,1224.0,1236.0,91347.0,543.0,547.0,354276.0,550.0,,


In [268]:
###########################
# superkingdom: read counts
# Sum the reads of all taxids that share the same superkingdom ancestor.

superkingdom = pd.DataFrame(
    {
        'taxid': taxid,
        'superkingdom': id_lineage['superkingdom'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'superkingdom', 'read_count'],
)

count_superkingdom = superkingdom.groupby('superkingdom')['read_count'].sum().to_frame()

# Scientific name of each superkingdom taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_superkingdom.index.values
]
count_superkingdom.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_superkingdom = count_superkingdom.sort_values('read_count', ascending=False)
count_superkingdom['percentage'] = count_superkingdom['read_count'] / total_count * 100

count_superkingdom.to_csv('./result/count_superkingdom_accuracy85.csv', encoding='utf-8')
count_superkingdom


Unnamed: 0_level_0,name,read_count,percentage
superkingdom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Bacteria,583048,99.999828


In [269]:
#####################
# phylum: read counts
# Sum the reads of all taxids that share the same phylum ancestor.

phylum = pd.DataFrame(
    {
        'taxid': taxid,
        'phylum': id_lineage['phylum'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'phylum', 'read_count'],
)

count_phylum = phylum.groupby('phylum')['read_count'].sum().to_frame()

# Scientific name of each phylum taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_phylum.index.values
]
count_phylum.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_phylum = count_phylum.sort_values('read_count', ascending=False)
count_phylum['percentage'] = count_phylum['read_count'] / total_count * 100

count_phylum.to_csv('./result/count_phylum_accuracy85.csv', encoding='utf-8')
count_phylum

Unnamed: 0_level_0,name,read_count,percentage
phylum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
976,Bacteroidetes,310901,53.323306
1239,Firmicutes,106566,18.277366
1224,Proteobacteria,72089,12.364141
74201,Verrucomicrobia,45623,7.8249
32066,Fusobacteria,45596,7.820269
201174,Actinobacteria,17,0.002916
203691,Spirochaetes,1,0.000172
544448,Tenericutes,1,0.000172
65842,Fibrobacteres,1,0.000172


In [270]:
####################
# class: read counts
# Sum the reads of all taxids that share the same class ancestor.

class_ = pd.DataFrame(
    {
        'taxid': taxid,
        'class': id_lineage['class'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'class', 'read_count'],
)

count_class = class_.groupby('class')['read_count'].sum().to_frame()

# Scientific name of each class taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_class.index.values
]
count_class.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_class = count_class.sort_values('read_count', ascending=False)
count_class['percentage'] = count_class['read_count'] / total_count * 100

count_class.to_csv('./result/count_class_accuracy85.csv', encoding='utf-8')
count_class

Unnamed: 0_level_0,name,read_count,percentage
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200643,Bacteroidia,310783,53.303067
1236,Gammaproteobacteria,61395,10.52999
909932,Negativicutes,59285,10.168099
186801,Clostridia,45710,7.839821
203494,Verrucomicrobiae,45611,7.822842
203490,Fusobacteriia,45596,7.820269
28216,Betaproteobacteria,10549,1.809282
91061,Bacilli,815,0.139782
526524,Erysipelotrichia,181,0.031044
28221,Deltaproteobacteria,94,0.016122


In [271]:
####################
# order: read counts
# Sum the reads of all taxids that share the same order ancestor.

order = pd.DataFrame(
    {
        'taxid': taxid,
        'order': id_lineage['order'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'order', 'read_count'],
)

count_order = order.groupby('order')['read_count'].sum().to_frame()

# Scientific name of each order taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_order.index.values
]
count_order.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_order = count_order.sort_values('read_count', ascending=False)
count_order['percentage'] = count_order['read_count'] / total_count * 100

count_order.to_csv('./result/count_order_accuracy85.csv', encoding='utf-8')
count_order

Unnamed: 0_level_0,name,read_count,percentage
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
171549,Bacteroidales,310734,53.294663
91347,Enterobacterales,59992,10.289358
186802,Clostridiales,45706,7.839135
48461,Verrucomicrobiales,45611,7.822842
203491,Fusobacteriales,45596,7.820269
1843488,Acidaminococcales,38227,6.556396
80840,Burkholderiales,10446,1.791616
186826,Lactobacillales,786,0.134809
526525,Erysipelotrichales,181,0.031044
135624,Aeromonadales,175,0.030015


In [272]:
#####################
# family: read counts
# Sum the reads of all taxids that share the same family ancestor, attach
# the family's scientific name and its share of total_count, then save and
# display the table.

family = pd.DataFrame(list(zip(taxid, id_lineage['family'], read_count)),
                      columns=['taxid', 'family', 'read_count'])

# One row per family taxid with the summed read count.
count_family = pd.DataFrame(family.groupby('family')['read_count'].sum())

# Scientific name of each family taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_family.index.values
]
count_family.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_family = count_family.sort_values(by=['read_count'], ascending=False)
count_family['percentage'] = (count_family['read_count'] / total_count) * 100

# File name follows the count_<rank>_accuracy85.csv pattern used for the
# other ranks (previously misspelled "accracy").
count_family.to_csv('./result/count_family_accuracy85.csv', encoding='utf-8')
count_family

Unnamed: 0_level_0,name,read_count,percentage
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
815,Bacteroidaceae,222907,38.231264
171552,Prevotellaceae,67644,11.601769
203492,Fusobacteriaceae,45593,7.819754
1647988,Akkermansiaceae,45104,7.735885
543,Enterobacteriaceae,41453,7.109694
909930,Acidaminococcaceae,38227,6.556396
186803,Lachnospiraceae,37657,6.458634
2005525,Tannerellaceae,14975,2.568395
995019,Sutterellaceae,9364,1.606040
541000,Ruminococcaceae,3935,0.674900


In [273]:
####################
# genus: read counts
# Sum the reads of all taxids that share the same genus ancestor.

genus = pd.DataFrame(
    {
        'taxid': taxid,
        'genus': id_lineage['genus'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'genus', 'read_count'],
)

count_genus = genus.groupby('genus')['read_count'].sum().to_frame()

# Scientific name of each genus taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_genus.index.values
]
count_genus.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_genus = count_genus.sort_values('read_count', ascending=False)
count_genus['percentage'] = count_genus['read_count'] / total_count * 100

count_genus.to_csv('./result/count_genus_accuracy85.csv', encoding='utf-8')
count_genus

Unnamed: 0_level_0,name,read_count,percentage
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
816,Bacteroides,222529,38.166432
838,Prevotella,59566,10.216294
848,Fusobacterium,45294,7.768472
239934,Akkermansia,45104,7.735885
1506553,Lachnoclostridium,24981,4.284546
375288,Parabacteroides,14733,2.526889
570,Klebsiella,9818,1.683906
40544,Sutterella,7850,1.346371
2039240,Anaerotignum,5873,1.007291
544,Citrobacter,3052,0.523455


In [274]:
######################
# species: read counts
# Sum the reads of all taxids that share the same species ancestor.

species = pd.DataFrame(
    {
        'taxid': taxid,
        'species': id_lineage['species'].tolist(),
        'read_count': read_count,
    },
    columns=['taxid', 'species', 'read_count'],
)

count_species = species.groupby('species')['read_count'].sum().to_frame()

# Scientific name of each species taxid, one lookup per taxid.
name = [
    list(ncbi.get_taxid_translator([tid]).values())[0]
    for tid in count_species.index.values
]
count_species.insert(0, 'name', name)

# Most abundant first; percentage is relative to total_count.
count_species = count_species.sort_values('read_count', ascending=False)
count_species['percentage'] = count_species['read_count'] / total_count * 100

count_species.to_csv('./result/count_species_accuracy85.csv', encoding='utf-8')
count_species

Unnamed: 0_level_0,name,read_count,percentage
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
310297,Bacteroides plebeius,84140,14.431034
363265,Prevotella stercorea,59285,10.168099
821,Bacteroides vulgatus,46209,7.925406
239935,Akkermansia muciniphila,45099,7.735027
46506,Bacteroides stercoris,43296,7.425791
856,Fusobacterium varium,37593,6.447657
47678,Bacteroides caccae,20441,3.505880
208479,[Clostridium] bolteae,18389,3.153937
823,Parabacteroides distasonis,8855,1.518740
573,Klebsiella pneumoniae,8742,1.499359


### Accuracy >= 80%

In [275]:
# Reload the per-taxid read counts written to disk by the accuracy >= 80
# cell in Part 1 (columns: taxid, read_count).
data = pd.read_csv('./accuracy_read/taxid_list80.csv', encoding = 'utf-8')

data.head()


Unnamed: 0,taxid,read_count
0,2,5756
1,89,1
2,126,1
3,154,1
4,292,1


In [276]:
# Taxids and their read counts as plain lists, in the row order of `data`.
taxid, read_count = (list(data[column]) for column in ('taxid', 'read_count'))

# Denominator for the 'percentage' columns computed further down.
# data_accuracy80 is defined outside this cell (presumably the reads with
# accuracy >= 80 -- confirm against the cell that creates it).
total_count = len(data_accuracy80)

###############
# taxid to name
# Maps every taxid in the list to its name in a single call.
name = ncbi.get_taxid_translator(taxid)

###############
# taxid to rank
# Only the first taxid is looked up here.
rank = ncbi.get_rank([taxid[0]])

In [277]:
##########################################################
# Create id_lineage dataframe to match the id with lineage

# Start with a single 'taxid' column holding one row per taxid; further
# columns are added to this frame by the cell that fills in the lineage.
id_lineage = pd.DataFrame({'taxid': taxid})
id_lineage.head()

Unnamed: 0,taxid
0,2
1,89
2,126
3,154
4,292


In [278]:
###############################################
# the lineage of id add to id_lineage dataframe
#
# For every taxid (row i of id_lineage) fetch its lineage and store each
# lineage node's id, as a string, in the column named after that node's rank.
# Columns are created on first use. If several nodes of one lineage share the
# same rank label, later nodes overwrite earlier ones in that column.

for i in range(len(taxid)):
    lineage = ncbi.get_lineage(taxid[i])

    # Resolve the ranks of the whole lineage with one query instead of one
    # query per node; get_rank returns a {taxid: rank} mapping.
    node_rank = ncbi.get_rank(lineage)

    for node in lineage:
        id_lineage.at[i, node_rank[node]] = str(node)



In [279]:
##########################
# superkingdom: Read count

# One row per classified taxid: the taxid itself, the id stored in the
# 'superkingdom' column of id_lineage for it, and its read count.
superkingdom = pd.DataFrame({
    'taxid': taxid,
    'superkingdom': id_lineage['superkingdom'].tolist(),
    'read_count': read_count,
})

# Sum the reads per superkingdom id. Rows whose 'superkingdom' entry is
# missing (NaN) are dropped by groupby's default behaviour.
count_superkingdom = superkingdom.groupby('superkingdom')['read_count'].sum().to_frame()

# Translate every superkingdom id in the index to its name, one lookup per
# id, and place the names in the first column.
name = [
    list(ncbi.get_taxid_translator([superkingdom_id]).values())[0]
    for superkingdom_id in count_superkingdom.index.values
]
count_superkingdom.insert(0, 'name', name)

# Most abundant superkingdom first.
count_superkingdom = count_superkingdom.sort_values('read_count', ascending=False)

# read_count expressed as a percentage of total_count.
count_superkingdom['percentage'] = count_superkingdom['read_count'].div(total_count).mul(100)
count_superkingdom.to_csv('./result/count_superkingdom_accuracy80.csv', encoding='utf-8')
count_superkingdom


Unnamed: 0_level_0,name,read_count,percentage
superkingdom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Bacteria,725905,99.999724


In [280]:
####################
# phylum: Read count

# One row per classified taxid: the taxid itself, the id stored in the
# 'phylum' column of id_lineage for it, and its read count.
phylum = pd.DataFrame({
    'taxid': taxid,
    'phylum': id_lineage['phylum'].tolist(),
    'read_count': read_count,
})

# Sum the reads per phylum id. Rows whose 'phylum' entry is missing (NaN)
# are dropped by groupby's default behaviour.
count_phylum = phylum.groupby('phylum')['read_count'].sum().to_frame()

# Translate every phylum id in the index to its name, one lookup per id,
# and place the names in the first column.
name = [
    list(ncbi.get_taxid_translator([phylum_id]).values())[0]
    for phylum_id in count_phylum.index.values
]
count_phylum.insert(0, 'name', name)

# Most abundant phylum first.
count_phylum = count_phylum.sort_values('read_count', ascending=False)

# read_count expressed as a percentage of total_count.
count_phylum['percentage'] = count_phylum['read_count'].div(total_count).mul(100)
count_phylum.to_csv('./result/count_phylum_accuracy80.csv', encoding='utf-8')
count_phylum

Unnamed: 0_level_0,name,read_count,percentage
phylum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
976,Bacteroidetes,385908,53.162182
1239,Firmicutes,134476,18.525238
1224,Proteobacteria,91476,12.601614
74201,Verrucomicrobia,54222,7.469552
32066,Fusobacteria,53965,7.434148
201174,Actinobacteria,23,0.003168
203691,Spirochaetes,2,0.000276
544448,Tenericutes,2,0.000276
1117,Cyanobacteria,1,0.000138
203682,Planctomycetes,1,0.000138


In [281]:
####################
# class: Read count

# One row per classified taxid: the taxid itself, the id stored in the
# 'class' column of id_lineage for it, and its read count.
class_ = pd.DataFrame({
    'taxid': taxid,
    'class': id_lineage['class'].tolist(),
    'read_count': read_count,
})

# Sum the reads per class id. Rows whose 'class' entry is missing (NaN)
# are dropped by groupby's default behaviour.
count_class = class_.groupby('class')['read_count'].sum().to_frame()

# Translate every class id in the index to its name, one lookup per id,
# and place the names in the first column.
name = [
    list(ncbi.get_taxid_translator([class_id]).values())[0]
    for class_id in count_class.index.values
]
count_class.insert(0, 'name', name)

# Most abundant class first.
count_class = count_class.sort_values('read_count', ascending=False)

# read_count expressed as a percentage of total_count.
count_class['percentage'] = count_class['read_count'].div(total_count).mul(100)
count_class.to_csv('./result/count_class_accuracy80.csv', encoding='utf-8')
count_class

Unnamed: 0_level_0,name,read_count,percentage
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200643,Bacteroidia,385712,53.135181
1236,Gammaproteobacteria,76890,10.592266
909932,Negativicutes,71550,9.856635
186801,Clostridia,60521,8.337294
203494,Verrucomicrobiae,54202,7.466797
203490,Fusobacteriia,53965,7.434148
28216,Betaproteobacteria,13991,1.927382
91061,Bacilli,1012,0.139412
28221,Deltaproteobacteria,493,0.067915
526524,Erysipelotrichia,221,0.030445


In [282]:
####################
# order: Read count

# One row per classified taxid: the taxid itself, the id stored in the
# 'order' column of id_lineage for it, and its read count.
order = pd.DataFrame({
    'taxid': taxid,
    'order': id_lineage['order'].tolist(),
    'read_count': read_count,
})

# Sum the reads per order id. Rows whose 'order' entry is missing (NaN)
# are dropped by groupby's default behaviour.
count_order = order.groupby('order')['read_count'].sum().to_frame()

# Translate every order id in the index to its name, one lookup per id,
# and place the names in the first column.
name = [
    list(ncbi.get_taxid_translator([order_id]).values())[0]
    for order_id in count_order.index.values
]
count_order.insert(0, 'name', name)

# Most abundant order first.
count_order = count_order.sort_values('read_count', ascending=False)

# read_count expressed as a percentage of total_count.
count_order['percentage'] = count_order['read_count'].div(total_count).mul(100)
count_order.to_csv('./result/count_order_accuracy80.csv', encoding='utf-8')
count_order

Unnamed: 0_level_0,name,read_count,percentage
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
171549,Bacteroidales,385634,53.124436
91347,Enterobacterales,74714,10.292503
186802,Clostridiales,60516,8.336605
48461,Verrucomicrobiales,54202,7.466797
203491,Fusobacteriales,53965,7.434148
1843488,Acidaminococcales,46522,6.40881
80840,Burkholderiales,13732,1.891702
186826,Lactobacillales,969,0.133488
213115,Desulfovibrionales,487,0.067088
135624,Aeromonadales,227,0.031271


In [283]:
####################
# family: Read count

# One row per classified taxid: the taxid itself, the id stored in the
# 'family' column of id_lineage for it, and its read count.
family = pd.DataFrame({
    'taxid': taxid,
    'family': id_lineage['family'].tolist(),
    'read_count': read_count,
})

# Sum the reads per family id. Rows whose 'family' entry is missing (NaN)
# are dropped by groupby's default behaviour.
count_family = family.groupby('family')['read_count'].sum().to_frame()

# Translate every family id in the index to its name, one lookup per id,
# and place the names in the first column.
name = [
    list(ncbi.get_taxid_translator([family_id]).values())[0]
    for family_id in count_family.index.values
]
count_family.insert(0, 'name', name)

# Most abundant family first.
count_family = count_family.sort_values('read_count', ascending=False)

# read_count expressed as a percentage of total_count.
count_family['percentage'] = count_family['read_count'].div(total_count).mul(100)

# The file name is spelled 'accuracy80' so it matches the other rank tables
# of this section (it was previously written as 'accracy80').
count_family.to_csv('./result/count_family_accuracy80.csv', encoding='utf-8')
count_family

Unnamed: 0_level_0,name,read_count,percentage
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
815,Bacteroidaceae,275857,38.001700
171552,Prevotellaceae,81905,11.283126
203492,Fusobacteriaceae,53960,7.433459
1647988,Akkermansiaceae,53481,7.367473
543,Enterobacteriaceae,52914,7.289364
186803,Lachnospiraceae,48635,6.699894
909930,Acidaminococcaceae,46522,6.408810
2005525,Tannerellaceae,18704,2.576639
995019,Sutterellaceae,11748,1.618389
541000,Ruminococcaceae,5574,0.767867


In [284]:
####################
# genus: Read count

# One row per classified taxid: the taxid itself, the id stored in the
# 'genus' column of id_lineage for it, and its read count.
genus = pd.DataFrame({
    'taxid': taxid,
    'genus': id_lineage['genus'].tolist(),
    'read_count': read_count,
})

# Sum the reads per genus id. Rows whose 'genus' entry is missing (NaN)
# are dropped by groupby's default behaviour.
count_genus = genus.groupby('genus')['read_count'].sum().to_frame()

# Translate every genus id in the index to its name, one lookup per id,
# and place the names in the first column.
name = [
    list(ncbi.get_taxid_translator([genus_id]).values())[0]
    for genus_id in count_genus.index.values
]
count_genus.insert(0, 'name', name)

# Most abundant genus first.
count_genus = count_genus.sort_values('read_count', ascending=False)

# read_count expressed as a percentage of total_count.
count_genus['percentage'] = count_genus['read_count'].div(total_count).mul(100)
count_genus.to_csv('./result/count_genus_accuracy80.csv', encoding='utf-8')
count_genus

Unnamed: 0_level_0,name,read_count,percentage
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
816,Bacteroides,275274,37.921387
838,Prevotella,71370,9.831838
848,Fusobacterium,53593,7.382902
239934,Akkermansia,53481,7.367473
1506553,Lachnoclostridium,32117,4.424396
375288,Parabacteroides,18436,2.539719
570,Klebsiella,12927,1.780807
40544,Sutterella,9455,1.302508
2039240,Anaerotignum,7093,0.977122
544,Citrobacter,4155,0.572387


In [285]:
####################
# species: Read count

# One row per classified taxid: the taxid itself, the id stored in the
# 'species' column of id_lineage for it, and its read count.
species = pd.DataFrame({
    'taxid': taxid,
    'species': id_lineage['species'].tolist(),
    'read_count': read_count,
})

# Sum the reads per species id. Rows whose 'species' entry is missing (NaN)
# are dropped by groupby's default behaviour.
count_species = species.groupby('species')['read_count'].sum().to_frame()

# Translate every species id in the index to its name, one lookup per id,
# and place the names in the first column.
name = [
    list(ncbi.get_taxid_translator([species_id]).values())[0]
    for species_id in count_species.index.values
]
count_species.insert(0, 'name', name)

# Most abundant species first.
count_species = count_species.sort_values('read_count', ascending=False)

# read_count expressed as a percentage of total_count.
count_species['percentage'] = count_species['read_count'].div(total_count).mul(100)
count_species.to_csv('./result/count_species_accuracy80.csv', encoding='utf-8')
count_species

Unnamed: 0_level_0,name,read_count,percentage
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
310297,Bacteroides plebeius,104286,14.366303
363265,Prevotella stercorea,70758,9.747530
821,Bacteroides vulgatus,55343,7.623979
46506,Bacteroides stercoris,54435,7.498894
239935,Akkermansia muciniphila,53470,7.365957
856,Fusobacterium varium,44944,6.191427
47678,Bacteroides caccae,24788,3.414762
208479,[Clostridium] bolteae,22948,3.161287
573,Klebsiella pneumoniae,11471,1.580230
823,Parabacteroides distasonis,11243,1.548821
