In [1]:
import pandas as pd
from collections import Counter
from Bio.Blast import NCBIXML
import pandas as pd

In [2]:
def xml2df(filepath, columns=None):
    """Convert BLAST XML output into a pandas DataFrame with one row per HSP.

    Parameters
    ----------
    filepath : str
        Path to a BLAST XML result file; it is opened in text mode and
        handed to ``NCBIXML.parse``.
    columns : sequence of str, optional
        Names of the fields to keep, in output order. Defaults to
        ``["category", "query_name", "sbjct_name", "evalue"]``.
        Every name must be one of: ``category``, ``query_name``,
        ``sbjct_name``, ``evalue``, ``query_length``, ``sbjct_length``,
        ``query_start``, ``query_end``, ``sbjct_start``, ``sbjct_end``.

    Returns
    -------
    pd.DataFrame
        One row per HSP with exactly the requested columns. When the file
        contains no HSP at all, an empty frame that still carries the
        requested columns is returned.

    Raises
    ------
    ValueError
        If ``columns`` contains a name that is not collected.
    """
    available = (
        "category", "query_name", "sbjct_name", "evalue",
        "query_length", "sbjct_length",
        "query_start", "query_end", "sbjct_start", "sbjct_end",
    )
    if columns is None:
        columns = ["category", "query_name", "sbjct_name", "evalue"]
    else:
        columns = list(columns)
        unknown = [name for name in columns if name not in available]
        if unknown:
            raise ValueError("unknown column(s): {}".format(unknown))

    rows = []
    with open(filepath) as f:
        for rec in NCBIXML.parse(f):
            query_name = rec.query
            # The category label is whatever precedes the first '|' in the query name.
            category = query_name.split('|')[0]
            for alignment in rec.alignments:
                for hsp in alignment.hsps:
                    rows.append({
                        "category": category,
                        "query_name": query_name,
                        "sbjct_name": alignment.title,
                        "evalue": hsp.expect,
                        "query_length": rec.query_length,
                        "sbjct_length": alignment.length,
                        "query_start": hsp.query_start,
                        "query_end": hsp.query_end,
                        "sbjct_start": hsp.sbjct_start,
                        "sbjct_end": hsp.sbjct_end,
                    })

    # Passing `columns` to the constructor (instead of indexing afterwards)
    # keeps the column layout even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [3]:
def categorize_result(result_df, thres_lst=None, name_lst=None):
    """Count hits per category below each e-value threshold.

    Parameters
    ----------
    result_df : pd.DataFrame
        Hit table with at least the columns ``"category"`` and ``"evalue"``.
    thres_lst : sequence of float, optional
        E-value cut-offs; a hit is counted for a cut-off when its e-value is
        strictly smaller. Defaults to ``[1, 0.1, 0.01, 1e-3, 1e-4, 1e-5]``.
    name_lst : sequence of str, optional
        Category labels to report, in column order. Defaults to
        ``F2..F6``, ``DRF2..DRF6``, ``DSF2..DSF6``. Categories present in
        ``result_df`` but absent from this list are dropped.

    Returns
    -------
    pd.DataFrame
        One row per threshold: a ``"thres"`` column followed by one integer
        count column per entry of ``name_lst``. A category without any hit
        is reported as 0.
    """
    if thres_lst is None:
        thres_lst = [1, 0.1, 0.01, 1e-3, 1e-4, 1e-5]
    if name_lst is None:
        name_lst = [prefix + str(i)
                    for prefix in ["F", "DRF", "DSF"]
                    for i in range(2, 7)]
    else:
        name_lst = list(name_lst)

    rows = []
    for thres in thres_lst:
        row = {"thres": thres}
        passing = result_df.loc[result_df["evalue"] < thres, "category"]
        #filtered_df=filtered_df.drop_duplicates(subset=['qseqid'])
        row.update(Counter(passing))
        rows.append(row)

    # reindex() creates any expected category column that never occurred, so
    # the integer cast below cannot fail with a KeyError on sparse input.
    count_df = (
        pd.DataFrame(rows)
        .reindex(columns=["thres"] + name_lst)
        .fillna(0)
        .astype({name: int for name in name_lst})
    )
    return count_df

In [4]:
# Parse the BLAST XML result of one assembly, then report the table
# dimensions and preview the first rows.
resultFilepath = "/data/mitsuki/out/altorf/evolve/result/GCF_000010665.1_ASM1066v1.xml"
result_df = xml2df(resultFilepath)

print(result_df.shape)
result_df.head()

(22441, 4)


Unnamed: 0,category,query_name,sbjct_name,evalue
0,F3,F3|lcl|NC_012796.1_cds_WP_012749587.1_2 [locus...,gnl|BL_ORD_ID|8119 WP_012611520.1 ribosomal la...,0.848814
1,F4,F4|lcl|NC_012796.1_cds_WP_012749587.1_2 [locus...,gnl|BL_ORD_ID|5809 WP_011367756.1 magnesium tr...,0.015497
2,F3,F3|lcl|NC_012796.1_cds_WP_012749588.1_3 [locus...,gnl|BL_ORD_ID|13754 WP_015850381.1 HAD family ...,0.077864
3,F3,F3|lcl|NC_012796.1_cds_WP_012749588.1_3 [locus...,gnl|BL_ORD_ID|27358 WP_047170687.1 hypothetica...,0.342326
4,F3,F3|lcl|NC_012796.1_cds_WP_012749588.1_3 [locus...,gnl|BL_ORD_ID|28120 WP_062251346.1 hypothetica...,0.415651


In [5]:
# Tabulate hit counts per category and e-value threshold for the table
# loaded above; head() shows only the first five threshold rows.
count_df = categorize_result(result_df)
count_df.head()

Unnamed: 0,thres,F2,F3,F4,F5,F6,DRF2,DRF3,DRF4,DRF5,DRF6,DSF2,DSF3,DSF4,DSF5,DSF6
0,1.0,1430,1561,2697,831,1318,1291,1183,2349,656,2500,1093,1343,2347,659,1183
1,0.1,284,269,418,131,213,269,181,269,83,1096,201,180,307,89,148
2,0.01,77,77,102,21,44,52,25,30,7,674,33,24,32,16,15
3,0.001,35,36,57,9,26,10,1,13,1,459,10,4,4,0,1
4,0.0001,21,27,43,6,25,2,0,8,0,311,2,0,0,0,0


In [8]:
# Read the genus catalog with pandas' default CSV settings and preview it.
# Its "ftp_basename" column is used further down to locate result files.
txtFilepath = "../../createdatabase/out/genus_872.txt"
catalog_df = pd.read_csv(txtFilepath)
catalog_df.head()

Unnamed: 0,taxid,kingdom,phylum,class,order,family,genus,species,count_real,count_sim,diff,ftp_basename,organism_name,genetic_code,G+C
0,525146,-1,1224,28221,213115,194924,872,876,0.851789,0.77774,0.074048,GCF_000022125.1_ASM2212v1,Desulfovibrio desulfuricans subsp. desulfurica...,11,0.580722
1,883,-1,1224,28221,213115,194924,872,881,1.619893,1.407806,0.212088,GCF_000021385.1_ASM2138v1,Desulfovibrio vulgaris str. 'Miyazaki F',11,0.67109
2,901,-1,1224,28221,213115,194924,872,901,1.387866,1.183206,0.204661,GCF_900116045.1_DESPIGER,Desulfovibrio piger,11,0.641799
3,526222,-1,1224,28221,213115,194924,872,880,0.594646,0.536652,0.057994,GCF_000023445.1_ASM2344v1,Desulfovibrio salexigens DSM 2638,11,0.470928
4,641491,-1,1224,28221,213115,194924,872,876,1.622397,1.369687,0.252709,GCF_000189295.2_ASM18929v2,Desulfovibrio desulfuricans ND132,11,0.652094


In [10]:
# Run the parse-and-count pipeline for every assembly in the catalog and
# print each summary table, separated by two blank lines.
for basename in catalog_df["ftp_basename"]:
    resultFilepath = "/data/mitsuki/out/altorf/evolve/result/{}.xml".format(basename)

    print()
    print()
    print("PROCESSING {}".format(resultFilepath))

    result_df = xml2df(resultFilepath)
    count_df = categorize_result(result_df)
    print(count_df)


PROCESSING /data/mitsuki/out/altorf/evolve/result/GCF_000022125.1_ASM2212v1.xml
     thres   F2   F3    F4   F5   F6  DRF2  DRF3  DRF4  DRF5  DRF6  DSF2  \
0  1.00000  637  684  1425  577  748   557   543  1165   445  1095   519   
1  0.10000  102   98   194   80  101    73    68   143    71   439    74   
2  0.01000   19   24    36   15   18     9    11    13     2   243     8   
3  0.00100   12   10    10    3    3     1     0     2     0   145     0   
4  0.00010   11   10     7    3    2     0     0     0     0    97     0   
5  0.00001    8    8     4    3    2     0     0     0     0    66     0   

   DSF3  DSF4  DSF5  DSF6  
0   526  1153   468   628  
1    54   145    54    83  
2     7    18     3    13  
3     1     2     0     0  
4     0     0     0     0  
5     0     0     0     0  

PROCESSING /data/mitsuki/out/altorf/evolve/result/GCF_000021385.1_ASM2138v1.xml
     thres   F2    F3    F4   F5    F6  DRF2  DRF3  DRF4  DRF5  DRF6  DSF2  \
0  1.00000  735  1173  1786  61