In [2]:
import pandas as pd
from collections import Counter
from Bio.Blast import NCBIXML
import pandas as pd

In [3]:
def xml2df(filepath):
    """Convert BLAST XML output into a pandas DataFrame with one row per HSP.

    Parameters
    ----------
    filepath : str
        Path to a BLAST XML result file that ``NCBIXML.parse`` can read.

    Returns
    -------
    pd.DataFrame
        Columns ``category``, ``query_name``, ``sbjct_name`` and ``evalue``,
        in that order. ``category`` is the part of the query name that
        precedes the first ``|``. The frame always has these four columns,
        even when the file contains no hits at all.
    """
    columns=["category","query_name","sbjct_name","evalue"]

    dct_lst=[]
    with open(filepath) as f:
        for rec in NCBIXML.parse(f):
            queryName=rec.query
            # The category label is encoded as the first '|'-separated token
            # of the query name.
            category=queryName.split('|')[0]
            for alignment in rec.alignments:
                sbjctName=alignment.title
                for hsp in alignment.hsps:
                    # Only the fields that are actually returned are collected.
                    dct_lst.append({
                        "category":category,
                        "query_name":queryName,
                        "sbjct_name":sbjctName,
                        "evalue":hsp.expect,
                    })

    # Passing `columns` fixes both the column order and the schema, so a file
    # without any HSP yields an empty frame instead of raising KeyError on
    # column selection.
    return pd.DataFrame(dct_lst, columns=columns)

In [4]:
def categorize_result(result_df, thres_lst=None):
    """Count BLAST hits per category under a series of e-value thresholds.

    Parameters
    ----------
    result_df : pd.DataFrame
        Hit table with at least the columns ``category`` and ``evalue``
        (the frame returned by ``xml2df``).
    thres_lst : sequence of float, optional
        E-value cutoffs; a hit is counted for a cutoff when its e-value is
        strictly smaller. Defaults to ``[1, 0.1, 0.01, 1e-3, 1e-4, 1e-5]``.

    Returns
    -------
    pd.DataFrame
        One row per threshold. Columns are ``thres`` followed by the fixed
        categories ``F2..F6``, ``DSF2..DSF6`` and ``DTSF2..DTSF6`` holding
        integer hit counts. A category without any hit is reported as 0;
        categories outside this fixed list are not reported.
    """
    if thres_lst is None:
        thres_lst=[1,0.1,0.01,1e-3,1e-4,1e-5]

    name_lst=[prefix+str(i) for prefix in ["F","DSF","DTSF"] for i in range(2,7)]

    dct_lst=[]
    for thres in thres_lst:
        filtered_df=result_df[result_df["evalue"]<thres]
        counts=Counter(filtered_df["category"])
        dct={"thres":thres}
        # Fill every expected category explicitly so a category that never
        # occurs in the input yields 0 rather than a missing column.
        dct.update((name,counts.get(name,0)) for name in name_lst)
        dct_lst.append(dct)

    count_df=pd.DataFrame(dct_lst, columns=["thres"]+name_lst)
    # Guarantees integer count columns even when `thres_lst` is empty.
    count_df=count_df.astype({name:int for name in name_lst})
    return count_df

In [5]:
# Parse one BLAST XML result file into a hits table and preview it.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable results directory.
resultFilepath="/data/mitsuki/out/altorf/evolve/result/GCF_000010665.1_ASM1066v1.xml"
result_df=xml2df(resultFilepath)
print(result_df.shape)  # (number of HSP rows, number of columns)
result_df.head()

(22020, 4)


Unnamed: 0,category,query_name,sbjct_name,evalue
0,F3,F3|lcl|NC_012796.1_cds_WP_012749588.1_3 [locus...,gnl|BL_ORD_ID|107033 WP_015850381.1 HAD family...,0.311454
1,F3,F3|lcl|NC_012796.1_cds_WP_012749588.1_3 [locus...,gnl|BL_ORD_ID|74140 WP_015850381.1 HAD family ...,0.311454
2,F3,F3|lcl|NC_012796.1_cds_WP_012749588.1_3 [locus...,gnl|BL_ORD_ID|41247 WP_015850381.1 HAD family ...,0.311454
3,F3,F3|lcl|NC_012796.1_cds_WP_012749588.1_3 [locus...,gnl|BL_ORD_ID|8354 WP_015850381.1 HAD family h...,0.311454
4,F3,F3|lcl|NC_012796.1_cds_WP_012749591.1_6 [locus...,gnl|BL_ORD_ID|102401 WP_012612578.1 MULTISPECI...,0.093111


In [24]:
# Dump every HSP of the record currently held in `rec`: the subject title
# first, then the aligned query / match / subject strings, followed by a
# blank line per subject. `rec` must have been set by another cell.
for alignment in rec.alignments:
    sbjctName, sbjctLength = alignment.title, alignment.length
    print(sbjctName)
    for hsp in alignment.hsps:
        print(hsp.query, hsp.match, hsp.sbjct, sep="\n")
    print()

gnl|BL_ORD_ID|107033 WP_015850381.1 HAD family hydrolase [Desulfovibrio salexigens]
HPGHGRRGERGEGSRKPGSPALPQDRHHDRRRRGRGAHPHPAFDLFLPALRKAHFKRLRLH-RPAAAVPRVQRRFRAFHQGRRGIVPVSHGPH
HP    R   G+G++K     LP+D+ +              +D F+P L K   + L  H RP A +P V   F A   G++ I  +S+ PH
HPVDAYRKFVGDGAKKLAWRVLPEDKQNQED-----------YDQFVPVLLKKFEEELNKHVRPYAGIPEVLADFIA--AGKK-IAILSNKPH

gnl|BL_ORD_ID|74140 WP_015850381.1 HAD family hydrolase [Desulfovibrio salexigens]
HPGHGRRGERGEGSRKPGSPALPQDRHHDRRRRGRGAHPHPAFDLFLPALRKAHFKRLRLH-RPAAAVPRVQRRFRAFHQGRRGIVPVSHGPH
HP    R   G+G++K     LP+D+ +              +D F+P L K   + L  H RP A +P V   F A   G++ I  +S+ PH
HPVDAYRKFVGDGAKKLAWRVLPEDKQNQED-----------YDQFVPVLLKKFEEELNKHVRPYAGIPEVLADFIA--AGKK-IAILSNKPH

gnl|BL_ORD_ID|41247 WP_015850381.1 HAD family hydrolase [Desulfovibrio salexigens]
HPGHGRRGERGEGSRKPGSPALPQDRHHDRRRRGRGAHPHPAFDLFLPALRKAHFKRLRLH-RPAAAVPRVQRRFRAFHQGRRGIVPVSHGPH
HP    R   G+G++K     LP+D+ +              +D F+P L K   + L  H RP A +P V   F A   G++ I  +S+

In [15]:
# Peek at the query names of the first 12 records in the BLAST XML file.
resultFilepath="/data/mitsuki/out/altorf/evolve/result/GCF_000010665.1_ASM1066v1.xml"
# NOTE(review): the file handle opened here is never closed. `blastRecords`
# is a generator that another cell keeps consuming with next(), so closing
# the handle in this cell would break that cell — confirm before changing.
blastRecords = NCBIXML.parse(open(resultFilepath))
for i,rec in enumerate(blastRecords):
    print(rec.query)
    if i>=11:  # i is 0-based, so this stops after the 12th record
        break

F2|lcl|NC_012796.1_cds_WP_012749586.1_1 [locus_tag=DMR_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_012749586.1] [location=126..1466] [gbkey=CDS]
F3|lcl|NC_012796.1_cds_WP_012749586.1_1 [locus_tag=DMR_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_012749586.1] [location=126..1466] [gbkey=CDS]
F4|lcl|NC_012796.1_cds_WP_012749586.1_1 [locus_tag=DMR_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_012749586.1] [location=126..1466] [gbkey=CDS]
F5|lcl|NC_012796.1_cds_WP_012749586.1_1 [locus_tag=DMR_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_012749586.1] [location=126..1466] [gbkey=CDS]
F6|lcl|NC_012796.1_cds_WP_012749586.1_1 [locus_tag=DMR_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_012749586.1] [location=126..1466] [gbkey=CDS]
F2|lcl|NC_012796.1_cds_WP_012749587.1_2 [locus_tag=DMR_RS00010] [protein=DNA polymerase II

In [11]:
# Advance the BLAST record generator by one and keep that record in `rec`
# for interactive inspection. Requires `blastRecords` from another cell.
rec=next(blastRecords)

In [13]:
# Display the query title of the record currently held in `rec`.
rec.query

'F4|lcl|NC_012796.1_cds_WP_012749586.1_1 [locus_tag=DMR_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_012749586.1] [location=126..1466] [gbkey=CDS]'

In [3]:
# Check what kind of object NCBIXML.parse returned for `blastRecords`.
type(blastRecords)

generator

'F2|lcl|NC_012796.1_cds_WP_012749586.1_1 [locus_tag=DMR_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_012749586.1] [location=126..1466] [gbkey=CDS]'

In [None]:
# List the distinct category labels present in the parsed hits table.
set(result_df["category"])

In [6]:
# Tabulate hit counts per category for each e-value threshold and preview
# the first rows of that table.
count_df=categorize_result(result_df)
count_df.head()

Unnamed: 0,thres,F2,F3,F4,F5,F6,DSF2,DSF3,DSF4,DSF5,DSF6,DTSF2,DTSF3,DTSF4,DTSF5,DTSF6
0,1.0,1820,1668,2972,880,1452,1344,1276,2212,656,1020,1116,1236,2488,668,1212
1,0.1,500,404,660,156,280,268,184,284,108,140,184,176,240,100,188
2,0.01,164,192,276,56,120,36,16,36,8,4,24,24,24,20,16
3,0.001,88,124,180,24,104,0,4,0,0,0,0,4,8,4,0
4,0.0001,72,100,164,12,100,0,0,0,0,0,0,0,0,0,0


In [1]:
# Load the catalog file with pandas and preview it; its "ftp_basename" column
# is used elsewhere in the notebook to locate per-genome result files.
# NOTE(review): the recorded run of this cell (In [1]) failed with NameError
# because it was executed before the cell that imports pandas — run the
# imports cell first.
txtFilepath="../../createdatabase/out/genus_872.txt"
catalog_df=pd.read_csv(txtFilepath)
catalog_df.head()

NameError: name 'pd' is not defined

In [None]:
# Run the parse-and-count pipeline for every entry in the catalog and print
# the per-threshold count table of each result file.
for basename in catalog_df["ftp_basename"]:
    resultFilepath="/data/mitsuki/out/altorf/evolve/result/{}.xml".format(basename)
    # Two blank lines separate the reports of consecutive files.
    print("\n")
    print("PROCESSING {}".format(resultFilepath))
    result_df=xml2df(resultFilepath)
    count_df=categorize_result(result_df)
    print(count_df)