In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from functools import reduce
from matplotlib import rcParams

import runtime as rt
from theme import colors

# font parameters: sans-serif family resolved to Arial, 12 pt base size
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
    'font.size': 12,
})

## Monomer

In [2]:
# parameters: FASTA files of each benchmark set, keyed by method label.
# glob() returns paths in arbitrary (filesystem-dependent) order, so sort them
# to keep the iteration order -- and everything derived from it -- reproducible.
filepaths = {
    "carbonara_minseqid": sorted(glob("benchmark_data/carbonara/monomers/minseqid/*.fasta")),
    "carbonara_minseqsim": sorted(glob("benchmark_data/carbonara/monomers/minseqsim/*.fasta")),
}

# read sequences: seqs[method][name] holds the second value returned by
# rt.read_fasta for that file
seqs = {}
for key, fasta_paths in filepaths.items():
    seqs[key] = {}
    for fp in fasta_paths:
        # entry name = file name truncated at its first '.'
        name = os.path.basename(fp).split('.')[0]
        _, seq = rt.read_fasta(fp)
        seqs[key][name] = seq

In [3]:
# output one fasta file per method with generated sequences
# open(..., 'w') does not create missing directories, so make sure the output
# folder exists before writing (needed for a clean Restart & Run All).
os.makedirs("fasta", exist_ok=True)
for key in seqs:
    with open("fasta/{}.fasta".format(key), 'w') as fs:
        for sid in seqs[key]:
            # NOTE(review): [0] presumably picks the first sequence returned by
            # rt.read_fasta for this entry -- confirm against runtime.read_fasta.
            # Each record is written as a header line, the sequence, then a blank line.
            fs.write(">{}\n{}\n\n".format(sid, seqs[key][sid][0]))

In [4]:
# scratch check: build a one-element set and display it
sids_set = {'a'}
sids_set

{'a'}

In [5]:
# parse blastp results
def parse_blast_tab(lines, method):
    """Collect one record per query id from tab-separated BLAST output lines.

    Lines starting with '#' and blank lines are skipped. For every remaining
    line the first tab-separated field is taken as the query id and the field
    at index 10 as the e-value; only the first line seen for each query id is
    kept.

    Parameters
    ----------
    lines : iterable of str
        Lines of one tabular results file (an open file handle works).
    method : str
        Label stored in the 'method' key of every returned record.

    Returns
    -------
    list of dict
        Records with keys 'method', 'sid' and 'evalue' (float), in the order
        the query ids were first encountered.
    """
    records = []
    seen_sids = set()
    for line in lines:
        # skip comment lines, and blank lines that would otherwise fail on fields[10]
        if line.startswith('#') or not line.strip():
            continue
        fields = line.rstrip('\n').split('\t')  # split once instead of per use
        sid = fields[0]
        if sid in seen_sids:
            continue
        seen_sids.add(sid)
        records.append({
            'method': method,
            'sid': sid,
            'evalue': float(fields[10]),
        })
    return records


blast_results = []
# sorted(): glob order is arbitrary, and it determines the row order of dfb
for tab_filepath in sorted(glob("fasta/*.tab")):
    # method label = file name without the "nomatch_" marker, truncated at the first '.'
    method = os.path.basename(tab_filepath).replace("nomatch_", "").split('.')[0]
    with open(tab_filepath, 'r') as fs:
        blast_results.extend(parse_blast_tab(fs, method))

# explicit columns keep the schema stable even when no .tab file was found
dfb = pd.DataFrame(blast_results, columns=['method', 'sid', 'evalue'])

In [6]:
# save the per-sequence e-values; to_csv does not create missing directories,
# so make sure the output folder exists first
os.makedirs("results", exist_ok=True)
dfb.to_csv("results/monomers_sequence_evalue.csv", index=False)
dfb

Unnamed: 0,method,sid,evalue
0,carbonara_minseqid,4CRP_A,2.540000e-07
1,carbonara_minseqid,4JJC_A,6.130000e-05
2,carbonara_minseqid,4J9F_A,2.720000e-06
3,carbonara_minseqid,1ZH8_A,5.500000e-48
4,carbonara_minseqid,3EG1_B,2.050000e-06
...,...,...,...
279,carbonara_minseqsim,1P1A_A,6.500000e+00
280,carbonara_minseqsim,2KKJ_A,1.100000e+01
281,carbonara_minseqsim,5CH4_G,1.100000e+01
282,carbonara_minseqsim,5LV6_A,2.000000e+01
