In [13]:
import pandas as pd
import os
from blastScripts import make_queries as mq

## Run reciprocal blast

In [154]:
def reciprocal_blastp(database_dir, query_file, out_dirname, proteome_dir, reference_proteome_path):
    databases  = [db for db in os.listdir(database_dir) if db != '.DS_Store']
    proteomes = [p.replace('.faa', '') for p in os.listdir(proteome_dir)]
    for database in databases:
        if database not in proteomes:
            raise Exception(f'{proteome} not present in proteomes')
    os.makedirs(out_dirname)
    forward_results_dir = os.path.join(out_dirname, 'forward_blast_results')
    multi_database_query(database_dir, query_file, forward_results_dir)

    rblast_queries_dir = os.path.join(out_dirname, 'forward_best_hits')
    mq.write_best_hit_queries(proteome_dir, forward_results_dir, rblast_queries_dir)
    
    
    ref_db_name = reference_proteome_path.replace('.faa', '_blast_db')
    #os.makedirs('reference_db')
    reference_db_path = os.path.join(out_dirname, 'reference_db/reference_db')
    print(reference_db_path)
    !makeblastdb -in $reference_proteome_path -dbtype 'prot' -out $reference_db_path
    recip_blast(rblast_queries_dir, reference_db_path, os.path.join(out_dirname,'reciprocal_blast_results'))
    
    

In [2]:
# Inputs for a small test run of reciprocal_blastp (the call below is left commented out).
# Query sequence file passed as `query_file`.
query_file = '../Pseudomonas_1_csrA.faa'
# Directory of blast databases, passed as `database_dir`.
database_dir = '../test_blast_db/'
# Directory of proteome .faa files, passed as `proteome_dir`.
proteome_dir = '../proteomes'
# Reference proteome from which the reciprocal-search database is built.
reference_proteome_path = '../P_aeruginosa_PA01.faa'
# Results directory that reciprocal_blastp creates.
out_dirname = '../test_rblast_results_Psuedo'
  
#reciprocal_blastp(database_dir, query_file, out_dirname, proteome_dir, reference_proteome_path)

### Get metadata for the human gut microbiome  
1. Filter for only the ~4400 representative species  
FTP downloads: 
http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.1/  


In [16]:
# Load the genome metadata table and index it by genome accession.
metadata_df = pd.read_csv('../gut_microbiome/genomes-all_metadata.tsv', sep='\t')
metadata_df = metadata_df.set_index('Genome')
# Accessions of the species-representative genomes.
# Sorted (instead of list(set(...))) so the order is identical on every kernel
# restart: iteration order of a set of strings changes between Python sessions,
# and later cells derive numeric IDs from the position in this list.
rep_accessions = sorted(metadata_df['Species_rep'].dropna().unique())
# Metadata rows of the representative genomes only.
metadata_df_repr = metadata_df.loc[rep_accessions, :]

## Run commented out code only once to get the proteomes and build the blast database

### Download proteomes for all representative species

In [10]:
def download_mgnify_hsgut_proteomes(rep_accessions):
    for acc in rep_accessions:
        url = f'https://www.ebi.ac.uk/metagenomics/api/v1/genomes/{acc}/downloads/{acc}.faa'
        out_fp = f'../Proteomes/{acc}.faa'
        !curl $url > $out_fp
#download_mgnify_hsgut_proteomes(rep_accessions)

### Make blast database from each proteome file

In [11]:
def make_blastp_databases(proteome_dir, out_dir):
    """Make a blastp database for each proteome within a given directory"""
    lst_assembly_accessions = [acc.replace('.faa', '') for acc in os.listdir(proteome_dir)]
    for acc in lst_assembly_accessions:
        proteome_path = f'{proteome_dir}/{acc}.faa'
        out = f'{out_dir}/{acc}/{acc}_db'
        !makeblastdb -in $proteome_path -dbtype 'prot' -out $out

In [5]:
#make_blastp_databases('../proteomes', './blast_databases')

### Query each database with the sequence whose orthologs you are looking for  

In [12]:
def multi_database_query(database_dir, query_filepath, out_dir_path):
    """Query each blastp database within the database_dir and make a hits text file in the out_dir"""
        
    if not out_dir_path:
        out_dir_path = './forward_blast_results'
    if not os.path.exists(out_dir_path):
        os.makedirs(out_dir_path)
    for acc in [acc for acc in os.listdir(database_dir) if acc != '.DS_Store']:
        base_filepath = os.path.join(database_dir, f'{acc}/{acc}_db')
        out_filepath = os.path.join(out_dir_path, f'{acc}_blastpOut.txt')
        print(out_filepath)
        !blastp -query $query_file -db $base_filepath -num_descriptions 5 -num_alignments 5 -out $out_filepath
        
# query_file = '../B_subtilis_csra.faa'
# database_dir = '../blast_databases'
# out_dir_path = '../forward_blast_results_1'
#multi_database_query(database_dir, query_file, out_dir_path)    

### Reciprocal blast  

In [15]:
def recip_blast(rblast_queries_dir, rblast_db_path, outDir):
    """ """
    os.makedirs(outDir)
    query_files = [file for file in os.listdir(rblast_queries_dir) if file.endswith('.faa')]
    for query in query_files:
        outfile_name = query.replace('.faa', '_blastpOut.txt')
        out_path = os.path.join(outDir, f'{outfile_name}')
        query_path  = os.path.join(rblast_queries_dir, query)
        print(query_path,rblast_db_path, out_path, )
        !blastp -query $query_path -db $rblast_db_path -num_descriptions 5 -num_alignments 5 -out $out_path

In [53]:
# Directories of blast result files read by the cells below:
# reciprocal ("rblast") and forward ("fblast") results for two result sets.
# NOTE(review): the 'Psuedo' spelling is part of the path; presumably it matches
# the directory name on disk, so do not correct it without renaming the directory.
path_to_rblast_Pseudo = '../all_rblast_results_Psuedo/reciprocal_blast_results'
path_to_rblast_subt =   '../all_rblast_results_subtilis/reciprocal_blast_results'
path_to_fblast_Pseudo = '../all_rblast_results_Psuedo/forward_blast_results'
path_to_fblast_subt =   '../all_rblast_results_subtilis/forward_blast_results'


In [80]:
def get_blastbesthit_dict(path_to_blast_results, accession_length=13):
    """Return ``{accession: protein_id}`` with the top hit found in each blast result file.

    The accession is the first ``accession_length`` characters of the file
    name. The protein ID is taken from the first line of the file that starts
    with ``>``: the text between ``>`` and the first space. Files without such
    a line contribute no entry.

    Parameters
    ----------
    path_to_blast_results : str
        Directory containing the blast output text files.
    accession_length : int, optional
        Number of leading file-name characters that form the accession.

    Returns
    -------
    dict
        Mapping of accession to protein ID.
    """
    best_hits_dict = {}
    for file in os.listdir(path_to_blast_results):
        path = os.path.join(path_to_blast_results, file)
        # Hidden files such as the binary .DS_Store cannot be decoded as text,
        # and sub-directories cannot be opened at all; neither is a result file.
        if file.startswith('.') or not os.path.isfile(path):
            continue
        accession = file[:accession_length]
        with open(path, 'r') as f:
            for line in f:
                if line.startswith('>'):
                    # strip() drops the newline that is otherwise kept when the
                    # header line consists of the ID only (no space after it).
                    best_hits_dict[accession] = line.split(' ')[0][1:].strip()
                    break
    return best_hits_dict


# Top hit per genome for each result set: r = reciprocal search, f = forward search.
best_subt_rhits_dict = get_blastbesthit_dict(path_to_rblast_subt, accession_length=13)
best_pseudo_rhits_dict = get_blastbesthit_dict(path_to_rblast_Pseudo, accession_length=13)
best_subt_fhits_dict = get_blastbesthit_dict(path_to_fblast_subt, accession_length=13)
best_pseudo_fhits_dict = get_blastbesthit_dict(path_to_fblast_Pseudo, accession_length=13)


def _pair_best_hits(reciprocal_hits, forward_hits):
    """Join reciprocal and forward top hits on accession (inner join) into one table."""
    reciprocal_frame = pd.DataFrame.from_dict(reciprocal_hits, orient='index')
    forward_frame = pd.DataFrame.from_dict(forward_hits, orient='index')
    paired = reciprocal_frame.merge(forward_frame, left_index=True, right_index=True)
    paired.columns = ['r_bbhit', 'f_bbhit']
    return paired


subt_df = _pair_best_hits(best_subt_rhits_dict, best_subt_fhits_dict)
pseudo_df = _pair_best_hits(best_pseudo_rhits_dict, best_pseudo_fhits_dict)

In [81]:
# Protein ID that a genome's reciprocal top hit must equal to be kept below.
csrA_prot_ID = 'NP_391417.1'
# NOTE(review): neither `get_species_with_orthos` nor `best_rHits_subt_d` is defined
# in the visible part of this notebook; confirm both exist on a fresh kernel.
subt_ortho_d = get_species_with_orthos(csrA_prot_ID, best_rHits_subt_d)
# Not the last expression of the cell, so this bare name is not displayed.
subt_ortho_d

# Keep only the rows whose reciprocal top hit is `csrA_prot_ID`.
subt_filt_df = subt_df.loc[subt_df['r_bbhit']== csrA_prot_ID,:] 
# NOTE(review): the Pseudomonas table is filtered with the same protein ID as the
# subtilis table; confirm that this ID occurs in the Pseudomonas reciprocal
# results, otherwise this frame is empty.
pseudo_filt_df = pseudo_df.loc[pseudo_df['r_bbhit']== csrA_prot_ID,:] 
subt_filt_df

Unnamed: 0,r_bbhit,f_bbhit
MGYG000003619,NP_391417.1,MGYG000003619_01053
MGYG000003060,NP_391417.1,MGYG000003060_00284
MGYG000002683,NP_391417.1,MGYG000002683_00326
MGYG000000626,NP_391417.1,MGYG000000626_00410
MGYG000002462,NP_391417.1,MGYG000002462_03497
...,...,...
MGYG000000864,NP_391417.1,MGYG000000864_02155
MGYG000001491,NP_391417.1,MGYG000001491_01283
MGYG000001312,NP_391417.1,MGYG000001312_01817
MGYG000001486,NP_391417.1,MGYG000001486_01494


In [150]:

# Give every representative accession a numeric ID: its position in rep_accessions.
acc_taxid_d = dict(zip(rep_accessions, range(len(rep_accessions))))

# Map each protein ID in the combined proteome file to the ID of its genome.
prot_id_taxid_d = {}
with open('../all_hs_gut_proteomes.faa', 'r') as fasta:
    for header in fasta:
        if not header.startswith('>'):
            continue
        # Protein ID: text between '>' and the first space of the header line.
        prot_id = header[1:].partition(' ')[0]
        # Genome accession: the part of the protein ID before the first underscore.
        acc = prot_id.partition('_')[0]
        prot_id_taxid_d[prot_id] = acc_taxid_d[acc]

# One "<protein_id> <id>" line per protein.
with open('../TaxID_map.txt', 'w') as taxid_map:
    taxid_map.writelines(
        f'{prot_id} {taxid}\n' for prot_id, taxid in prot_id_taxid_d.items()
    )

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 2874: invalid start byte

NameError: name 'rblast_queries_dir' is not defined