# Score Benchmarking

For the first search, we compare the search performance (identifications after FDR and run time) across different `top_n` values and scoring methods.

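We don't want to rerun file conversion and feature finding for every benchmark run, so we run them once, keep a backup copy of the resulting `.ms_data.hdf` file, and restore it before each run with `continue_runs` enabled.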

In [None]:
from alphapept.settings import load_settings
from alphapept.paths import DEFAULT_SETTINGS_PATH
import alphapept.interface
import alphapept.io
import matplotlib.pyplot as plt
import os 
    
# Directory with the benchmark inputs; the .raw and .fasta file names found
# below are joined onto this path.
BASE_PATH = 'F:/AP_Paper_Benchmark/PXD028735/top_n_optimization'

In [None]:
# Start from the default settings and point them at the benchmark inputs.
settings = load_settings(DEFAULT_SETTINGS_PATH)

# List the directory once and use BASE_PATH for both the scan and the join, so
# the directory that is searched and the directory the paths are built from
# cannot drift apart.
input_files = os.listdir(BASE_PATH)
settings['experiment']['file_paths'] = [os.path.join(BASE_PATH, name) for name in input_files if name.endswith('.raw')]
settings['experiment']['fasta_paths'] = [os.path.join(BASE_PATH, name) for name in input_files if name.endswith('.fasta')]

# Run the raw-data import and feature finding once up front; the benchmark
# runs further down are meant to continue from these results.
settings = alphapept.interface.import_raw_data(settings)
settings = alphapept.interface.feature_finding(settings)

In [None]:
import shutil

# Move the .ms_data.hdf file that belongs to the first raw file to a backup
# location; the benchmark loop copies it back before every run.
first_raw_path = settings['experiment']['file_paths'][0]
base, _ = os.path.splitext(first_raw_path)
ms_file_path = base + '.ms_data.hdf'
ms_file_path_bkup = ms_file_path + '.bkup'

# os.replace overwrites an existing backup on every platform. Unlike deleting
# the old backup first, it leaves that backup untouched when the source file
# is missing (e.g. when this cell is executed a second time).
os.replace(ms_file_path, ms_file_path_bkup)

In [None]:
from tqdm import tqdm as tqdm
import pandas as pd

def set_settings(top_n, method, ini_score):
    """Build the settings for one benchmark configuration.

    Loads the default settings, enables ``continue_runs``, points the
    experiment at the ``.raw`` and ``.fasta`` files found in ``BASE_PATH``
    and applies the three benchmark parameters.

    Args:
        top_n: Value stored in ``settings['search']['top_n']``.
        method: Value stored in ``settings['score']['method']``.
        ini_score: Value stored in ``settings['score']['ml_ini_score']``.

    Returns:
        The configured settings object returned by ``load_settings``.
    """
    settings = load_settings(DEFAULT_SETTINGS_PATH)

    settings['workflow']['continue_runs'] = True

    # List the directory once and use BASE_PATH for both the scan and the
    # join, instead of repeating the directory as a second string literal.
    input_files = os.listdir(BASE_PATH)
    settings['experiment']['file_paths'] = [os.path.join(BASE_PATH, name) for name in input_files if name.endswith('.raw')]
    settings['experiment']['fasta_paths'] = [os.path.join(BASE_PATH, name) for name in input_files if name.endswith('.fasta')]
    settings['search']['top_n'] = top_n
    settings['score']['method'] = method
    settings['score']['ml_ini_score'] = ini_score

    return settings

# Parameter grid for the benchmark: every top_n value is combined with every
# scoring method. Only 'random_forest' is run with several initial scores;
# all other methods use 'hits'.
top_n_values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,18,20,25,30,35,40]
ini_scores_by_method = {
    'x_tandem': ['hits'],
    'random_forest': ['x_tandem', 'generic_score', 'hits'],
    'generic_score': ['hits'],
    'morpheus': ['hits'],
}

settings_list = []
for n in top_n_values:
    # dicts keep insertion order, so the methods are visited in the order
    # they are written above.
    for scoring, ini_scores in ini_scores_by_method.items():
        for ini in ini_scores:
            settings_list.append(set_settings(n, scoring, ini))
            
# One (top_n, method, ini_score, target, decoy, time) tuple per benchmark run.
benchmark = []

for settings in tqdm(settings_list):

    # Restore the backed-up .ms_data.hdf so every run starts from the same
    # state. shutil.copyfile overwrites an existing destination, so no
    # separate delete is needed.
    shutil.copyfile(ms_file_path_bkup, ms_file_path)

    settings_ = alphapept.interface.run_complete_workflow(settings)

    # Read the outputs from the settings returned by the workflow rather than
    # from the input settings, consistent with the cleanup at the end of the
    # loop body.
    results_path = settings_['experiment']['results_path']
    df = pd.read_hdf(results_path, 'protein_fdr')

    elapsed_min = settings_['summary']['timing']['total (min)']
    decoy = df['decoy'].sum()
    target = df['target'].sum()

    # The benchmark parameters are taken from the configuration that was
    # requested for this run.
    top_n = settings['search']['top_n']
    method = settings['score']['method']
    ini_score = settings['score']['ml_ini_score']

    # Nothing is plotted in this loop itself; presumably this displays
    # figures created during the workflow run - TODO confirm.
    plt.show()

    benchmark.append((top_n, method, ini_score, target, decoy, elapsed_min))

    # Remove the results file so the next run cannot pick up stale results.
    if os.path.isfile(results_path):
        os.remove(results_path)

benchmark_df = pd.DataFrame(benchmark, columns = ['top_n','method','ini_score','target','decoy','time'])

In [None]:
def _method_label(method, ini_score):
    """Return the label for one run: random forest runs also name their initial score."""
    if method != 'random_forest':
        return method
    return method + ' with ' + str(ini_score)


# Label column with one entry per benchmark row.
benchmark_df['method_'] = [
    _method_label(method_name, initial_score)
    for method_name, initial_score in zip(benchmark_df['method'], benchmark_df['ini_score'])
]

In [None]:
# Font keyword arguments passed to the tick-label calls of the figure.
hfont = dict(fontname='Arial', size=10)

# Named hex colors, from dark to light, and the same colors as an ordered list.
dark_blue, light_blue, teal, green, yellow = (
    '#17212b',
    '#3dc5ef',
    '#42dee1',
    '#6eecb9',
    '#eef5b3',
)
colors = [dark_blue, light_blue, teal, green, yellow]


In [None]:
# `sns` is used below but is not imported anywhere else in this notebook.
import seaborn as sns

plt.figure(figsize=(7,7))

# One point per benchmark run with top_n > 1, colored by scoring method.
sns.scatterplot(data=benchmark_df[benchmark_df['top_n'] > 1], x='top_n', y='target', hue='method_', alpha=0.5)

plt.title('Top N vs number of identified precursors after FDR')
plt.xlabel('Top N')
# The y axis shows the 'target' counts, not Top N.
plt.ylabel('Number of identified precursors after FDR')
plt.xticks(**hfont)
plt.yticks(**hfont)
plt.ylim([0, 35000])
plt.legend(loc='lower right')
# Lay out the figure only after ticks, labels and legend are final.
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/SI_01.pdf')
plt.show()