# Paper Figure 4: ML scoring

Now with arabidopsis FDR estimate.

In [1]:
import os
from test_files import get_files, prepare_test

files = ['thermo_HeLa.raw', 'human.fasta', 'arabidopsis.fasta']

In [2]:
tmp_folder = os.path.join(os.getcwd(),'temp/')
test_folder = 'fig4'

get_files(tmp_folder, files)
prepare_test(files, tmp_folder, test_folder)

Creating dir F:\alphapept\sandbox\temp/fig4.


## Search raw file

In [3]:
import alphapept.settings
import alphapept.paths
import alphapept.interface

test_folder = os.path.join(tmp_folder, test_folder)

file_name = os.path.join(test_folder, files[0])
settings = alphapept.settings.load_settings(alphapept.paths.DEFAULT_SETTINGS_PATH)
settings['experiment']['file_paths'] = [file_name]
settings['experiment']['fasta_paths'] = [os.path.join(test_folder, _) for _ in ['human.fasta', 'arabidopsis.fasta']]
#settings_ = alphapept.interface.run_complete_workflow(settings)
base, ext = os.path.splitext(file_name)

In [4]:
import alphapept.io
from alphapept.score import get_ML_features, filter_with_ML, filter_with_x_tandem, cut_global_fdr, train_RF, filter_score, filter_precursor, cut_fdr

ms_file = alphapept.io.MS_Data_File(os.path.join(test_folder,'thermo_HeLa.ms_data.hdf'))

df = ms_file.read(dataset_name='second_search')
df_ = get_ML_features(df)

df = df_.copy()

OSError: Unable to open file (unable to open file: name = 'F:\alphapept\sandbox\temp\fig4\thermo_HeLa.ms_data.hdf', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import logging
import numpy as np

import matplotlib.pyplot as plt

exclude_features = ['precursor_idx','ion_idx','fasta_index','feature_rank','raw_rank','rank','db_idx', 'feature_idx', 'precursor', 'query_idx', 'raw_idx','sequence','decoy','naked_sequence','target']
train_fdr_level = 0.1
ini_score = 'x_tandem'
min_train = 1000
test_size = 0.8
max_depth = [5,25,50]
max_leaf_nodes = [150,200,250]
n_jobs=-1
scoring='accuracy'
plot = True
random_state = 42

features = [_ for _ in df.columns if _ not in exclude_features]

# Setup ML pipeline
scaler = StandardScaler()
rfc = RandomForestClassifier(random_state=random_state) # class_weight={False:1,True:5},
## Initiate scaling + classification pipeline
pipeline = Pipeline([('scaler', scaler), ('clf', rfc)])
parameters = {'clf__max_depth':(max_depth), 'clf__max_leaf_nodes': (max_leaf_nodes)}
## Setup grid search framework for parameter selection and internal cross validation
cv = GridSearchCV(pipeline, param_grid=parameters, cv=5, scoring=scoring,
                 verbose=0,return_train_score=True,n_jobs=n_jobs)

# Prepare target and decoy df
df['decoy'] = df['sequence'].str[-1].str.islower()
df['target'] = ~df['decoy']
df['score'] = df[ini_score]
dfT = df[~df.decoy]
dfD = df[df.decoy]

# Select high scoring targets (<= train_fdr_level)
df_prescore = filter_score(df)
df_prescore = filter_precursor(df_prescore)
scored = cut_fdr(df_prescore, fdr_level = train_fdr_level, plot=False)[1]
highT = scored[scored.decoy==False]
dfT_high = dfT[dfT['query_idx'].isin(highT.query_idx)]
dfT_high = dfT_high[dfT_high['db_idx'].isin(highT.db_idx)]

# Determine the number of psms for semi-supervised learning
n_train = int(dfT_high.shape[0])
if dfD.shape[0] < n_train:
    n_train = int(dfD.shape[0])
    logging.info("The total number of available decoys is lower than the initial set of high scoring targets.")
if n_train < min_train:
    raise ValueError("There are fewer high scoring targets or decoys than required by 'min_train'.")

# Subset the targets and decoys datasets to result in a balanced dataset
df_training = dfT_high.sample(n=n_train, random_state=random_state).append(dfD.sample(n=n_train, random_state=random_state))

# Select training and test sets
X = df_training[features]
y = df_training['target'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=test_size, random_state=random_state, stratify=y.values)

# Train the classifier on the training set via 5-fold cross-validation and subsequently test on the test set
logging.info('Training & cross-validation on {} targets and {} decoys'.format(np.sum(y_train),X_train.shape[0]-np.sum(y_train)))
cv.fit(X_train,y_train)

logging.info('The best parameters selected by 5-fold cross-validation were {}'.format(cv.best_params_))
logging.info('The train {} was {}'.format(scoring, cv.score(X_train, y_train)))
logging.info('Testing on {} targets and {} decoys'.format(np.sum(y_test),X_test.shape[0]-np.sum(y_test)))
logging.info('The test {} was {}'.format(scoring, cv.score(X_test, y_test)))

feature_importances=cv.best_estimator_.named_steps['clf'].feature_importances_
indices = np.argsort(feature_importances)[::-1][:40]

top_features = X.columns[indices][:40]
top_score = feature_importances[indices][:40]

feature_dict = dict(zip(top_features, top_score))
logging.info(f"Top features {feature_dict}")

# Inspect feature importances
if plot:
    import seaborn as sns
    g = sns.barplot(y=X.columns[indices][:40],
                    x = feature_importances[indices][:40],
                    orient='h', palette='RdBu')
    g.set_xlabel("Relative importance",fontsize=12)
    g.set_ylabel("Features",fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title("Feature importance")
    plt.show()



In [None]:
dark_blue = '#17212b'
light_blue = '#3dc5ef'
teal= '#42dee1'
green = '#6eecb9'
yellow = '#eef5b3'
hfont = {'fontname':'Arial', 'size':10}

colors = [dark_blue, light_blue, teal, green, yellow]

plt.figure(figsize=(2.363*1.5,2.363)) #60x60

height=0.5
top_n = 10
color_index = 3

y = list(X.columns[indices][:top_n][::-1])
x = feature_importances[indices][:top_n][::-1]

remainder = feature_importances[indices][top_n:].sum()

x = np.append(remainder, x)
y = ['Remainder'] + y

plt.barh(y, x, height, color = colors[color_index], linewidth=1, edgecolor='black')

plt.xticks(**hfont)
plt.yticks(**hfont)
plt.xlabel('Relative importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig('figures/04_feature_importance.pdf')  
plt.show()

In [None]:
df = filter_with_x_tandem(df_)

decoy_ = df[df["decoy"]]['score']
target_ = df[~df["decoy"]]['score']

plt.figure(figsize=(2.363, 2.363)) #60x60

bins = np.linspace(df['score'].min(), 75, 60)
histogram, bins = np.histogram(decoy_, bins=bins, density=True)
histogram_, bins = np.histogram(target_, bins=bins, density=True)

bin_centers = 0.5*(bins[1:] + bins[:-1])

plt.plot(bin_centers, histogram_, color=colors[1], label='Target')
plt.fill_between(bin_centers, histogram_, alpha=0.7, color=colors[1])

plt.plot(bin_centers, histogram, color=colors[0], label='Decoy')
plt.fill_between(bin_centers, histogram, alpha=0.7, color=colors[0])

plt.xticks(**hfont)
plt.yticks(**hfont)
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend(loc='upper right')
plt.title('X!')
plt.tight_layout()
plt.savefig('figures/04_x_score.pdf')
plt.show()

In [None]:
df = filter_with_ML(df_, cv, features = features)

decoy_ = df[df["decoy"]]['score']
target_ = df[~df["decoy"]]['score']

plt.figure(figsize=(2.363, 2.363)) #60x60

bins = np.linspace(0, 1, 60)
histogram, bins = np.histogram(decoy_, bins=bins, density=True)
histogram_, bins = np.histogram(target_, bins=bins, density=True)

bin_centers = 0.5*(bins[1:] + bins[:-1])

plt.plot(bin_centers, histogram_, color=colors[1], label='Target')
plt.fill_between(bin_centers, histogram_, alpha=0.7, color=colors[1])

plt.plot(bin_centers, histogram, color=colors[0], label='Decoy')
plt.fill_between(bin_centers, histogram, alpha=0.7, color=colors[0])

plt.xticks(**hfont)
plt.yticks(**hfont)
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.legend(loc='upper left')
plt.title('ML')
plt.tight_layout()
plt.savefig('figures/04_ml_score.pdf')
plt.show()

In [None]:
df_ml = filter_with_ML(df_, cv, features = features)
df_x = filter_with_x_tandem(df_)

In [None]:
df_ml_fdr = cut_global_fdr(df_ml, analyte_level='precursor')
df_x_fdr = cut_global_fdr(df_x, analyte_level='precursor')

In [None]:
df_ml_fdr = df_ml_fdr.sort_values(by='q_value')
df_ml_fdr = df_ml_fdr.reset_index(drop=True)

df_x_fdr = df_x_fdr.sort_values(by='q_value')
df_x_fdr = df_x_fdr.reset_index(drop=True)

In [None]:
plt.figure(figsize=(2.363, 2.363)) #60x60
plt.plot(df_ml_fdr['q_value'], df_ml_fdr.index, label='ML', color = colors[3])
plt.plot(df_x_fdr['q_value'], df_x_fdr.index, label ='X!', color = colors[0])
plt.xlabel('q-value')
plt.ylabel('Identified PSMs')

plt.yticks([0, 1e4, 2e4, 3e4, 4e4, 5e4, 6e4])
plt.xticks([0.00, 0.005, 0.01])
plt.tight_layout()
plt.legend(loc='lower right')
plt.xticks(**hfont)
plt.yticks(**hfont)
plt.savefig('figures/04_psms.pdf')  
plt.show()

In [None]:
import alphapept.fasta

In [None]:
db = alphapept.fasta.read_database(os.path.join(test_folder,'database.hdf'))

In [None]:
pept_dict = db['pept_dict'].item()
fasta_dict = db['fasta_dict'].item()

In [None]:
df_ml_cp = df_ml.copy()

proteins = df_ml['sequence'].apply(lambda x: [fasta_dict[y]['name'] for y in pept_dict[x]])

list_ = []


for _ in proteins:
    hum = 0
    ara = 0
    for k in _:
        if 'HUM' in k:
            hum +=1
        if 'ARA' in k:
            ara +=1

    if (ara == 0) and (hum > 0):
        list_.append('HUM')
    elif (hum == 0) and (ara > 0):
        list_.append('ARA')
    else:

        list_.append('X')
        
                

In [None]:
df_ml_cp['species'] = list_

In [None]:
sorted_ = df_ml_cp[['species','score','decoy']].sort_values('score')[::-1]

In [None]:
sorted_['HUM_cum'] = np.cumsum(sorted_['species'] == 'HUM')
sorted_['ARA_cum'] = np.cumsum(sorted_['species'] == 'ARA')
sorted_['X_cum'] = np.cumsum(sorted_['species'] == 'X')
sorted_['decoy_cum'] = np.cumsum(sorted_['decoy'])

In [None]:
sorted_['species'].value_counts()

In [None]:
df_ml_fdr_ = df_ml_fdr.copy()

proteins = df_ml_fdr_['sequence'].apply(lambda x: [fasta_dict[y]['name'] for y in pept_dict[x]])

list_ = []

for _ in proteins:
    hum = 0
    ara = 0
    for k in _:
        if 'HUM' in k:
            hum +=1
        if 'ARA' in k:
            ara +=1

    if (ara == 0) and (hum > 0):
        list_.append('HUM')
    elif (hum == 0) and (ara > 0):
        list_.append('ARA')
    else:
        list_.append('X')
        
df_ml_fdr_['species'] = list_ 


df_ml_fdr_['ARA_cum'] = np.cumsum(df_ml_fdr_['species'] == 'ARA')
df_ml_fdr_['ARA_cum_q'] = get_q_values(df_ml_fdr_['ARA_cum'].values/ np.arange(1, len(df_ml_fdr_)+1))

df_x_fdr_ = df_x_fdr.copy()

proteins = df_x_fdr_['sequence'].apply(lambda x: [fasta_dict[y]['name'] for y in pept_dict[x]])

list_ = []

for _ in proteins:
    hum = 0
    ara = 0
    for k in _:
        if 'HUM' in k:
            hum +=1
        if 'ARA' in k:
            ara +=1

    if (ara == 0) and (hum > 0):
        list_.append('HUM')
    elif (hum == 0) and (ara > 0):
        list_.append('ARA')
    else:
        list_.append('X')
        
df_x_fdr_['species'] = list_ 


df_x_fdr_['ARA_cum'] = np.cumsum(df_x_fdr_['species'] == 'ARA')
df_x_fdr_['ARA_cum_q'] = get_q_values(df_x_fdr_['ARA_cum'].values/ np.arange(1, len(df_x_fdr_)+1))

In [None]:
from alphapept.score import get_q_values

In [None]:
plt.figure(figsize=(2.363, 2.363)) #60x60

plt.plot(df_ml_fdr_['ARA_cum_q'], df_ml_fdr_.index, label='ML', color = colors[3])
plt.plot(df_x_fdr_['ARA_cum_q'], df_x_fdr_.index, label='X!', color = colors[0])
#plt.plot(df_x_fdr['q_value'], df_x_fdr.index, label ='X!', color = colors[0])
plt.xlabel('q-value')
plt.ylabel('Identified PSMs')

plt.yticks([0, 1e4, 2e4, 3e4, 4e4, 5e4, 6e4])
plt.xticks([0.00, 0.005, 0.01])
plt.tight_layout()
plt.legend(loc='lower right')
plt.xticks(**hfont)
plt.yticks(**hfont)
plt.savefig('figures/04_q.pdf')  
plt.show()