In [1]:
import pandas as pd
import numpy as np

from gscore.osw.peakgroups import fetch_peak_groups
from gscore.osw.queries import (
    FETCH_SCORED_DATA
)
from gscore.models.denoiser import DenoizingClassifier
from gscore.osw.connection import create_table

from gscore.models.preprocess import STANDARD_SCALAR_PIPELINE
from gscore.models.distributions import build_false_target_protein_distributions, ScoreDistribution

In [2]:
import os

# Location of the .osw file that the following cells load. The path can be
# overridden with the GSCORE_OSW_PATH environment variable so the notebook is
# not tied to one machine; when the variable is unset the original path is
# used unchanged.
osw_path = os.environ.get(
    'GSCORE_OSW_PATH',
    '/home/aaron/projects/gscorer/data/openswath/Simon_S1608_050.osw',
)

In [3]:

# Run the FETCH_SCORED_DATA query against the file at osw_path. The returned
# object provides the select_* helpers used by the cells that follow.
peak_groups = fetch_peak_groups(host=osw_path, query=FETCH_SCORED_DATA)


In [4]:
# Request the peak groups with return_all=True (no rank selection is passed).
all_peak_groups = peak_groups.select_peak_group(return_all=True)

In [5]:
# Proteotypic peptide selection, re-ranked on alt_d_score. Later cells test
# peptide_sequence_charge membership against this result.
proteotypic_peptides = peak_groups.select_proteotypic_peptides(rerank_keys=['alt_d_score'])

here


In [6]:
# Protein group selection, re-ranked on alt_d_score.
# NOTE(review): this name is rebound to a pandas groupby further down before
# any visible cell reads it; confirm whether this call is still needed.
protein_groups = peak_groups.select_protein_groups(rerank_keys=['alt_d_score'])

In [7]:
# Size of the proteotypic peptide collection selected above.
len(proteotypic_peptides)

81689

In [8]:
# Rank-1 peak group selection, ranking on alt_d_score in descending order
# (ascending=False).
highest_ranking = peak_groups.select_peak_group(
    rank=1, rerank_keys=['alt_d_score'], ascending=False,
)

In [9]:
# Work on an independent copy. A later cell adds a peptide_sequence_charge
# column to first_group; with a plain alias that write would also land in
# highest_ranking, so re-running cells would see a silently modified frame.
first_group = highest_ranking.copy()

In [10]:
# Frequency of each distinct vote_percentage value among the rank-1 rows.
first_group.vote_percentage.value_counts()

0.000    103627
1.000       137
0.002        46
0.004        22
0.006        13
          ...  
0.954         1
0.836         1
0.702         1
0.444         1
0.410         1
Name: vote_percentage, Length: 212, dtype: int64

In [11]:
# Build the "<peptide_sequence>_<charge>" key used for membership tests in
# later cells. This is a vectorised string concatenation instead of a
# row-wise apply(axis=1): it yields the same text as '{}_{}'.format(...) for
# each row, avoids a Python-level call per row, and still produces a Series
# when the frame is empty (a row-wise apply returns a DataFrame in that case).
first_group['peptide_sequence_charge'] = (
    first_group['peptide_sequence'].astype(str)
    + '_'
    + first_group['charge'].astype(str)
)

In [12]:
# NOTE(review): no visible cell reads this value before `cutoff` is reassigned
# to 0.01 further down. Confirm whether it is still needed.
cutoff = .85

In [13]:
# Target set: rank-1 rows whose vote_percentage is exactly 1.0.
targets = first_group.loc[first_group['vote_percentage'].eq(1.0)].copy()

In [14]:
# Keep only targets whose peptide/charge key is among proteotypic_peptides.
targets = targets[targets['peptide_sequence_charge'].isin(proteotypic_peptides)].copy()

In [15]:
# Decoy set: rank-1 rows whose vote_percentage is exactly 0.
decoys = first_group.loc[first_group['vote_percentage'].eq(0)].copy()

In [16]:
# Keep only decoys whose peptide/charge key is among proteotypic_peptides.
decoys = decoys[decoys['peptide_sequence_charge'].isin(proteotypic_peptides)].copy()

In [17]:
# Build the distribution data from the target and decoy frames. The result is
# passed to ScoreDistribution in a later cell.
model_distribution = build_false_target_protein_distributions(targets, decoys)

In [18]:
# Stack targets on top of decoys in a single frame (original indexes kept).
combined = pd.concat([targets, decoys])

In [19]:
%reload_ext autoreload

from gscore.models.distributions import ScoreDistribution

In [20]:
# Wrap the distribution data in a ScoreDistribution. Later cells read its
# combined_axis / target_values / null_values and call calc_q_value.
score_distribution = ScoreDistribution(data=model_distribution)

In [None]:
import matplotlib.pyplot as plt

# Overlay the target and null curves of the score distribution on one axis.
# The explicit fig/ax interface is used, and each curve carries a label plus a
# legend, so the figure can be read without looking up the colours in the code.
fig, ax = plt.subplots()
ax.plot(
    score_distribution.combined_axis,
    score_distribution.target_values,
    lw=2, color='cornflowerblue', linestyle='-', label='target',
)
ax.plot(
    score_distribution.combined_axis,
    score_distribution.null_values,
    lw=2, color='red', linestyle='-', label='null',
)
ax.legend();

# Kept for reference: `args` is not defined anywhere in this notebook.
#plt.savefig(f'{args.input_osw_file}.scoring_model.pdf')

In [22]:
# Fetch the peak groups again with return_all=True, the same call as in cell
# In [4]. The following cells add a q_value column to this frame and filter it.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
# Confirm whether the frame from In [4] could be reused (e.g. via .copy()).
all_peak_groups = peak_groups.select_peak_group(return_all=True)

KeyboardInterrupt: 

In [None]:
# Pass every alt_d_score through score_distribution.calc_q_value and store the
# results in a new q_value column.
alt_d_scores = all_peak_groups['alt_d_score']
all_peak_groups['q_value'] = alt_d_scores.apply(score_distribution.calc_q_value)

In [None]:
# Frequency of each distinct q_value.
all_peak_groups.q_value.value_counts()

In [None]:
# q-value threshold applied when pass_threshold is built below.
cutoff = 0.01

In [None]:
# Keep one row per transition_group_id: the row with the smallest q_value.
# idxmin returns the label of the FIRST row at which the minimum occurs, so
# the rows are ordered by alt_d_score (descending) before grouping. When
# several rows of a group tie on q_value (e.g. both are 0.0), the highest
# scoring one is therefore kept. groupby preserves the row order inside each
# group, and the stable mergesort keeps the result deterministic.
# As before, the lookup relies on the frame's index labels being unique.
best_row_labels = (
    all_peak_groups
    .sort_values('alt_d_score', ascending=False, kind='mergesort')
    .groupby(['transition_group_id'])['q_value']
    .idxmin()
)
all_peak_groups = all_peak_groups.loc[best_row_labels]

In [None]:
# Rows whose q_value is at or below the cutoff, as an independent copy.
pass_threshold = all_peak_groups.loc[all_peak_groups['q_value'].le(cutoff)].copy()

In [None]:
# Passing row counts: target == 0.0 first (presumably decoys, to be
# confirmed), then target == 1.0.
n_target_zero = len(pass_threshold.loc[pass_threshold['target'].eq(0.0)])
n_target_one = len(pass_threshold.loc[pass_threshold['target'].eq(1.0)])
print(n_target_zero, n_target_one)

In [None]:
# List the columns available on pass_threshold.
pass_threshold.columns

In [170]:
# Build the "<peptide_sequence>_<charge>" key for the passing rows. This is a
# vectorised string concatenation instead of a row-wise apply(axis=1): it
# yields the same text as '{}_{}'.format(...) for each row, avoids a
# Python-level call per row, and still produces a Series when the frame is
# empty (a row-wise apply returns a DataFrame in that case).
pass_threshold['peptide_sequence_charge'] = (
    pass_threshold['peptide_sequence'].astype(str)
    + '_'
    + pass_threshold['charge'].astype(str)
)

In [171]:
# Occurrences of each peptide/charge key among the passing rows, as a
# two-column frame (the key moves out of the index via reset_index).
key_counts = pass_threshold['peptide_sequence_charge'].value_counts()
proteotypic_counts = key_counts.to_frame().reset_index()

In [172]:
# Give the two columns produced by reset_index fixed names.
proteotypic_counts.columns = ['peptide_charge', 'count']

In [173]:
# Peptide/charge keys that occur exactly once among the passing rows.
# NOTE(review): this rebinds proteotypic_peptides, which earlier cells use
# for their isin() filters. Re-running those cells afterwards would use this
# list instead of the original selection.
occurs_once = proteotypic_counts['count'] == 1
proteotypic_peptides = proteotypic_counts.loc[occurs_once, 'peptide_charge'].tolist()

In [174]:
# Number of peptide/charge keys that occur exactly once in pass_threshold.
len(proteotypic_peptides)

12889

In [175]:
# Number of distinct protein_accession groups among passing target == 1.0 rows.
len(
    pass_threshold
    .loc[pass_threshold['target'].eq(1.0)]
    .groupby(['protein_accession'])
)

5600

In [176]:
# Group the passing target == 1.0 rows by protein_accession.
protein_groups = (
    pass_threshold
    .loc[pass_threshold['target'].eq(1.0)]
    .groupby(['protein_accession'])
)

In [177]:
# Materialise the per-protein sub-frames; the group keys are discarded.
pg_df = [frame for _accession, frame in protein_groups]

In [178]:
# Keep only the protein sub-frames that hold at least two rows.
protein_groups = [frame for frame in pg_df if len(frame) >= 2]

In [179]:
# Number of protein groups that hold at least two rows.
len(protein_groups)

2799