# Setup

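This notebook demonstrates how to generate MHC class I antigen presentation predictions using MHCflurry, first for a specified list of peptides and then by scanning across protein sequences.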

In [1]:
# Install the package and download models
!pip install -q mhcflurry
!mhcflurry-downloads --quiet fetch models_class1_presentation

  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 140.8/140.8 kB 13.4 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 103.7/103.7 kB 8.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.9/137.9 kB 11.6 MB/s eta 0:00:00
  Building wheel for typechecks (setup.py) ... done
135MB [00:14, 9.03MB/s]
Extracting: 100% 62/62 [00:13<00:00,  4.75it/s]


In [2]:
# Imports -- standard library first, then third-party packages.
import warnings

import mhcflurry
from google.colab import files

# Silence all warnings so they do not clutter the cell outputs below.
warnings.filterwarnings("ignore")

In [3]:
# Load a predictor
# load() is called without arguments, so it relies on the library's default
# model location -- presumably the models_class1_presentation download fetched
# during setup (TODO confirm).
predictor = mhcflurry.Class1PresentationPredictor.load()
# Bare last expression: the notebook displays the predictor's repr.
predictor

<Class1PresentationPredictor at 0x79e6f10fee00 [mhcflurry 2.1.4] generated on Thu Jun 11 13:37:18 2020>

# Predict for specified peptides

In [4]:
# Peptides to score.
peptides = ["NLVPMVATV", "RANDMPEPTIDE", "SIINFEKL"]

# MHC class I alleles to evaluate the peptides against.
alleles = ["A*02:01", "B*27:01", "H2-Kb"]

# predict() returns a pandas.DataFrame with one row per peptide, holding the
# affinity, processing and presentation predictions.
results1 = predictor.predict(peptides=peptides, alleles=alleles)
results1

Predicting processing.


  0%|          | 0/1 [00:00<?, ?it/s]











100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Predicting affinities.


  0%|          | 0/3 [00:00<?, ?it/s]



 33%|███▎      | 1/3 [00:01<00:02,  1.12s/it]



 67%|██████▋   | 2/3 [00:01<00:00,  1.92it/s]



100%|██████████| 3/3 [00:01<00:00,  2.27it/s]


Unnamed: 0,peptide,peptide_num,sample_name,affinity,best_allele,processing_score,presentation_score,presentation_percentile
0,NLVPMVATV,0,sample1,16.570975,A*02:01,0.533008,0.970187,0.018723
1,RANDMPEPTIDE,1,sample1,21780.311674,B*27:01,0.008492,0.004732,62.744674
2,SIINFEKL,2,sample1,19.707211,H2-Kb,0.26471,0.914111,0.099511


In [5]:
# Download results
# Write the peptide predictions to a CSV file in the current working directory.
# The DataFrame index is written as well (to_csv is called with defaults).
results1.to_csv('mhcflurry-results.csv')
# Hand the file to google.colab's download helper -- presumably this triggers a
# browser download; it only works inside Colab (TODO confirm).
files.download('mhcflurry-results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# See help for more options:
# Prints the signature and docstring of predict(), which lists the optional
# arguments not used in the example above.
help(predictor.predict)

Help on method predict in module mhcflurry.class1_presentation_predictor:

predict(peptides, alleles, sample_names=None, n_flanks=None, c_flanks=None, include_affinity_percentile=False, verbose=1, throw=True) method of mhcflurry.class1_presentation_predictor.Class1PresentationPredictor instance
    Predict presentation scores across a set of peptides.
    
    Presentation scores combine predictions for MHC I binding affinity
    and antigen processing.
    
    This method returns a pandas.DataFrame giving presentation scores plus
    the binding affinity and processing predictions and other intermediate
    results.
    
    Example:
    
    >>> predictor = Class1PresentationPredictor.load()
    >>> predictor.predict(
    ...    peptides=["SIINFEKL", "PEPTIDE"],
    ...    n_flanks=["NNN", "SNS"],
    ...    c_flanks=["CCC", "CNC"],
    ...    alleles={
    ...        "sample1": ["A0201", "A0301", "B0702"],
    ...        "sample2": ["A0101", "C0202"],
    ...    },
    ...    verbo

# Predict by scanning across protein sequences

In [7]:
# Paste your fasta here
proteins_fasta = """
>tr|A0A6B9WFC7|A0A6B9WFC7_SARS2 Envelope small membrane protein
MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNIVNVSLVKPSFYVYS
RVKNLNSSRVPDLLV
>tr|A0A6B9W0L4|A0A6B9W0L4_SARS2 ORF6 protein
MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTENKYSQLDEEQPMEI
D
>tr|A0A6G7S6S0|A0A6G7S6S0_SARS2 Nonstructural protein NS3
MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWLIVGVALLAVFQSAS
KIITLKKRWQLALSKGVHFVCNLLLLFVTVYSHLLLVAAGLEAPFLYLYALVYFLQSINF
VRIIMRLWLCWKCRSKNPLLYDANYFLCWHTNCYDYCIPYNSVTSSIVITSGDGTTSPIS
EHDYQIGGYTEKWESGVKDCVVLHSYFTSDYYQLYSTQLSTDTGVEHVTFFIYNKIVDEP
EEHVQIHTIDGSSGVVNPVMEPIYDEPTTTTSVPL
>tr|A0A6B9VLF3|A0A6B9VLF3_SARS2 Membrane protein
MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIFLWLLWPV
TLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRLFARTRSMWSFNPETNILL
NVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCDIKDLPKEITVATSRTLSYYK
LGASQRVAGDSGFAAYSRYRIGNYKLNTDHSSSSDNIALLVQ
"""

import mhcflurry.fasta

# The FASTA reader is handed a file name, so dump the pasted text to disk first.
fasta_path = "temp.fa"
with open(fasta_path, "w") as fasta_file:
    fasta_file.write(proteins_fasta)

# Parse the records into a DataFrame, then key the rows by sequence identifier.
proteins_raw = mhcflurry.fasta.read_fasta_to_dataframe(fasta_path)
proteins = proteins_raw.set_index("sequence_id")
proteins

Unnamed: 0_level_0,sequence
sequence_id,Unnamed: 1_level_1
tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,MYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALRLCAYCCNI...
tr|A0A6B9W0L4|A0A6B9W0L4_SARS2,MFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSLTE...
tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,MDLFMRIFTIGTVTLKQGEIKDATPSDFVRATATIPIQASLPFGWL...
tr|A0A6B9VLF3|A0A6B9VLF3_SARS2,MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFL...


In [8]:
# Map each sample name to the list of alleles belonging to that sample.
alleles = {"my-sample": "A0201 A0301 B0702 C0802".split()}

In [9]:
# Scan every protein sequence and keep only the peptides that pass the
# affinity filter (500 nM cutoff).
results2 = predictor.predict_sequences(
    proteins["sequence"].to_dict(),  # {sequence_id: amino-acid sequence}
    alleles,
    result="filtered",
    comparison_quantity="affinity",
    filter_value=500,
)
results2

Predicting processing.


  0%|          | 0/1 [00:00<?, ?it/s]



100%|██████████| 1/1 [00:07<00:00,  7.13s/it]


Predicting affinities.


  0%|          | 0/4 [00:00<?, ?it/s]



 25%|██▌       | 1/4 [00:00<00:00,  3.35it/s]



 50%|█████     | 2/4 [00:00<00:00,  3.36it/s]



 75%|███████▌  | 3/4 [00:01<00:00,  2.79it/s]



100%|██████████| 4/4 [00:01<00:00,  3.14it/s]


Unnamed: 0,sequence_name,pos,peptide,n_flank,c_flank,sample_name,affinity,best_allele,affinity_percentile,processing_score,presentation_score,presentation_percentile
0,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,138,LLYDANYFL,RSKNP,CWHTN,my-sample,10.659106,A0201,0.003625,0.157176,0.921852,0.088804
1,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,106,YLYALVYFL,EAPFL,QSINF,my-sample,11.053785,A0201,0.006750,0.014756,0.868851,0.171848
2,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,71,ALSKGVHFV,KRWQL,CNLLL,my-sample,11.501205,A0201,0.011500,0.676803,0.987502,0.002065
3,tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,49,SLVKPSFYV,NIVNV,YSRVK,my-sample,11.930824,A0201,0.013500,0.091771,0.891807,0.135353
4,tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,19,FLAFVVFLL,NSVLL,VTLAI,my-sample,12.318483,A0201,0.015875,0.007210,0.852791,0.196277
...,...,...,...,...,...,...,...,...,...,...,...,...
188,tr|A0A6B9WFC7|A0A6B9WFC7_SARS2,15,SVLLFLAFVV,TLIVN,FLLVT,my-sample,466.913118,A0201,1.297625,0.010083,0.145307,2.271005
189,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,57,SASKIITL,LAVFQ,KKRWQ,my-sample,471.300165,C0802,0.774375,0.772850,0.753873,0.351359
190,tr|A0A6G7S6S0|A0A6G7S6S0_SARS2,169,TSGDGTTSPI,SSIVI,SEHDY,my-sample,473.865570,C0802,0.774375,0.000247,0.138992,2.345462
191,tr|A0A6B9VLF3|A0A6B9VLF3_SARS2,71,RINWITGGI,LAAVY,AIAMA,my-sample,475.852765,A0201,1.306500,0.166254,0.232094,1.656413


In [10]:
# Download results
# Write the sequence-scan predictions to a CSV file in the working directory.
# NOTE(review): this is the same file name used by the earlier download cell,
# so any file written there is overwritten on disk.
results2.to_csv('mhcflurry-results.csv')
# Hand the file to google.colab's download helper -- presumably this triggers a
# browser download; it only works inside Colab (TODO confirm).
files.download('mhcflurry-results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# See help for more options:
# Prints the signature and docstring of predict_sequences(), including
# arguments such as peptide_lengths and use_flanks that were left at their
# defaults above.
help(predictor.predict_sequences)

Help on method predict_sequences in module mhcflurry.class1_presentation_predictor:

predict_sequences(sequences, alleles, result='best', comparison_quantity=None, filter_value=None, peptide_lengths=(8, 9, 10, 11), use_flanks=True, include_affinity_percentile=True, verbose=1, throw=True) method of mhcflurry.class1_presentation_predictor.Class1PresentationPredictor instance
    Predict presentation across protein sequences.
    
    Example:
    
    >>> predictor = Class1PresentationPredictor.load()
    >>> predictor.predict_sequences(
    ...    sequences={
    ...        'protein1': "MDSKGSSQKGSRLLLLLVVSNLL",
    ...        'protein2': "SSLPTPEDKEQAQQTHH",
    ...    },
    ...    alleles={
    ...        "sample1": ["A0201", "A0301", "B0702"],
    ...        "sample2": ["A0101", "C0202"],
    ...    },
    ...    result="filtered",
    ...    comparison_quantity="affinity",
    ...    filter_value=500,
    ...    verbose=0)
      sequence_name  pos     peptide n_flank c_flank sample