<a href="https://colab.research.google.com/github/jalew188/PeptDeep-HLA/blob/master/nbs/HLA1_transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transfer learning of sample-specific HLA-I models

> To enable a GPU in Colab, click `Runtime -> Change runtime type` and select `GPU` as the hardware accelerator.

In [1]:
# Install PeptDeep-HLA straight from its GitHub repository (-q keeps pip's output short).
%pip install -q git+https://github.com/MannLabs/PeptDeep-HLA.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m555.0/555.0 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.1/205.1 KB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 KB[0m [31m5.3 MB/s[0m eta [36m0:0

In [2]:
# gdown is used further down to fetch the example FASTA files from Google Drive.
%pip install -q gdown

In [3]:
#@title Upload your fasta files for HLA peptide prediction
# `google.colab` is provided by the Colab runtime; this cell is meant to run there.
from google.colab import files
# Opens the upload dialog. The result is used like a dict keyed by file name in
# the next cell (len() / .keys()); if it is empty, that cell falls back to an
# example FASTA file instead.
uploaded_fasta = files.upload()

Saving human.fasta to human (1).fasta


In [4]:
import os

import gdown
import torch

# Prefer the FASTA file(s) uploaded in the previous cell; if nothing was
# uploaded, fall back to an example FASTA hosted on Google Drive.
if len(uploaded_fasta) > 0:
  fasta_files = list(uploaded_fasta.keys())
else:
  if torch.cuda.is_available():
    # human fasta
    fasta_url = "https://drive.google.com/file/d/1V9KxDniKwZFZnHlP58EbjkuelNJnp1Kq/view?usp=share_link"
    fasta = 'UP000005640_human_reviewed.fasta'
  else:
    # no GPU runtime in colab, use irt fusion peptides for testing
    fasta_url = "https://drive.google.com/file/d/1MKGRBpzvmMW0l_hdPESo3j26EWd_yi8l/view?usp=share_link"
    fasta = 'irtfusion.fasta'
  # fuzzy=True lets gdown accept the ".../view?usp=share_link" form of the URL.
  gdown.download(fasta_url, fasta, fuzzy=True)
  # Fail here with a clear message instead of letting a missing file surface
  # later as an obscure error when the model reads the FASTA.
  if not os.path.isfile(fasta):
    raise FileNotFoundError(
      f"Could not download '{fasta}' from {fasta_url}; "
      "upload a FASTA file in the previous cell or check the network connection."
    )
  fasta_files = [fasta]

#### Load training HLA peptides

The input is a tab-separated text file (tsv/txt) that lists the sample-specific HLA-I peptides in a column named `sequence`.

Click the `Files` (folder logo) in the left panel of Colab and upload files.

In [5]:
#@title Upload your HLA peptide file for transfer learning
# Reuses `files` (google.colab) imported in the FASTA upload cell above.
# The result is used like a dict keyed by file name; only its first key is read
# by the next cell, and an empty result makes that cell use a small demo table.
HLA_sequence_file = files.upload()

Saving HLA_sequences_DIA-Umpire_RA957.tsv to HLA_sequences_DIA-Umpire_RA957.tsv


In [13]:
import pandas as pd

# Without an upload, fall back to a handful of made-up peptides so the
# remaining cells can still be executed; otherwise read the first uploaded
# table (pd.read_table expects tab-separated text by default).
if len(HLA_sequence_file) == 0:
  hla_seq_df = pd.DataFrame({
    'sequence': [
        'ACDEFGHIKLMNPQ',
        'ACDEFGHI',
        'ACDEFGHIK',
        'EFGHIKLMNPQ',
        'AHIKLMNPQ',
    ]
  })
else:
  hla_seq_df = pd.read_table(next(iter(HLA_sequence_file.keys())))

# nAA = peptide length in amino acids; the evaluation helpers group on it.
hla_seq_df['nAA'] = hla_seq_df['sequence'].str.len()
hla_seq_df

Unnamed: 0,sequence,nAA
0,ETQGQQPPQR,10
1,ESAPEGQAQQR,11
2,NRNDQEATL,9
3,EHVKEVQQL,9
4,NHHLQETSF,9
...,...,...
15982,EHMELVSRL,9
15983,VTPQIDSSRI,10
15984,MPVSELTDKL,10
15985,IPISHIDDVL,10


In [14]:
# Hold out 20% of the peptides for evaluation. A fixed seed makes the split
# (and therefore the metrics reported below) reproducible across re-runs.
SPLIT_SEED = 42
test_seq_df = hla_seq_df.sample(frac=0.2, random_state=SPLIT_SEED)
# Everything that was not sampled into the test set is used for training.
train_seq_df = hla_seq_df.drop(index=test_seq_df.index)
len(train_seq_df), len(test_seq_df)

(12790, 3197)

#### Initialize the model

In [15]:
from peptdeep_hla.HLA_class_I import HLA_Class_I_Classifier
# Build the classifier on the FASTA file(s) selected above; later cells read
# `model.protein_df` and call `model.predict_from_proteins`, which presumably
# derive from these files.
model = HLA_Class_I_Classifier(
    fasta_files=fasta_files
)
# Shown as the cell output (1669697 in the recorded run) -- presumably the
# number of model parameters.
model.get_parameter_num()

1669697

#### Load the pretrained model

In [16]:
from peptdeep_hla.HLA_class_I import pretrained_HLA1
# Start from the pretrained model file that ships inside the peptdeep_hla package.
model.load(pretrained_HLA1)
# Echo the path of that file as the cell output.
pretrained_HLA1

'/usr/local/lib/python3.8/dist-packages/peptdeep_hla/pretrained_models/HLA1_IEDB.pt'

#### Transfer learning with the training peptides

The non-HLA peptides are automatically sampled from the fasta file as the negative training data.

In [17]:
# Fine-tune the pretrained model on the training peptides only (the held-out
# 20% is never passed in). In the recorded log the learning rate rises during
# the first 10 epochs (warmup_epoch) and decays afterwards.
model.train(
    train_seq_df, 
    epoch=40, warmup_epoch=10, 
    verbose=True
)

Training with padding zero sequences: True
[Training] Epoch=1, lr=1e-05, loss=0.29640439711511135
[Training] Epoch=2, lr=2e-05, loss=0.28878425993025303
[Training] Epoch=3, lr=3e-05, loss=0.24793076515197754
[Training] Epoch=4, lr=4e-05, loss=0.22147746197879314
[Training] Epoch=5, lr=5e-05, loss=0.20291892532259226
[Training] Epoch=6, lr=6e-05, loss=0.19167967978864908
[Training] Epoch=7, lr=7e-05, loss=0.17607212904840708
[Training] Epoch=8, lr=8e-05, loss=0.17061122227460146
[Training] Epoch=9, lr=9e-05, loss=0.1611229795962572
[Training] Epoch=10, lr=0.0001, loss=0.15486051933839917
[Training] Epoch=11, lr=9.972609476841367e-05, loss=0.14656726131215692
[Training] Epoch=12, lr=9.890738003669029e-05, loss=0.14579360093921423
[Training] Epoch=13, lr=9.755282581475769e-05, loss=0.13253016350790858
[Training] Epoch=14, lr=9.567727288213005e-05, loss=0.12391459662467241
[Training] Epoch=15, lr=9.330127018922194e-05, loss=0.12131089763715863
[Training] Epoch=16, lr=9.045084971874738e-05,

#### Testing

In [33]:
from peptdeep_hla.utils import get_random_sequences

def concat_neg_df(pos_df, prot_df, column_to_train='HLA'):
    """Return the positive peptides plus an equal number of random negatives.

    For every peptide length found in ``pos_df['nAA']``, as many sequences of
    that length are requested from ``get_random_sequences`` as there are
    positives of that length. Positives are labelled 1 and negatives 0 in
    ``column_to_train``.

    Parameters
    ----------
    pos_df : pd.DataFrame
        Positive peptides; must contain an ``nAA`` column. It is NOT modified:
        the label column is added to a copy.
    prot_df : object
        Passed through unchanged to ``get_random_sequences`` as the source of
        negative sequences.
    column_to_train : str
        Name of the label column to create.

    Returns
    -------
    pd.DataFrame
        Labelled positives followed by the sampled negatives, with a fresh
        0..n-1 index.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its columns
    # (the previous in-place write leaked the label column into the caller).
    labelled_pos_df = pos_df.assign(**{column_to_train: 1})
    df_list = [labelled_pos_df]
    for nAA, group_df in labelled_pos_df.groupby('nAA'):
        rnd_seqs = get_random_sequences(
            prot_df,
            n=len(group_df),
            pep_len=nAA
        )
        df_list.append(pd.DataFrame(
            {'sequence': rnd_seqs, 'nAA': nAA, column_to_train: 0}
        ))
    return pd.concat(df_list).reset_index(drop=True)

def test(df):
  df = concat_neg_df(df, model.protein_df)
  model.predict(df)
  prob_list = []
  precision_list = []
  recall_list = []
  fp_list = []
  for prob in [0.5,0.6,0.7,0.8, 0.9]:
    prob_list.append(prob)
    precision_list.append(df[df.HLA_prob_pred>prob].HLA.mean())
    recall_list.append(df[df.HLA_prob_pred>prob].HLA.sum()/len(df)*2)
    fp_list.append(1-(1-df[df.HLA_prob_pred<prob].HLA).sum()/len(df)*2)
  return pd.DataFrame(dict(
    HLA_prob_pred=prob_list,
    precision=precision_list,
    recall=recall_list,
    false_positive=fp_list
  ))

In [34]:
# Evaluate on the peptides that were used for transfer learning.
test(train_seq_df)

Unnamed: 0,HLA_prob_pred,precision,recall,false_positive
0,0.5,0.959595,0.986005,0.041517
1,0.6,0.96491,0.980375,0.035653
2,0.7,0.97054,0.973651,0.029554
3,0.8,0.976175,0.957858,0.023378
4,0.9,0.983272,0.914543,0.015559


In [35]:
# Evaluate on the held-out 20% that was not passed to model.train.
test(test_seq_df)

Unnamed: 0,HLA_prob_pred,precision,recall,false_positive
0,0.5,0.957881,0.939005,0.041289
1,0.6,0.962038,0.927432,0.036597
2,0.7,0.96627,0.913982,0.031905
3,0.8,0.971216,0.897091,0.026587
4,0.9,0.978648,0.860181,0.018768


#### Predict HLA-I peptides from fasta

In [29]:
# Predict HLA-I peptides for the proteins loaded from the FASTA file(s);
# presumably only peptides scoring above prob_threshold are returned.
# This is slow on a full proteome: the recorded run below took about 56 minutes.
hla_df = model.predict_from_proteins(prob_threshold=0.7)
hla_df

  lcp_array = kasai(cat_prot, suffix_array)
100%|██████████| 72/72 [56:08<00:00, 46.78s/it]


Unnamed: 0,start_pos,end_pos,nAA,HLA_prob_pred,sequence
0,3217635,3217643,8,0.722687,HQFHEEMI
1,3217601,3217609,8,0.881847,KYSTDVKL
2,9414645,9414653,8,0.792873,KGPENPQV
3,3217603,3217611,8,0.937689,STDVKLSL
4,3217498,3217506,8,0.928157,ADSVANKL
...,...,...,...,...,...
1994721,2360880,2360894,14,0.797666,LLEEEKKQMEHVQR
1994722,5161777,5161791,14,0.932791,FLFDFQKTGPPLVG
1994723,2360761,2360775,14,0.993378,RPMYAHHISSKYDE
1994724,2360760,2360774,14,0.748553,SRPMYAHHISSKYD


In [30]:
# Save only the peptide sequence and its predicted probability as a
# tab-separated file, without the DataFrame index.
hla_df[['sequence','HLA_prob_pred']].to_csv('Predicted_HLA.tsv',index=False, sep="\t")

To download `Predicted_HLA.tsv` when using Colab, click the `Files` (folder logo) in the left panel and right-click the file to download.

In [31]:
#@title Download Predicted_HLA.tsv
# Re-imported so this cell also works when run on its own in Colab.
from google.colab import files
# Sends the file written by the previous cell to the browser as a download.
# (Plain string literal: the former f-string had no placeholders.)
files.download('Predicted_HLA.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
#@title Download transfer learning model
# Re-imported so this cell also works when run on its own in Colab.
from google.colab import files
# Save the fine-tuned model to a file first, then send that file to the browser.
# (Plain string literal: the former f-string had no placeholders.)
model.save('transfer_HLA.pt')
files.download('transfer_HLA.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>