Install and Import Necessary Libraries

In [1]:
!pip install -q condacolab
import condacolab
condacolab.install()

!conda install -c bioconda anarci

[0m✨🍰✨ Everything looks OK!
Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ 

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

from numpy.random import seed

# Import machine learning libraries
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.utils import plot_model

import keras
from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Import dataset

In [4]:
# Load the input table. Later cells read the 'Name', 'Heavy_Chain' and 'Light_Chain'
# columns from it, so a replacement CSV must provide those three columns.
dataset = pd.read_csv('DeepSP_input.csv') # replace with your own CSV file; DeepSP_input.csv shows the expected format
dataset

Unnamed: 0,Name,Heavy_Chain,Light_Chain
0,mAb1,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...
1,mAb2,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL...
2,mAb3,QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE...,DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL...
3,mAb4,EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...
4,mAb5,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPR...
5,mAb6,QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLE...,EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRL...
6,mAb7,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE...,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL...
7,mAb8,EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGL...,DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGK...
8,mAb9,QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKG...,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL...
9,mAb10,EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKL...


In [5]:
# Pull the identifier and the two chain sequences out of the table as plain lists.
name, Heavy_seq, Light_seq = (
    dataset[column].to_list()
    for column in ('Name', 'Heavy_Chain', 'Light_Chain')
)

Convert to FASTA Files

In [6]:
def write_fasta(path, ids, sequences):
    """Write one FASTA record per (id, sequence) pair to `path`.

    Parameters
    ----------
    path : str
        Destination file; it is created or overwritten.
    ids : sequence of str
        Record identifiers, used as the FASTA header text.
    sequences : sequence of str
        Sequences, paired positionally with `ids`.
    """
    # name/description are blanked so the header line carries only the id.
    records = (
        SeqRecord(Seq(sequence), id=record_id, name="", description="")
        for record_id, sequence in zip(ids, sequences)
    )
    with open(path, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")


# One file per chain; both share the same record ids so the two files stay in step.
write_fasta('seq_H.fasta', name, Heavy_seq)
write_fasta('seq_L.fasta', name, Light_seq)

Sequence Alignment with ANARCI

In [7]:
# Number each chain with ANARCI: -s imgt selects the IMGT scheme, -r limits the
# accepted chain type, and --csv writes the numbering as a CSV table.
# Both runs share the output prefix "seq_aligned"; the next cells read the results
# from seq_aligned_H.csv (heavy) and seq_aligned_KL.csv (light).
!ANARCI -i seq_H.fasta -o seq_aligned -s imgt -r heavy --csv
!ANARCI -i seq_L.fasta -o seq_aligned -s imgt -r light --csv

In [8]:
# Load the ANARCI numbering tables for inspection. seq_preprocessing() below re-reads
# both files itself and does not use these two variables.
H_aligned = pd.read_csv('seq_aligned_H.csv')
L_aligned = pd.read_csv('seq_aligned_KL.csv')

In [9]:
# https://github.com/Lailabcode/DeepSCM/blob/main/deepscm-master/seq_preprocessing.py

def seq_preprocessing(heavy_csv='seq_aligned_H.csv',
                      light_csv='seq_aligned_KL.csv',
                      out_path='seq_aligned_HL.txt'):
  """Merge the heavy and light numbering CSVs into fixed-width aligned sequences.

  Each antibody becomes one output line, "<id> <sequence>", where the sequence is
  145 heavy-chain slots followed by 127 light-chain slots (272 characters). A slot
  whose position column is absent from the CSV is written as '-'.

  Parameters
  ----------
  heavy_csv : str
      Heavy-chain numbering CSV. Must contain an "Id" column; the value in its
      first column is used as the record id.
  light_csv : str
      Light-chain numbering CSV, row-aligned with `heavy_csv`.
  out_path : str
      Text file to create or overwrite.

  Returns
  -------
  None

  Notes
  -----
  Rows are paired by position, not by id. NOTE(review): if the two CSVs ever
  contain different antibodies or a different row order, chains would be paired
  incorrectly -- confirm both files cover the same records.
  """
  infile_H = pd.read_csv(heavy_csv)
  infile_L = pd.read_csv(light_csv)

  # Heavy-chain slot order: 1..111, insertions 111A..111H, then 112I..112A
  # (descending), 112, and 113..128 -> 145 slots.
  H_positions = (
      [str(n) for n in range(1, 112)]
      + ['111' + letter for letter in 'ABCDEFGH']
      + ['112' + letter for letter in 'IHGFEDCBA']
      + ['112']
      + [str(n) for n in range(113, 129)]
  )
  # Light-chain slot order: 1..127 with no insertion slots -> 127 slots.
  L_positions = [str(n) for n in range(1, 128)]

  def _aligned_strings(frame, positions, n_rows):
    # Build one fixed-width string per row; positions missing from the frame stay '-'.
    present = {pos: frame[pos].tolist() for pos in positions if pos in frame.columns}
    return [
        ''.join(present[pos][row] if pos in present else '-' for pos in positions)
        for row in range(n_rows)
    ]

  N_mAbs = len(infile_H["Id"])
  record_ids = infile_H.iloc[:, 0].tolist()
  H_strings = _aligned_strings(infile_H, H_positions, N_mAbs)
  L_strings = _aligned_strings(infile_L, L_positions, N_mAbs)

  # All rows are assembled before the file is opened, and the context manager
  # closes the handle even if a write fails.
  with open(out_path, "w") as outfile:
    for record_id, heavy, light in zip(record_ids, H_strings, L_strings):
      outfile.write(str(record_id) + " " + heavy + light)
      outfile.write("\n")
  return

# Build seq_aligned_HL.txt from seq_aligned_H.csv and seq_aligned_KL.csv.
seq_preprocessing()

Read Aligned Sequence

In [10]:
def load_input_data(filename):
    """Read "<name> <sequence>" lines from a whitespace-delimited text file.

    Parameters
    ----------
    filename : str
        Path to the text file; each record is one line with the name first and
        the sequence second.

    Returns
    -------
    tuple[list[str], list[str]]
        `(name_list, seq_list)`, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two whitespace-separated fields.
    """
    name_list = []
    seq_list = []
    with open(filename) as datafile:
        for line in datafile:
            fields = line.split()
            # A blank or whitespace-only line (e.g. a trailing newline at the end of
            # the file) carries no record; skip it instead of failing on fields[0].
            if not fields:
                continue
            name_list.append(fields[0])
            seq_list.append(fields[1])
    return name_list, seq_list

In [11]:
# Read back the (name, aligned sequence) pairs from seq_aligned_HL.txt.
name_list, seq_list = load_input_data('seq_aligned_HL.txt')
# X starts out as the list of aligned strings; the encoding cell below rebinds it
# to a numeric array.
X = seq_list

One Hot Encoding of Aligned Sequence

In [12]:
def one_hot_encoder(s):
    """One-hot encode an aligned sequence string.

    Parameters
    ----------
    s : str
        Sequence made of the 20 amino-acid letters and the gap character '-'.

    Returns
    -------
    numpy.ndarray
        Float array of shape (21, len(s)); column j holds a single 1 in the row
        assigned to s[j]. Rows 0-19 are the amino acids in alphabetical order of
        their one-letter codes and row 20 is the gap.

    Raises
    ------
    KeyError
        If `s` contains a character outside the 21-symbol alphabet.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
    row_of = {symbol: row for row, symbol in enumerate(alphabet)}

    rows = [row_of[symbol] for symbol in s]
    encoded = np.zeros((len(alphabet), len(s)))
    encoded[rows, np.arange(len(s))] = 1

    return encoded

In [13]:
# Encode from seq_list rather than X: the original X = f(X) form broke when this cell
# was re-run, because X had already been replaced by the encoded array.
encoded = [one_hot_encoder(s=seq) for seq in seq_list]
# one_hot_encoder returns (21, length) per sequence; swap the last two axes so the
# stacked array is (n_sequences, length, 21).
X = np.transpose(np.asarray(encoded), (0, 2, 1))

Predict DeepSP Descriptors

In [14]:
def load_deepsp_model(json_path, weights_path):
    """Rebuild a saved Keras model from its JSON architecture and weight file.

    Parameters
    ----------
    json_path : str
        Path to the model architecture saved as JSON.
    weights_path : str
        Path to the matching weight file.

    Returns
    -------
    The compiled model (optimizer 'adam', loss and metric 'mae').
    """
    # The context manager closes the JSON file even if reading or parsing fails.
    with open(json_path, 'r') as json_file:
        model = model_from_json(json_file.read())
    # load weights into model
    model.load_weights(weights_path)
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])
    return model


# sappos
sap_pos = load_deepsp_model('Conv1D_regressionSAPpos.json', "Conv1D_regression_SAPpos.h5").predict(X)

# scmneg
scm_neg = load_deepsp_model('Conv1D_regressionSCMneg.json', "Conv1D_regression_SCMneg.h5").predict(X)

# scmpos
scm_pos = load_deepsp_model('Conv1D_regressionSCMpos.json', "Conv1D_regression_SCMpos.h5").predict(X)



In [15]:
# Column names: 'Name' first, then one column per (property, region) pair, with the
# properties in the same order as the prediction arrays are concatenated below.
# Assumes each model returns ten columns in this region order -- TODO confirm.
regions = ('CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3', 'CDR', 'Hv', 'Lv', 'Fv')
features = ['Name'] + [
    f'{prop}_{region}'
    for prop in ('SAP_pos', 'SCM_neg', 'SCM_pos')
    for region in regions
]

# Place names and the three prediction blocks side by side, then label the columns.
blocks = [pd.DataFrame(part) for part in (name_list, sap_pos, scm_neg, scm_pos)]
df = pd.concat(blocks, ignore_index=True, axis=1)
df.columns = features

df.to_csv('DeepSP_descriptors.csv', index=False)
df

Unnamed: 0,Name,SAP_pos_CDRH1,SAP_pos_CDRH2,SAP_pos_CDRH3,SAP_pos_CDRL1,SAP_pos_CDRL2,SAP_pos_CDRL3,SAP_pos_CDR,SAP_pos_Hv,SAP_pos_Lv,...,SCM_pos_CDRH1,SCM_pos_CDRH2,SCM_pos_CDRH3,SCM_pos_CDRL1,SCM_pos_CDRL2,SCM_pos_CDRL3,SCM_pos_CDR,SCM_pos_Hv,SCM_pos_Lv,SCM_pos_Fv
0,mAb1,2.134783,2.524245,14.445071,1.90474,3.589351,3.172182,27.496794,58.417648,30.517651,...,3.183182,19.583357,29.513483,116.769501,41.759361,55.548267,263.838928,907.113037,1219.444458,2109.088623
1,mAb2,1.844576,4.339117,8.942592,1.613968,10.41794,9.945271,38.409042,58.486862,44.568775,...,27.889265,22.455563,154.7043,23.503654,37.771931,90.647194,360.761566,1224.561279,1132.755249,2335.686523
2,mAb3,2.809425,1.746096,20.808647,0.617971,3.460975,5.031782,34.877113,63.382252,44.274189,...,71.399979,29.845194,41.844185,37.094913,39.442978,22.183475,246.880508,1165.597656,830.064209,1963.784912
3,mAb4,3.139667,0.300687,26.26099,1.863188,0.188002,6.295355,38.342857,73.466133,38.451836,...,27.552292,11.260792,56.283066,13.800642,66.964668,50.259052,225.27002,993.915222,1097.456543,2073.500244
4,mAb5,2.489059,0.111882,15.967413,2.553848,0.564827,1.848455,23.765207,61.603027,33.180843,...,42.208309,10.151648,77.493065,142.884079,81.866493,61.183517,415.96048,1010.628784,1221.234863,2209.069336
5,mAb6,9.645024,2.265561,23.064241,6.464199,0.987525,8.052277,49.345421,61.487507,48.577274,...,45.501659,18.139217,136.593094,18.756721,71.840881,66.386726,359.891602,1329.421021,1113.381714,2426.393555
6,mAb7,8.222817,2.838049,11.414651,4.315411,2.673587,5.908565,36.578148,50.778774,44.486637,...,52.311821,74.923622,82.05307,72.280479,45.879509,88.023949,428.916351,1007.111145,848.104492,1808.06604
7,mAb8,4.424239,2.095972,16.019022,2.523247,7.355127,1.797327,34.046429,60.678223,34.210297,...,51.687237,-3.795643,91.028336,14.377379,23.035267,32.922684,198.840347,1157.940308,944.083252,2085.386963
8,mAb9,1.428219,3.449205,5.459098,2.133244,2.771093,8.270716,23.198622,45.440292,37.316959,...,15.407248,8.21823,46.540298,8.14966,17.474276,35.439575,130.615524,1107.602173,954.662292,2039.47522
9,mAb10,2.341985,0.196968,10.534814,4.143441,6.528968,15.280798,40.839794,55.972923,49.084797,...,25.994398,14.516958,26.370356,19.893885,69.955307,26.277246,174.261017,1044.724976,1139.79187,2159.286621
