Install and Import Necessary Libraries

In [31]:
# Environment setup (the /content/ paths used later suggest this targets Google Colab).
# condacolab is installed with pip and then asked to set up conda in the runtime.
# NOTE(review): condacolab.install() presumably restarts the kernel on first run — confirm,
# and re-run the notebook from this cell if it does.
!pip install -q condacolab
import condacolab
condacolab.install()

# ANARCI is pulled from the bioconda channel; it is invoked further down as the `ANARCI` command.
!conda install -c bioconda anarci

✨🍰✨ Everything looks OK!
Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
Solving environment: - \ | done


    current version: 24.11.3
    latest version: 25.7.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.



In [32]:
# Import libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

from numpy.random import seed

# Import machine learning libraries
import tensorflow as tf

import keras
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import model_from_json, Sequential
from keras.layers import Conv1D, BatchNormalization, Dropout, MaxPooling1D, Flatten, Dense
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Import dataset

In [33]:
# Input table with one antibody per row. Point `input_csv` at your own file;
# see DeepSP_input.csv for the expected format (Name, Heavy_Chain, Light_Chain columns).
input_csv = '/content/DeepSP_input.csv'
dataset = pd.read_csv(input_csv)
dataset

Unnamed: 0,Name,Heavy_Chain,Light_Chain
0,mAb1,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...
1,mAb2,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL...
2,mAb3,QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE...,DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL...
3,mAb4,EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...
4,mAb5,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPR...
5,mAb6,QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLE...,EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRL...
6,mAb7,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE...,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL...
7,mAb8,EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGL...,DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGK...
8,mAb9,QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKG...,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL...
9,mAb10,EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKL...


In [34]:
# Pull the three input columns out as plain Python lists (row order is preserved,
# so index i refers to the same antibody in all three lists).
name, Heavy_seq, Light_seq = (
    dataset[column].to_list()
    for column in ('Name', 'Heavy_Chain', 'Light_Chain')
)

Convert to FASTA Files

In [35]:
def _write_fasta(path, ids, sequences):
    """Write one FASTA record per (id, sequence) pair to ``path``.

    Parameters
    ----------
    path : str
        Output file; created or truncated.
    ids : iterable
        Record identifiers. Each one is converted with ``str`` because
        ``SeqRecord`` only accepts string ids, while names read from a CSV
        can come back as numbers.
    sequences : iterable of str
        Amino-acid sequences, in the same order as ``ids``.
    """
    records = (
        SeqRecord(Seq(sequence), id=str(seq_id), name="", description="")
        for seq_id, sequence in zip(ids, sequences)
    )
    with open(path, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")


# Heavy and light chains go to separate files that share the same record ids,
# so the two ANARCI outputs can later be matched row for row.
_write_fasta('seq_H.fasta', name, Heavy_seq)
_write_fasta('seq_L.fasta', name, Light_seq)

Sequence Alignment with ANARCI

In [36]:
# Run ANARCI once per chain file. Both runs use the output prefix `seq_aligned`;
# the cells below read the resulting seq_aligned_H.csv and seq_aligned_KL.csv.
# NOTE(review): `-s imgt` presumably selects IMGT numbering, `-r heavy|light` restricts the
# chain type and `--csv` requests CSV output — confirm with `ANARCI --help`.
!ANARCI -i seq_H.fasta -o seq_aligned -s imgt -r heavy --csv
!ANARCI -i seq_L.fasta -o seq_aligned -s imgt -r light --csv

In [37]:
# Load both ANARCI numbering tables (heavy first, then kappa/lambda light) for inspection.
H_aligned, L_aligned = map(pd.read_csv, ('seq_aligned_H.csv', 'seq_aligned_KL.csv'))

In [38]:
# https://github.com/Lailabcode/DeepSCM/blob/main/deepscm-master/seq_preprocessing.py

def _aligned_rows(frame, positions):
    """Return one fixed-width string per row of ``frame``.

    Character k of each string is the value found in the column named
    ``positions[k]``; positions that have no column in ``frame`` become '-'.
    Columns of ``frame`` that are not listed in ``positions`` are ignored.
    """
    aligned = frame.reindex(columns=positions, fill_value='-')
    return [''.join(residues) for residues in aligned.to_numpy().tolist()]


def seq_preprocessing():
    """Merge the heavy- and light-chain ANARCI tables into fixed-width rows.

    Reads ``seq_aligned_H.csv`` and ``seq_aligned_KL.csv`` from the working
    directory and writes ``seq_aligned_HL.txt`` with one line per antibody::

        <first column of the heavy table> <145 heavy characters><127 light characters>

    The heavy layout is positions 1-111, insertions 111A-111H, insertions
    112I-112A, then 112-128 (145 slots). The light layout is positions 1-127.
    Any slot without a matching column in the CSV is written as '-'.

    Heavy and light rows are paired by row order, so the two tables must
    describe the same antibodies in the same order.

    Raises
    ------
    ValueError
        If the two tables have different row counts, or if both have an
        ``Id`` column and the ids differ. Either case means positional
        pairing would combine chains from different antibodies.
    """
    infile_H = pd.read_csv('seq_aligned_H.csv')
    infile_L = pd.read_csv('seq_aligned_KL.csv')

    # Slot order of the heavy-chain row; the index in this list is the output column.
    H_inclusion_list = (
        [str(pos) for pos in range(1, 112)]
        + ['111' + letter for letter in 'ABCDEFGH']
        # 112 insertions are laid out in reverse (112I ... 112A) before 112 itself.
        + ['112' + letter for letter in 'IHGFEDCBA']
        + [str(pos) for pos in range(112, 129)]
    )
    # Slot order of the light-chain row: plain positions only, 128 is not used.
    L_inclusion_list = [str(pos) for pos in range(1, 128)]

    N_mAbs = len(infile_H["Id"])
    if len(infile_L) != N_mAbs:
        raise ValueError(
            "seq_aligned_H.csv has %d rows but seq_aligned_KL.csv has %d; "
            "heavy and light chains cannot be paired row by row"
            % (N_mAbs, len(infile_L))
        )
    if "Id" in infile_L.columns:
        heavy_ids = infile_H["Id"].astype(str).tolist()
        light_ids = infile_L["Id"].astype(str).tolist()
        if heavy_ids != light_ids:
            raise ValueError(
                "Id columns of seq_aligned_H.csv and seq_aligned_KL.csv differ; "
                "heavy and light chains cannot be paired row by row"
            )

    H_rows = _aligned_rows(infile_H, H_inclusion_list)
    L_rows = _aligned_rows(infile_L, L_inclusion_list)

    # The context manager closes the file even if a write fails part-way.
    with open('seq_aligned_HL.txt', "w") as outfile:
        for mab_id, heavy_aligned, light_aligned in zip(infile_H.iloc[:, 0], H_rows, L_rows):
            # str() keeps numeric identifiers from breaking the concatenation.
            outfile.write(str(mab_id) + " " + heavy_aligned + light_aligned)
            outfile.write("\n")

    return

# Build seq_aligned_HL.txt from seq_aligned_H.csv and seq_aligned_KL.csv in the working directory.
seq_preprocessing()

Read Aligned Sequence

In [39]:
def load_input_data(filename):
    """Read a whitespace-separated "name sequence" file.

    Parameters
    ----------
    filename : str
        Path to a text file with one record per line: a name, whitespace,
        then the aligned sequence. Fields after the second are ignored.

    Returns
    -------
    (name_list, seq_list) : tuple of two lists of str
        Names and sequences in file order.

    Notes
    -----
    Blank or whitespace-only lines (for example a trailing newline at the
    end of the file) are skipped. A non-blank line with only one field
    raises IndexError.
    """
    name_list = []
    seq_list = []
    with open(filename) as datafile:
        for line in datafile:
            fields = line.split()
            if not fields:
                # Nothing on this line; do not treat it as a malformed record.
                continue
            name_list.append(fields[0])
            seq_list.append(fields[1])
    return name_list, seq_list

In [40]:
# name_list: record names; seq_list: the aligned strings, both in file order.
name_list, seq_list = load_input_data('seq_aligned_HL.txt')
# X starts out as the raw aligned strings.
X = seq_list

One Hot Encoding of Aligned Sequence

In [41]:
def one_hot_encoder(s):
    """One-hot encode an aligned amino-acid string.

    Rows follow the alphabet ``ACDEFGHIKLMNPQRSTVWY-`` (20 residues plus the
    gap character); columns follow the positions of ``s``.

    Returns a float array of shape ``(21, len(s))`` with a single 1 per column.
    Raises KeyError for any character outside the alphabet.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY-'
    row_of = {residue: row for row, residue in enumerate(alphabet)}

    rows = [row_of[residue] for residue in s]
    columns = np.arange(len(s))

    encoded = np.zeros((len(alphabet), len(s)))
    encoded[rows, columns] = 1
    return encoded

In [42]:
# Encode from seq_list rather than from X, so re-running this cell gives the same
# result instead of trying to encode an already-encoded array.
X = np.asarray([one_hot_encoder(s=seq) for seq in seq_list])
# one_hot_encoder returns (21, length) per sequence; move the 21 channels last:
# (n_sequences, 21, length) -> (n_sequences, length, 21).
X = np.transpose(X, (0, 2, 1))

Predict DeepSP Descriptors

In [43]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Name -> class lookup passed to model_from_json when the saved architectures are rebuilt.
# Add any other custom layers or classes here if a model needs them.
custom_objects = dict(
    Sequential=Sequential,
    Conv1D=Conv1D,
    BatchNormalization=BatchNormalization,
    Dropout=Dropout,
    MaxPooling1D=MaxPooling1D,
    Flatten=Flatten,
    Dense=Dense,
)

def load_and_compile_model(json_path, weights_path):
    """Rebuild a saved model from its JSON architecture and weight file.

    Parameters
    ----------
    json_path : str
        Path to the JSON file holding the model architecture.
    weights_path : str
        Path to the weight file loaded into the rebuilt model.

    Returns
    -------
    The model, with weights loaded and compiled with the 'adam' optimizer,
    'mae' loss and 'mae' metric.
    """
    with open(json_path, 'r') as json_file:
        architecture = json_file.read()

    # The module-level custom_objects mapping resolves the layer names in the JSON.
    model = model_from_json(architecture, custom_objects=custom_objects)
    model.load_weights(weights_path)
    model.compile(optimizer='adam', loss='mae', metrics=['mae'])
    return model

# X must already hold the encoded input from the cells above.

# Architecture / weight files for each predicted property, in prediction order.
model_files = {
    'sap_pos': ('/content/Conv1D_regressionSAPpos.json', '/content/Conv1D_regression_SAPpos.h5'),
    'scm_neg': ('/content/Conv1D_regressionSCMneg.json', '/content/Conv1D_regression_SCMneg.h5'),
    'scm_pos': ('/content/Conv1D_regressionSCMpos.json', '/content/Conv1D_regression_SCMpos.h5'),
}

# Load each model and predict straight away, one property at a time.
models = {}
predictions = {}
for target, (json_path, weights_path) in model_files.items():
    models[target] = load_and_compile_model(json_path, weights_path)
    predictions[target] = models[target].predict(X)

# Expose the results under the names the rest of the notebook uses.
sap_pos_model, scm_neg_model, scm_pos_model = (models[target] for target in model_files)
sap_pos, scm_neg, scm_pos = (predictions[target] for target in model_files)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step


In [44]:
# Column layout: 'Name', then the same ten region labels for SAP_pos, SCM_neg and SCM_pos.
# NOTE(review): this assumes each model predicts ten values per antibody — confirm.
regions = ['CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3', 'CDR', 'Hv', 'Lv', 'Fv']
features = ['Name'] + [
    f'{prop}_{region}'
    for prop in ('SAP_pos', 'SCM_neg', 'SCM_pos')
    for region in regions
]

# Put names and the three prediction arrays side by side, then label the columns.
blocks = [pd.DataFrame(part) for part in (name_list, sap_pos, scm_neg, scm_pos)]
df = pd.concat(blocks, ignore_index=True, axis=1)
df.columns = features

df.to_csv('DeepSP_descriptors.csv', index=False)
df

Unnamed: 0,Name,SAP_pos_CDRH1,SAP_pos_CDRH2,SAP_pos_CDRH3,SAP_pos_CDRL1,SAP_pos_CDRL2,SAP_pos_CDRL3,SAP_pos_CDR,SAP_pos_Hv,SAP_pos_Lv,...,SCM_pos_CDRH1,SCM_pos_CDRH2,SCM_pos_CDRH3,SCM_pos_CDRL1,SCM_pos_CDRL2,SCM_pos_CDRL3,SCM_pos_CDR,SCM_pos_Hv,SCM_pos_Lv,SCM_pos_Fv
0,mAb1,2.134782,2.524245,14.445071,1.904739,3.589351,3.17218,27.496794,58.417645,30.517658,...,3.183183,19.583355,29.513481,116.769516,41.759365,55.548294,263.838959,907.113159,1219.444458,2109.088623
1,mAb2,1.84458,4.339113,8.942587,1.613966,10.417938,9.945272,38.409042,58.486866,44.568775,...,27.889273,22.455549,154.7043,23.503662,37.771904,90.647186,360.761566,1224.561279,1132.755249,2335.686523
2,mAb3,2.809426,1.746096,20.808645,0.61797,3.460978,5.031785,34.877117,63.382244,44.274185,...,71.399971,29.845184,41.844193,37.094894,39.442986,22.183485,246.880508,1165.597656,830.064209,1963.784912
3,mAb4,3.139665,0.300689,26.26099,1.863188,0.188,6.295352,38.342857,73.466133,38.451824,...,27.552294,11.260792,56.28307,13.800647,66.964653,50.259041,225.27002,993.915222,1097.456543,2073.500244
4,mAb5,2.489057,0.111881,15.967417,2.553849,0.564824,1.848453,23.765205,61.603031,33.180843,...,42.208302,10.151639,77.493057,142.884079,81.866486,61.183514,415.96048,1010.628784,1221.234863,2209.069336
5,mAb6,9.645025,2.265562,23.064241,6.464194,0.987525,8.052279,49.345421,61.487507,48.577271,...,45.501659,18.139198,136.593094,18.756746,71.840927,66.386719,359.891632,1329.421143,1113.381714,2426.393799
6,mAb7,8.222816,2.838045,11.414648,4.315412,2.673584,5.908562,36.578152,50.778774,44.486637,...,52.311817,74.923622,82.053055,72.280479,45.879494,88.023956,428.916412,1007.111206,848.104553,1808.06604
7,mAb8,4.424241,2.095968,16.019024,2.523245,7.355125,1.797324,34.046429,60.678226,34.210297,...,51.687248,-3.795641,91.028336,14.377373,23.035265,32.922691,198.840347,1157.940308,944.083313,2085.386963
8,mAb9,1.428215,3.449207,5.459098,2.133243,2.771095,8.270719,23.198622,45.440292,37.316959,...,15.40725,8.218231,46.540302,8.149673,17.4743,35.439556,130.615555,1107.602173,954.662292,2039.475342
9,mAb10,2.341982,0.196966,10.534822,4.143444,6.528977,15.2808,40.839794,55.972927,49.084801,...,25.994394,14.516954,26.370365,19.893885,69.955307,26.277239,174.261017,1044.724976,1139.79187,2159.286621
