In [1]:
import os
import json
import random
import numpy as np
import pandas as pd
import math
import sys
import re
import statistics
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import lightgbm as lgb

In [2]:
def predict(data_x, data_y, test_x):
    """Fit a LightGBM regressor on ``(data_x, data_y)`` and predict ``test_x``.

    A fixed 20% of the supplied rows (``random_state=0``) is held out and used
    only as the early-stopping validation set; the remaining 80% is used to
    fit the booster. Training stops once the validation RMSE has not improved
    for 10 consecutive boosting rounds.

    Parameters
    ----------
    data_x : array-like
        Feature rows available for fitting (split internally into train/valid).
    data_y : array-like
        Regression targets aligned with ``data_x``.
    test_x : array-like
        Feature rows to predict.

    Returns
    -------
    The result of ``Booster.predict(test_x)``, one prediction per test row.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        data_x, data_y, test_size=0.20, random_state=0
    )
    lgbm_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': -1
    }
    lgb_train = lgb.Dataset(train_x, train_y, params={'verbose': -1})
    lgb_eval = lgb.Dataset(valid_x, valid_y, params={'verbose': -1})

    # Early stopping is configured through a callback: the
    # `early_stopping_rounds` / `evals_result` / `verbose_eval` keyword
    # arguments of lgb.train() were removed in LightGBM 4.0.
    # verbose=False keeps the run silent, as verbose_eval=False did.
    gbm = lgb.train(
        params=lgbm_params,
        train_set=lgb_train,
        valid_sets=[lgb_eval],
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    )

    # With no num_iteration given, Booster.predict uses the best iteration
    # recorded by early stopping when one exists.
    return gbm.predict(test_x)

In [3]:
# Read the dataset and keep only rows whose Temperature is not the "'-" marker.
df = (
    pd.read_csv('data/DDG_Dataset.csv')
    .loc[lambda frame: frame['Temperature'] != "'-"]
)
# Row count after filtering; later cells size their arrays with it.
ndatas = len(df)
print(df.columns)
df

Index(['Rowid', 'ID', 'Protein Name', 'Mutation', 'Chain', 'PDB',
       'Temperature', 'pH', 'ΔΔG', 'Reference', 'A', 'R', 'N', 'D', 'C', 'Q',
       'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V',
       'ASP', 'PHE', 'GLN', 'LYS', 'ILE', 'TYR', 'GLY', 'ASN', 'ARG', 'LEU',
       'TRP', 'ALA', 'THR', 'VAL', 'HIS', 'CYS', 'GLU', 'MET', 'PRO', 'SER'],
      dtype='object')


Unnamed: 0,Rowid,ID,Protein Name,Mutation,Chain,PDB,Temperature,pH,ΔΔG,Reference,...,TRP,ALA,THR,VAL,HIS,CYS,GLU,MET,PRO,SER
0,1,1,Tryptophan synthase alpha chain,E49M,A,1WQ5,298.95,7.0,4.60,PMID: 378988,...,0.0,4.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0
1,2,2,Tryptophan synthase alpha chain,E49Q,A,1WQ5,298.95,7.0,-2.50,PMID: 378988,...,0.0,4.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0
2,3,3,Endolysin,W138Y,A,2LZM,298.15,2.2,-1.71,PMID: 911878,...,0.0,5.0,4.0,4.0,0.0,0.0,2.0,2.0,1.0,2.0
3,4,6,Cellular tumor antigen p53,C242S,A,1TUP,283.15,7.2,-3.07,PMID: 1203434,...,0.0,0.0,0.0,1.0,2.0,4.0,2.0,3.0,1.0,2.0
4,5,7,Cellular tumor antigen p53,F134L,A,1TUP,283.15,7.2,-4.78,PMID: 1203434,...,0.0,2.0,5.0,3.0,0.0,5.0,4.0,1.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8569,8570,13318,Transcriptional repressor arc,V25A,A,1ARR,298.15,7.5,-0.40,PMID: 7664079,...,0.0,1.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,1.0
8570,8571,13319,Transcriptional repressor arc,V18A,A,1ARR,298.15,7.5,-0.50,PMID: 7664079,...,1.0,1.0,0.0,3.0,0.0,0.0,2.0,0.0,1.0,0.0
8571,8572,13320,Transcriptional repressor arc,V33A,A,1ARR,298.15,7.5,-2.10,PMID: 7664079,...,1.0,1.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,2.0
8572,8573,13321,Transcriptional repressor arc,S32A,A,1ARR,298.15,7.5,-3.80,PMID: 7664079,...,0.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,1.0


In [4]:
# Cast Temperature to float64 (presumably it was parsed as strings because of
# the "'-" entries filtered out at load time; confirm against the raw CSV).
df = df.astype({'Temperature': 'float64'})

# Identifier/metadata columns and the target itself are excluded from the features.
non_feature_columns = ['Rowid', 'ID', 'Protein Name', 'Mutation', 'Chain', 'PDB', 'ΔΔG', 'Reference']
feature_frame = df.drop(columns=non_feature_columns)

data_x = feature_frame.to_numpy()
data_y = df['ΔΔG'].to_numpy()

In [5]:
# 5-fold cross-validation: every row is predicted by a model fitted on the
# other four folds, and the prediction is written into its slot of pred_y.
kf = KFold(n_splits=5, shuffle=True, random_state=1234)
pred_y = np.zeros(ndatas)
row_positions = range(ndatas)
for index, (train_indices, valid_indices) in enumerate(kf.split(row_positions)):
    # Fold number, then the training and held-out positions, one per line.
    print(index, train_indices, valid_indices, sep='\n')
    train_x = data_x[train_indices]
    train_y = data_y[train_indices]
    test_x = data_x[valid_indices]
    pred_y[valid_indices] = predict(train_x, train_y, test_x)

0
[   0    3    4 ... 8532 8533 8534]
[   1    2    5 ... 8525 8530 8531]
1
[   1    2    3 ... 8531 8532 8534]
[   0    8    9 ... 8502 8511 8533]
2
[   0    1    2 ... 8531 8532 8533]
[  12   20   27 ... 8522 8527 8534]
3
[   0    1    2 ... 8532 8533 8534]
[   4   13   17 ... 8526 8528 8529]
4
[   0    1    2 ... 8531 8533 8534]
[   3    7   10 ... 8521 8524 8532]


In [6]:
# Add the predictions as a column alongside the measured ΔΔG, then save the table.
df = df.assign(**{'pred_ΔΔG': pred_y})
df.to_csv('data/pred_DDG_Dataset_lightGBM.csv')

In [7]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the measured and the predicted ΔΔG columns.
mse = mean_squared_error(y_true=df['ΔΔG'], y_pred=df['pred_ΔΔG'])
print(mse)

2.501240674852361
