In [45]:
import numpy as np
import pandas as pd
from pathlib import Path
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
# path names
root = Path().resolve()
path = root/'dataset'/'Henry_Mpro_pKa'

In [4]:
# the function to read data
def read_data(name):
    table_data = pd.DataFrame({})
    dataset_path = path/name
    for dp in dataset_path.iterdir():
        # structure.pdb does not need to be read here
        if dp.name != 'structure.pdb':
            temp_table = pd.read_table(dp, header=None, names=['#1', '#2'])
            temp_array = (list)(temp_table['#2'])
            table_data[dp.name] = temp_array
            warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
    return table_data

In [5]:
S01_table = read_data(name='S01')
S05_table = read_data(name='S05')

S01_index = list(S01_table)
S05_index = list(S05_table)

same_residues = [x for x in S01_index if x in S05_index]

S01_filter = S01_table[same_residues]
S05_filter = S05_table[same_residues]

In [6]:
def sort_residues(data):
    data_t = data.T
    res_lists = list(data_t.index.values)
    data_t["order"] = None

    for i in range(len(res_lists)):
        tmp_idx = res_lists[i]
        tmp_val = int(tmp_idx[4:])
        data_t.loc[tmp_idx, "order"] = tmp_val

    data_sort = data_t.sort_values(by="order")
    sort_data = data_sort.T
    serial_data = sort_data.drop("order")
    return serial_data


S01_serial = sort_residues(S01_filter)
S05_serial = sort_residues(S05_filter)

In [9]:
S01_corr_matrix = S01_serial.corr(method="pearson")
S05_corr_matrix = S05_serial.corr(method="pearson")

S01_S05_diff = S01_corr_matrix.sub(S05_corr_matrix)

S01_S05_matrix = np.array(S01_S05_diff)

In [10]:
serial_res = list(S01_S05_diff.index.values)


def extract_high_value(matrix, threshold):
    filter_data = pd.DataFrame(columns=["Residue_1", "Residue_2", "Difference"])

    for i in range(len(matrix)-1):
        temp_list = matrix[i][i+1:]
        for j in range(len(temp_list)):
            if abs(temp_list[j]) > threshold:
                res_1 = serial_res[i]
                res_2 = serial_res[i+j+1]
                filter_data.loc[len(filter_data)] = [res_1, res_2, temp_list[j]]

    return filter_data


S01_S05_filter = extract_high_value(S01_S05_matrix, 0.5)

In [11]:
S01_S05_filter

Unnamed: 0,Residue_1,Residue_2,Difference
0,ASP_295,ARG_366,0.589254
1,ASP_354,TYR_360,-0.812065
2,TYR_360,ARG_366,0.887981
3,TYR_360,ARG_494,0.644191


In [26]:
diff_residues = ["ASP_295", "ASP_354", "TYR_360", "ARG_366", "ARG_494"]
S01_data = S01_table[diff_residues]
S05_data = S05_table[diff_residues]

S01_data.insert(S01_data.shape[1], 'Substrate', 'S01')
S05_data.insert(S05_data.shape[1], 'Substrate', 'S05')
merge_data = pd.concat([S01_data, S05_data], ignore_index=True)

In [33]:
x = merge_data.iloc[:, 0:5]
y = merge_data["Substrate"]

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [38]:
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

In [39]:
estimator.coef_

array([[-0.57969535, -0.64038782, -0.80696387, -0.07655472,  0.43989683]])

In [40]:
estimator.intercept_

array([10.01493588])

In [43]:
y_predict = estimator.predict(x_test)
print("The prediction of logistic regression: \n", y_predict)
print("The comparison between the prediction and actual results: \n", y_test == y_predict)

The prediction of logistic regression: 
 ['S05' 'S01' 'S01' 'S05' 'S05' 'S01' 'S01' 'S05' 'S05' 'S05' 'S05' 'S01'
 'S01' 'S01' 'S01' 'S01' 'S01' 'S01' 'S05' 'S05' 'S01' 'S05' 'S01' 'S05'
 'S01' 'S01' 'S01' 'S05' 'S05' 'S05' 'S05' 'S01' 'S01' 'S05' 'S05' 'S01'
 'S05' 'S05' 'S05' 'S01' 'S01' 'S01' 'S05' 'S05' 'S01' 'S05' 'S01' 'S05'
 'S01' 'S01' 'S05' 'S05' 'S05' 'S05' 'S01' 'S01' 'S01' 'S05' 'S05' 'S01'
 'S05' 'S01' 'S05' 'S01' 'S01' 'S05' 'S05' 'S05' 'S01' 'S05' 'S01' 'S01'
 'S05' 'S01' 'S01' 'S01' 'S05' 'S01' 'S05' 'S01' 'S05' 'S01' 'S01' 'S05'
 'S01' 'S01' 'S01' 'S01' 'S01' 'S05' 'S05' 'S05' 'S05' 'S01' 'S01' 'S05'
 'S05' 'S05' 'S01' 'S05' 'S01' 'S01' 'S05' 'S05' 'S01' 'S01' 'S05' 'S05'
 'S01' 'S05' 'S01' 'S01' 'S01' 'S05' 'S01' 'S01' 'S05' 'S05' 'S01' 'S01'
 'S01' 'S05' 'S05' 'S05' 'S01' 'S05' 'S05' 'S05' 'S05' 'S01' 'S01' 'S05'
 'S01' 'S01' 'S05' 'S01' 'S01' 'S05' 'S05' 'S05' 'S05' 'S01' 'S01' 'S05'
 'S01' 'S01' 'S01' 'S05' 'S01' 'S01' 'S01' 'S05' 'S01' 'S05' 'S05' 'S05'
 'S01' 'S0

In [44]:
score = estimator.score(x_test, y_test)
print("The accuracy is: ", score)

The accuracy is:  0.627906976744186


In [46]:
transfer = StandardScaler()
x_train_ss = transfer.fit_transform(x_train)
x_test_ss = transfer.transform(x_test)

In [47]:
x_train_ss

array([[ 0.66488551,  0.26234845,  0.2065311 ,  0.00814087,  1.46786599],
       [-1.23970534,  0.21431758,  0.02576121, -1.31585287, -0.70608503],
       [ 1.27707543,  0.11825582, -0.76962628, -1.501212  ,  0.1997279 ],
       ...,
       [ 1.30428387,  0.9988219 , -0.49847145, -0.03157894,  0.38089048],
       [ 0.01188293, -0.00982652,  0.92057214, -0.66709594, -0.6456975 ],
       [ 2.20216241, -0.16992944, -0.18212415, -0.21693806,  0.13934037]])

In [48]:
ss_estimator = LogisticRegression()
ss_estimator.fit(x_train_ss, y_train)

In [49]:
ss_estimator.coef_

array([[-0.43284546, -0.40739699, -0.8831407 , -0.06303534,  0.10851786]])

In [50]:
ss_estimator.intercept_

array([0.17833053])

In [51]:
y_predict_ss = estimator.predict(x_test_ss)
print("The results after standard scaled:\n")
print("The prediction of logistic regression: \n", y_predict_ss)
print("The comparison between the prediction and actual results: \n", y_test == y_predict_ss)

The results after standard scaled:

The prediction of logistic regression: 
 ['S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05'
 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05' 'S05



In [52]:
ss_score = estimator.score(x_test_ss, y_test)
print("The accuracy is: ", score)

The accuracy is:  0.627906976744186


