In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import warnings

In [2]:
# Dataset location: <resolved current working directory>/dataset/Henry_Mpro_pKa
root = Path().resolve()
path = root.joinpath('dataset', 'Henry_Mpro_pKa')

In [3]:
def read_data(name, base_path=None):
    """Load every data file of one dataset folder into a single DataFrame.

    Each regular file in ``<base_path>/<name>`` except ``structure.pdb`` is
    read as a headerless, tab-separated table with two columns; only the
    second column is kept and stored under the file's name.

    Parameters
    ----------
    name : str
        Name of the dataset sub-folder (e.g. ``'S01'``).
    base_path : str or pathlib.Path, optional
        Folder that contains the dataset sub-folders. Defaults to the
        module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        One column per data file, columns ordered by file name. Empty when
        the folder contains no data files.

    Raises
    ------
    ValueError
        If the files do not all contain the same number of rows.
    """
    dataset_path = (path if base_path is None else Path(base_path)) / name

    columns = {}
    # iterdir() yields entries in arbitrary, OS-dependent order; sorting by
    # name makes the column order reproducible between runs and machines.
    for dp in sorted(dataset_path.iterdir(), key=lambda entry: entry.name):
        # structure.pdb does not need to be read here; sub-directories
        # (e.g. .ipynb_checkpoints) are not data files either.
        if dp.name == 'structure.pdb' or not dp.is_file():
            continue
        temp_table = pd.read_table(dp, header=None, names=['#1', '#2'])
        columns[dp.name] = temp_table['#2'].tolist()

    # Building the frame in one step avoids the fragmentation that repeated
    # column inserts cause, so no global warning filter has to be installed.
    return pd.DataFrame(columns)

In [4]:
# Load every dataset folder into its own table.
(
    S01_table, S02_table, S03_table, S04_table, S05_table,
    S06_table, S07_table, S08_table, S09_table, S10_table,
    S11_table, p12_table, p13_table,
) = [
    read_data(name=label)
    for label in (
        'S01', 'S02', 'S03', 'S04', 'S05',
        'S06', 'S07', 'S08', 'S09', 'S10',
        'S11', 'p12', 'p13',
    )
]

# Column labels of the two reference tables.
S01_index = list(S01_table.columns)
p12_index = list(p12_table.columns)

# Columns present in both S01 and p12, kept in S01's order.
# NOTE(review): the remaining tables are assumed to contain all of these
# columns too; a missing one raises KeyError in the selection below.
same_residues = [column for column in S01_index if column in p12_index]

# Restrict every table to the shared columns.
(
    S01_filter, S02_filter, S03_filter, S04_filter, S05_filter,
    S06_filter, S07_filter, S08_filter, S09_filter, S10_filter,
    S11_filter, p12_filter, p13_filter,
) = [
    table[same_residues]
    for table in (
        S01_table, S02_table, S03_table, S04_table, S05_table,
        S06_table, S07_table, S08_table, S09_table, S10_table,
        S11_table, p12_table, p13_table,
    )
]

In [5]:
def sort_residues(data):
    """Return a copy of ``data`` with its columns ordered by residue number.

    The residue number is parsed from each column label as ``int(label[4:])``
    (presumably labels look like ``"GLU-166"`` -- three-letter code, one
    separator character, then the number; confirm against the data files).

    The columns are reordered directly instead of going through a transposed
    frame with a helper column: transposing a frame that mixes float data
    with an object-typed helper column upcasts every value to ``object``,
    which would leak into the returned frame. Here the original column
    dtypes and the original row index are preserved, and ``data`` itself is
    not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are residue names.

    Returns
    -------
    pandas.DataFrame
        Same rows and values as ``data``, columns in ascending residue
        number. Columns with equal numbers keep their relative order.

    Raises
    ------
    ValueError
        If a label's tail (from the fifth character on) is not an integer.
    """
    labels = list(data.columns)
    residue_numbers = [int(label[4:]) for label in labels]

    # sorted() is stable, so ties never reshuffle columns between runs.
    positions = sorted(range(len(labels)), key=residue_numbers.__getitem__)

    # Positional selection with a list returns a new frame (no view on `data`).
    return data.iloc[:, positions]


# Reorder the columns of every filtered table by residue number.
(
    S01_serial, S02_serial, S03_serial, S04_serial, S05_serial,
    S06_serial, S07_serial, S08_serial, S09_serial, S10_serial,
    S11_serial, p12_serial, p13_serial,
) = [
    sort_residues(filtered)
    for filtered in (
        S01_filter, S02_filter, S03_filter, S04_filter, S05_filter,
        S06_filter, S07_filter, S08_filter, S09_filter, S10_filter,
        S11_filter, p12_filter, p13_filter,
    )
]

In [6]:
# Pearson correlation between the columns of each table, one matrix per dataset.
(
    S01_corr_matrix, S02_corr_matrix, S03_corr_matrix, S04_corr_matrix,
    S05_corr_matrix, S06_corr_matrix, S07_corr_matrix, S08_corr_matrix,
    S09_corr_matrix, S10_corr_matrix, S11_corr_matrix, p12_corr_matrix,
    p13_corr_matrix,
) = [
    serial.corr(method="pearson")
    for serial in (
        S01_serial, S02_serial, S03_serial, S04_serial,
        S05_serial, S06_serial, S07_serial, S08_serial,
        S09_serial, S10_serial, S11_serial, p12_serial,
        p13_serial,
    )
]

# S01 correlation matrix minus the correlation matrix of every other dataset.
(
    S01_S02_diff, S01_S03_diff, S01_S04_diff, S01_S05_diff,
    S01_S06_diff, S01_S07_diff, S01_S08_diff, S01_S09_diff,
    S01_S10_diff, S01_S11_diff, S01_p12_diff, S01_p13_diff,
) = [
    S01_corr_matrix.sub(other_corr)
    for other_corr in (
        S02_corr_matrix, S03_corr_matrix, S04_corr_matrix, S05_corr_matrix,
        S06_corr_matrix, S07_corr_matrix, S08_corr_matrix, S09_corr_matrix,
        S10_corr_matrix, S11_corr_matrix, p12_corr_matrix, p13_corr_matrix,
    )
]

In [7]:
# Plain NumPy copies of the difference frames (labels dropped, values kept).
(
    S01_S02_matrix, S01_S03_matrix, S01_S04_matrix, S01_S05_matrix,
    S01_S06_matrix, S01_S07_matrix, S01_S08_matrix, S01_S09_matrix,
    S01_S10_matrix, S01_S11_matrix, S01_p12_matrix, S01_p13_matrix,
) = [
    np.array(diff_frame)
    for diff_frame in (
        S01_S02_diff, S01_S03_diff, S01_S04_diff, S01_S05_diff,
        S01_S06_diff, S01_S07_diff, S01_S08_diff, S01_S09_diff,
        S01_S10_diff, S01_S11_diff, S01_p12_diff, S01_p13_diff,
    )
]

In [13]:
def conclude_diff(matrix, subtraction):
    """Count the entries above the diagonal of ``matrix`` by absolute size.

    Every entry strictly above the main diagonal is taken in absolute value
    and assigned to one of five half-open bins:
    ``[0, 0.1)``, ``[0.1, 0.3)``, ``[0.3, 0.5)``, ``[0.5, 0.7)`` and
    ``[0.7, inf)``. NaN entries satisfy none of the comparisons and are
    therefore not counted in any bin.

    Parameters
    ----------
    matrix : array-like
        Two-dimensional (square) matrix of differences.
    subtraction : str
        Label of the comparison (e.g. ``"S01-S02"``); repeated on every row
        of the result.

    Returns
    -------
    pandas.DataFrame
        Five rows with the columns ``Subtraction``, ``Range`` and ``Count``.

    Raises
    ------
    ValueError
        If ``matrix`` is non-empty and not two-dimensional.
    """
    # (label, inclusive lower bound, exclusive upper bound); None = unbounded.
    # Keeping labels and edges together prevents them from drifting apart.
    bins = (
        ("0~0.1", 0.0, 0.1),
        ("0.1~0.3", 0.1, 0.3),
        ("0.3~0.5", 0.3, 0.5),
        ("0.5~0.7", 0.5, 0.7),
        ("0.7~2", 0.7, None),
    )

    values = np.asarray(matrix, dtype=float)
    if values.ndim == 2:
        n_rows, n_cols = values.shape
        # k=1 skips the diagonal, so each pair is looked at exactly once.
        upper = values[np.triu_indices(n_rows, k=1, m=n_cols)]
    elif values.size == 0:
        upper = values.ravel()
    else:
        raise ValueError("matrix must be two-dimensional")
    magnitudes = np.abs(upper)

    counts = []
    for _, lower, upper_bound in bins:
        in_bin = magnitudes >= lower
        if upper_bound is not None:
            in_bin &= magnitudes < upper_bound
        counts.append(int(np.count_nonzero(in_bin)))

    return pd.DataFrame(
        {
            "Subtraction": subtraction,
            "Range": [label for label, _, _ in bins],
            "Count": counts,
        },
        columns=["Subtraction", "Range", "Count"],
    )


# Bin the differences of every S01-vs-other comparison.
(
    S01_S02_conclude, S01_S03_conclude, S01_S04_conclude, S01_S05_conclude,
    S01_S06_conclude, S01_S07_conclude, S01_S08_conclude, S01_S09_conclude,
    S01_S10_conclude, S01_S11_conclude, S01_p12_conclude, S01_p13_conclude,
) = [
    conclude_diff(diff_matrix, pair_label)
    for diff_matrix, pair_label in (
        (S01_S02_matrix, "S01-S02"),
        (S01_S03_matrix, "S01-S03"),
        (S01_S04_matrix, "S01-S04"),
        (S01_S05_matrix, "S01-S05"),
        (S01_S06_matrix, "S01-S06"),
        (S01_S07_matrix, "S01-S07"),
        (S01_S08_matrix, "S01-S08"),
        (S01_S09_matrix, "S01-S09"),
        (S01_S10_matrix, "S01-S10"),
        (S01_S11_matrix, "S01-S11"),
        (S01_p12_matrix, "S01-p12"),
        (S01_p13_matrix, "S01-p13"),
    )
]

In [15]:
# Stack the per-comparison summaries into one table with a fresh 0..N-1 index.
# NOTE: this rebinds the name `conclude_diff`, which up to here referred to the
# binning function; re-run the cell that defines the function before calling
# it again.
conclude_diff = pd.concat(
    [
        S01_S02_conclude,
        S01_S03_conclude,
        S01_S04_conclude,
        S01_S05_conclude,
        S01_S06_conclude,
        S01_S07_conclude,
        S01_S08_conclude,
        S01_S09_conclude,
        S01_S10_conclude,
        S01_S11_conclude,
        S01_p12_conclude,
        S01_p13_conclude,
    ],
    ignore_index=True,
)

In [17]:
# Create the output folder first so the export also works on a fresh checkout,
# where ./processed/subtraction may not exist yet.
diff_data_csv = Path('./processed/subtraction/diff_data_v2.csv')
diff_data_csv.parent.mkdir(parents=True, exist_ok=True)
conclude_diff.to_csv(diff_data_csv, index=False)

In [31]:
# Row labels of the S01-S02 difference frame, used to name matrix positions.
serial_res = S01_S02_diff.index.tolist()


def extract_imp_diff(matrix, subtraction, residues=None):
    """List the pairs whose difference has an absolute value of at least 0.5.

    Only entries strictly above the main diagonal are inspected, row by row,
    so each pair is reported once. NaN entries fail the ``>= 0.5`` test and
    are skipped.

    Parameters
    ----------
    matrix : sequence of sequences or 2-D numpy.ndarray
        Square matrix of differences.
    subtraction : str
        Label of the comparison (e.g. ``"S01-S02"``); repeated on every row
        of the result.
    residues : sequence, optional
        Label for each matrix position, in matrix order. Defaults to the
        module-level ``serial_res``; pass it explicitly when the matrix was
        built from a frame with different labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``Residues`` (tuple of the two labels), ``Difference`` (the
        signed value), ``Range`` (``"0.5~0.7"`` or ``"0.7~2"``) and
        ``Subtraction``. Empty when no entry reaches 0.5.
    """
    labels = serial_res if residues is None else list(residues)

    imp_diff = pd.DataFrame(columns=["Residues", "Difference", "Range", "Subtraction"])
    imp_list = []
    imp_val = []
    imp_range = []

    # matrix[:-1]: the last row has nothing to the right of its diagonal.
    for row_pos, row in enumerate(matrix[:-1]):
        for col_pos, value in enumerate(row[row_pos + 1:], start=row_pos + 1):
            magnitude = abs(value)
            if magnitude >= 0.5:
                imp_list.append((labels[row_pos], labels[col_pos]))
                imp_val.append(value)
                imp_range.append("0.5~0.7" if magnitude < 0.7 else "0.7~2")

    imp_diff["Residues"] = imp_list
    imp_diff["Difference"] = imp_val
    imp_diff["Range"] = imp_range
    imp_diff["Subtraction"] = subtraction
    return imp_diff


# Collect the large-difference residue pairs of every S01-vs-other comparison.
(
    S01_S02_res, S01_S03_res, S01_S04_res, S01_S05_res,
    S01_S06_res, S01_S07_res, S01_S08_res, S01_S09_res,
    S01_S10_res, S01_S11_res, S01_p12_res, S01_p13_res,
) = [
    extract_imp_diff(diff_matrix, pair_label)
    for diff_matrix, pair_label in (
        (S01_S02_matrix, "S01-S02"),
        (S01_S03_matrix, "S01-S03"),
        (S01_S04_matrix, "S01-S04"),
        (S01_S05_matrix, "S01-S05"),
        (S01_S06_matrix, "S01-S06"),
        (S01_S07_matrix, "S01-S07"),
        (S01_S08_matrix, "S01-S08"),
        (S01_S09_matrix, "S01-S09"),
        (S01_S10_matrix, "S01-S10"),
        (S01_S11_matrix, "S01-S11"),
        (S01_p12_matrix, "S01-p12"),
        (S01_p13_matrix, "S01-p13"),
    )
]

In [34]:
# Stack the per-comparison pair lists into one table with a fresh 0..N-1 index.
diff_res = pd.concat(
    [
        S01_S02_res,
        S01_S03_res,
        S01_S04_res,
        S01_S05_res,
        S01_S06_res,
        S01_S07_res,
        S01_S08_res,
        S01_S09_res,
        S01_S10_res,
        S01_S11_res,
        S01_p12_res,
        S01_p13_res,
    ],
    ignore_index=True,
)

In [36]:
# Create the output folder first so the export also works on a fresh checkout,
# where ./processed/subtraction may not exist yet.
diff_residues_csv = Path('./processed/subtraction/diff_residues.csv')
diff_residues_csv.parent.mkdir(parents=True, exist_ok=True)
diff_res.to_csv(diff_residues_csv, index=False)