In [4]:
import warnings
import pandas as pd
from pathlib import Path

In [17]:
# lists of interesting residues
monomer_A = ['HIS_41', 'LYS_90', 'CYS_145', 'HIS_163', 'HIS_164', 'GLU_166', 'HIS_172']
monomer_B = ['HIS_347', 'LYS_396', 'CYS_451', 'HIS_469', 'HIS_470', 'GLU_472', 'HIS_478']

# path names
root = Path().resolve()
path = root/'dataset'/'Henry_Mpro_pKa'


# function to read the data
def read_data(name):
    table_data = pd.DataFrame({})
    dataset_path = path/name
    for dp in dataset_path.iterdir():
        if dp.name in monomer_A or dp.name in monomer_B:
            temp_table = pd.read_table(dp, header=None, names=['#1', '#2'])
            temp_array = (list)(temp_table['#2'])
            table_data[dp.name] = temp_array
            warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
    return table_data

In [18]:
S01_table = read_data(name='S01')
S02_table = read_data(name='S02')
S03_table = read_data(name='S03')
S04_table = read_data(name='S04')
S05_table = read_data(name='S05')
S06_table = read_data(name='S06')
S07_table = read_data(name='S07')
S08_table = read_data(name='S08')
S09_table = read_data(name='S09')
S10_table = read_data(name='S10')
S11_table = read_data(name='S11')
p12_table = read_data(name='p12')
p13_table = read_data(name='p13')

In [19]:
S01_table.head()

Unnamed: 0,HIS_470,HIS_164,HIS_347,HIS_172,HIS_163,GLU_472,HIS_478,LYS_90,LYS_396,HIS_469,GLU_166,CYS_451,CYS_145,HIS_41
0,1.77,2.28,4.4,5.04,1.79,2.18,5.73,11.38,11.37,2.52,1.68,11.72,13.04,4.04
1,2.16,1.5,4.39,5.15,2.46,1.87,5.83,11.37,11.37,1.5,2.39,11.75,13.05,4.27
2,1.74,1.72,4.39,5.4,2.43,3.64,4.79,10.54,11.41,2.75,2.14,11.33,12.71,3.52
3,2.23,2.56,4.52,5.21,1.32,3.2,4.87,11.47,11.37,1.85,1.88,11.89,12.97,4.28
4,1.69,1.25,4.51,5.18,2.32,3.14,4.76,11.4,11.42,3.26,1.51,11.21,13.01,3.32


In [40]:
# the function to perform pearson correlation analysis
def get_high_corr_features(table):
    corr_matrix = table.corr(method='pearson')
    threshold = 0.75

    corr_pairs = corr_matrix.stack().reset_index()
    corr_pairs.columns = ['Feature1', 'Feature2', 'Correlation']
    corr_features = corr_pairs[
        (corr_pairs['Correlation'].abs() > threshold) &
        (corr_pairs['Feature1'] != corr_pairs['Feature2'])
    ].sort_values(by='Correlation', ascending=False).drop_duplicates(subset=['Correlation'])

    feature_1_array = (list)(corr_features['Feature1'])
    feature_2_array = (list)(corr_features['Feature2'])

    high_features_array = list(set(feature_1_array + feature_2_array))

    return corr_matrix, corr_features, high_features_array

In [41]:
S01_corr_matrix, S01_corr_features, S01_high_features_array = get_high_corr_features(S01_table)
S02_corr_matrix, S02_corr_features, S02_high_features_array = get_high_corr_features(S02_table)
S03_corr_matrix, S03_corr_features, S03_high_features_array = get_high_corr_features(S03_table)
S04_corr_matrix, S04_corr_features, S04_high_features_array = get_high_corr_features(S04_table)
S05_corr_matrix, S05_corr_features, S05_high_features_array = get_high_corr_features(S05_table)
S06_corr_matrix, S06_corr_features, S06_high_features_array = get_high_corr_features(S06_table)
S07_corr_matrix, S07_corr_features, S07_high_features_array = get_high_corr_features(S07_table)
S08_corr_matrix, S08_corr_features, S08_high_features_array = get_high_corr_features(S08_table)
S09_corr_matrix, S09_corr_features, S09_high_features_array = get_high_corr_features(S09_table)
S10_corr_matrix, S10_corr_features, S10_high_features_array = get_high_corr_features(S10_table)
S11_corr_matrix, S11_corr_features, S11_high_features_array = get_high_corr_features(S11_table)
p12_corr_matrix, p12_corr_features, p12_high_features_array = get_high_corr_features(p12_table)
p13_corr_matrix, p13_corr_features, p13_high_features_array = get_high_corr_features(p13_table)

In [42]:
print(S01_corr_features)
print(S01_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.823668
52  HIS_172  GLU_166    -0.911223
['GLU_166', 'HIS_172', 'GLU_472', 'HIS_478']


In [43]:
print(S02_corr_features)
print(S02_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.857389
['GLU_472', 'HIS_478']


In [44]:
print(S03_corr_features)
print(S03_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.844867
['GLU_472', 'HIS_478']


In [45]:
print(S04_corr_features)
print(S04_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.848682
['GLU_472', 'HIS_478']


In [46]:
print(S05_corr_features)
print(S05_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.825217
['GLU_472', 'HIS_478']


In [47]:
print(S06_corr_features)
print(S06_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.753923
52  HIS_172  GLU_166    -0.789742
['GLU_166', 'HIS_172', 'GLU_472', 'HIS_478']


In [48]:
print(S07_corr_features)
print(S07_high_features_array)

   Feature1 Feature2  Correlation
52  HIS_172  GLU_166    -0.821920
76  GLU_472  HIS_478    -0.823446
['HIS_478', 'HIS_172', 'GLU_472', 'GLU_166']


In [49]:
print(S08_corr_features)
print(S08_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.830198
52  HIS_172  GLU_166    -0.881891
['GLU_166', 'HIS_172', 'GLU_472', 'HIS_478']


In [50]:
print(S09_corr_features)
print(S09_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.834356
52  HIS_172  GLU_166    -0.908449
['GLU_166', 'HIS_172', 'GLU_472', 'HIS_478']


In [51]:
print(S10_corr_features)
print(S10_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.835936
['GLU_472', 'HIS_478']


In [36]:
print(S11_corr_features)
print(S11_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478     -0.80807
['GLU_472', 'HIS_478']


In [52]:
print(p12_corr_features)
print(p12_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.807500
52  HIS_172  GLU_166    -0.857114
['GLU_166', 'HIS_172', 'GLU_472', 'HIS_478']


In [53]:
print(p13_corr_features)
print(p13_high_features_array)

   Feature1 Feature2  Correlation
76  GLU_472  HIS_478    -0.811433
['GLU_472', 'HIS_478']
