In [1]:
import sys
sys.path.append("../../smsl")

In [2]:
from smsl.config import ConfAgent
from smsl.makeDataframe import SpringAgent

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from scipy.stats import gaussian_kde, zscore

In [4]:
class HotSpotAgent(dict, ConfAgent):
    def __init__(self):
        ConfAgent.__init__(self)
        self.end_ns = self.time_num
        self.framesperns = self.frame_num // self.time_num
    def load_systems(self, system2offical_name):
        ori_path = os.getcwd()
        for system, system_offical_name in system2offical_name.items():
            os.chdir(f'../../{system}/d.fluctmatch')
            sp_agent = SpringAgent()
            sp_agent.load_TimeAgent()
            sp_agent.system_offical_name = system_offical_name
            self[system] = sp_agent
        os.chdir(ori_path)
    def read_all_k_b0_pairtype_df(self):
        for system, sp_agent in self.items():
            sp_agent.read_all_k_b0_pairtype_df()

In [5]:
system2offical_name = {
'propeller': 'Propeller',
'basket'   : 'Basket',
'chair'    : 'Chair',
'hybrid-i' : 'Hybrid-I',
'hybrid-ii': 'Hybrid-II',
'dsdna'    : 'dsDNA',
}

hs_agent = HotSpotAgent()
hs_agent.load_systems(system2offical_name)

In [6]:
hs_agent.read_all_k_b0_pairtype_df()

Reading data from: /home/user/workfolder/g4dna_protocol_demo/propeller/d.fluctmatch/bigtraj_fluctmatch/0000_0500/result/pairtypes_k_b0_cutoff_5.00.csv
Reading data from: /home/user/workfolder/g4dna_protocol_demo/propeller/d.fluctmatch/bigtraj_fluctmatch/0250_0750/result/pairtypes_k_b0_cutoff_5.00.csv
Reading data from: /home/user/workfolder/g4dna_protocol_demo/propeller/d.fluctmatch/bigtraj_fluctmatch/0500_1000/result/pairtypes_k_b0_cutoff_5.00.csv
Reading data from: /home/user/workfolder/g4dna_protocol_demo/propeller/d.fluctmatch/bigtraj_fluctmatch/0750_1250/result/pairtypes_k_b0_cutoff_5.00.csv
Reading data from: /home/user/workfolder/g4dna_protocol_demo/propeller/d.fluctmatch/bigtraj_fluctmatch/1000_1500/result/pairtypes_k_b0_cutoff_5.00.csv
Reading data from: /home/user/workfolder/g4dna_protocol_demo/propeller/d.fluctmatch/bigtraj_fluctmatch/1250_1750/result/pairtypes_k_b0_cutoff_5.00.csv
Reading data from: /home/user/workfolder/g4dna_protocol_demo/propeller/d.fluctmatch/bigtraj_fl

In [7]:
index_order = [
    'bm', 'mm',
    'hb',
    'ss_st', 'cs_st', 'cl_st', 'll_st',
    'rb', 'pr0', 'pr1',
    'bb0',
    'bbb', 'brb', 'bpr0', 'bpr1',
]

criteria = 1e-1

In [8]:
system_offical_name2category2counts = {}
for system, sp_agent in hs_agent.items():
    df = sp_agent.t_agent.mean.df_all_k
    if sp_agent.type_na=='dsdna':
        df = df[df['Strand_i']!='STRAND2']
        df = df[df['Category']!='cs_st']
    df = df[df['k']>criteria]
    categories, counts = np.unique(df['Category'], return_counts=True)
    category2counts = dict(zip(categories, counts))
    for category in index_order:
        category2counts.setdefault(category, 0)
    system_offical_name2category2counts[sp_agent.system_offical_name] = category2counts
df_category_counts = pd.DataFrame(system_offical_name2category2counts).loc[index_order]

In [9]:
## 表5.3 & 表B.10
df_category_counts

Unnamed: 0,Propeller,Basket,Chair,Hybrid-I,Hybrid-II,dsDNA
bm,44,40,41,40,39,0
mm,1,1,1,1,1,0
hb,163,141,141,141,131,441
ss_st,308,257,241,253,231,481
cs_st,86,67,107,79,88,0
cl_st,28,175,218,177,182,0
ll_st,89,128,93,136,69,0
rb,644,602,692,584,608,388
pr0,354,270,316,288,308,276
pr1,450,352,327,319,348,229


In [10]:
system_offical_name2category2kavg = {}
system_offical_name2category2kstd = {}
for system, sp_agent in hs_agent.items():
    time_label2kavg = {}
    df = sp_agent.t_agent.mean.df_all_k
    mask1 = df['Strand_i']!='STRAND2'
    mask2 = df['Category']!='DS_ST'
    mask3 = df['k']>criteria
    if sp_agent.type_na=='dsdna':
        mask = mask1 & mask2 & mask3
    else:
        mask = mask3
    for time_label, st_agent in sp_agent.t_agent.items():
        df = st_agent.df_all_k
        df = df[mask]
        ps_category2kavg = df.groupby('Category')['k'].mean()
        time_label2kavg[time_label] = ps_category2kavg
    category2kavg = pd.concat(time_label2kavg.values(), axis=1).mean(axis=1).to_dict()
    category2kstd = pd.concat(time_label2kavg.values(), axis=1).std(axis=1).to_dict()    
    system_offical_name2category2kavg[sp_agent.system_offical_name] = category2kavg
    system_offical_name2category2kstd[sp_agent.system_offical_name] = category2kstd
df_category_kavg = pd.DataFrame(system_offical_name2category2kavg)
df_category_kstd = pd.DataFrame(system_offical_name2category2kstd)

In [11]:
## 取小數點位數
df_category_kavg_r = df_category_kavg
mask1 = df_category_kavg_r.mean(axis=1)>10 ## 如果k_avg > 10, x.xx ± x.x
mask2 = np.logical_not(mask1)              ## 如果k_avg < 10, xx.x ± x.x
df_category_kavg_r2 = pd.concat([df_category_kavg_r[mask2].round(2), df_category_kavg_r[mask1].round(1)])
df_category_kavg_display = df_category_kavg_r2.fillna('')
df_category_kstd_display = df_category_kstd.round(1).fillna('')
df_category_kavg_kstd_display = (df_category_kavg_display.astype('str')+' ± '+df_category_kstd_display.astype('str')).replace(' ± ', '-').loc[index_order]

In [12]:
## 表5.4 & 表B.11
df_category_kavg_kstd_display

Unnamed: 0,Propeller,Basket,Chair,Hybrid-I,Hybrid-II,dsDNA
bm,4.29 ± 0.2,5.08 ± 0.2,6.12 ± 0.4,5.15 ± 0.1,5.12 ± 0.2,-
mm,4.91 ± 0.6,6.34 ± 0.3,6.27 ± 0.6,6.45 ± 0.5,3.12 ± 1.0,-
hb,1.94 ± 0.1,2.64 ± 0.0,2.12 ± 0.1,2.55 ± 0.1,2.56 ± 0.1,2.11 ± 0.1
ss_st,1.84 ± 0.1,2.3 ± 0.1,2.06 ± 0.1,2.31 ± 0.1,2.28 ± 0.1,1.42 ± 0.0
cs_st,0.58 ± 0.1,1.14 ± 0.1,1.49 ± 0.1,1.28 ± 0.0,1.5 ± 0.1,0.87 ± 0.0
cl_st,0.24 ± 0.2,1.3 ± 0.2,1.25 ± 0.2,1.59 ± 0.2,1.08 ± 0.3,-
ll_st,0.51 ± 0.3,0.88 ± 0.1,0.76 ± 0.2,0.91 ± 0.2,0.66 ± 0.2,-
rb,4.07 ± 0.2,4.47 ± 0.1,4.12 ± 0.1,4.36 ± 0.1,4.4 ± 0.1,5.32 ± 0.2
pr0,3.18 ± 0.2,4.29 ± 0.1,4.64 ± 0.2,4.21 ± 0.1,4.09 ± 0.3,3.72 ± 0.2
pr1,1.89 ± 0.1,3.06 ± 0.1,3.29 ± 0.2,3.36 ± 0.1,2.9 ± 0.1,3.38 ± 0.1


### Base-Metal (BM）

In [13]:
## 3.2 內文
atompair_per_rp = df_category_counts.loc['bm', ['Propeller','Basket','Chair','Hybrid-I','Hybrid-II']].mean()/16
print('Base-Metal:', atompair_per_rp, 'per residue pair')

Base-Metal: 2.55 per residue pair


### Hydrogen Bond (HB)

In [14]:
seq = hs_agent['dsdna'].strandid2sequence['STRAND1']
seq_a = [i for i,s in enumerate(seq) if s=='A']
seq_g = [i for i,s in enumerate(seq) if s=='G']
seq_t = [i for i,s in enumerate(seq) if s=='T']
basetype2seq = {
'A': [i for i,s in enumerate(seq) if s=='A'],
'G': [i for i,s in enumerate(seq) if s=='G'],
'T': [i for i,s in enumerate(seq) if s=='T'],
}

In [15]:
## 3.2 內文
atompair_per_rp = df_category_counts.loc['hb', ['Propeller','Basket','Chair','Hybrid-I','Hybrid-II']].mean()/12
print('G-quadruplex G:G pair of Hydrogen Bond:', f'{atompair_per_rp:.2f}', 'per residue pair')
df = hs_agent['dsdna'].t_agent.mean.df_all_k
df = df[df['k']>criteria]
df = df[df['Category']=='hb']
mask1 = np.isin(df['Resid_i'], basetype2seq['A'])
mask2 = np.isin(df['Resid_i'], basetype2seq['T'])
atompair_per_rp = len(df[mask1|mask2]) / 10
print('DsDNA        G:C pair of Hydrogen Bond:', f'{atompair_per_rp:.2f}', 'per residue pair')
mask1 = np.isin(df['Resid_i'], basetype2seq['G'])
atompair_per_rp = len(df[mask1]) / 12
print('DsDNA        A:T pair of Hydrogen Bond:', f'{atompair_per_rp:.2f}', 'per residue pair')

G-quadruplex G:G pair of Hydrogen Bond: 11.95 per residue pair
DsDNA        G:C pair of Hydrogen Bond: 19.70 per residue pair
DsDNA        A:T pair of Hydrogen Bond: 20.33 per residue pair


### Base Stacking (ST)

In [16]:
atompair_per_rp = df_category_counts.loc['ss_st', ['Propeller','Basket','Chair','Hybrid-I','Hybrid-II']].mean()/8
print('G-quadruplex Base Stacking:', f'{atompair_per_rp:.2f}', 'per residue pair')
df = hs_agent['dsdna'].t_agent.mean.df_all_k
df = df[df['k']>criteria]
df = df[df['Category']=='ss_st']
df = df[df['Strand_i']=='STRAND1']
sele_resids = [ 1,  7, 13, 19]
mask1 = np.isin(df['Resid_i'], sele_resids)
atompair_per_rp = len(df[mask1]) / len(sele_resids)
print('DsDNA     AG Base Stacking:', f'{atompair_per_rp:.2f}', 'per residue pair')
sele_resids = [ 2,  8, 14, 20]
mask1 = np.isin(df['Resid_i'], sele_resids)
atompair_per_rp = len(df[mask1]) / len(sele_resids)
print('DsDNA    GG1 Base Stacking:', f'{atompair_per_rp:.2f}', 'per residue pair')
sele_resids = [ 3,  9, 15, 21]
mask1 = np.isin(df['Resid_i'], sele_resids)
atompair_per_rp = len(df[mask1]) / len(sele_resids)
print('DsDNA    GG2 Base Stacking:', f'{atompair_per_rp:.2f}', 'per residue pair')
sele_resids = [ 4, 10, 16]
mask1 = np.isin(df['Resid_i'], sele_resids)
atompair_per_rp = len(df[mask1]) / len(sele_resids)
print('DsDNA     GT Base Stacking:', f'{atompair_per_rp:.2f}', 'per residue pair')
sele_resids = [ 5, 11, 17]
mask1 = np.isin(df['Resid_i'], sele_resids)
atompair_per_rp = len(df[mask1]) / len(sele_resids)
print('DsDNA     TT Base Stacking:', f'{atompair_per_rp:.2f}', 'per residue pair')

G-quadruplex Base Stacking: 32.25 per residue pair
DsDNA     AG Base Stacking: 20.75 per residue pair
DsDNA    GG1 Base Stacking: 23.75 per residue pair
DsDNA    GG2 Base Stacking: 26.50 per residue pair
DsDNA     GT Base Stacking: 30.67 per residue pair
DsDNA     TT Base Stacking: 24.00 per residue pair


### BackBone (rb, pr0, pr1)

In [17]:
atompair_per_rp = df_category_counts.loc['rb', ['Propeller','Basket','Chair','Hybrid-I','Hybrid-II']].mean()/22
print('G-quadruplex               ribose-base     :', f'{atompair_per_rp:.2f}', 'per residue pair')
atompair_per_rp = df_category_counts.loc['pr0', ['Propeller','Basket','Chair','Hybrid-I','Hybrid-II']].mean()/22
print('G-quadruplex same-residue  phosphate-ribose:', f'{atompair_per_rp:.2f}', 'per residue pair')
atompair_per_rp = df_category_counts.loc['pr1', ['Propeller','Basket','Chair','Hybrid-I','Hybrid-II']].mean()/21
print('G-quadruplex cross-residue phosphate-ribose:', f'{atompair_per_rp:.2f}', 'per residue pair')
print()
atompair_per_rp = df_category_counts.loc['rb', ['dsDNA']].mean()/22
print('DsDNA                      ribose-base     :', f'{atompair_per_rp:.2f}', 'per residue pair')
atompair_per_rp = df_category_counts.loc['pr0', ['dsDNA']].mean()/22
print('DsDNA        same-residue  phosphate-ribose:', f'{atompair_per_rp:.2f}', 'per residue pair')
atompair_per_rp = df_category_counts.loc['pr1', ['dsDNA']].mean()/21
print('DsDNA        cross-residue phosphate-ribose:', f'{atompair_per_rp:.2f}', 'per residue pair')

G-quadruplex               ribose-base     : 28.45 per residue pair
G-quadruplex same-residue  phosphate-ribose: 13.96 per residue pair
G-quadruplex cross-residue phosphate-ribose: 17.10 per residue pair

DsDNA                      ribose-base     : 17.64 per residue pair
DsDNA        same-residue  phosphate-ribose: 12.55 per residue pair
DsDNA        cross-residue phosphate-ribose: 10.90 per residue pair
