In [4]:
import pandas as pd

from d_2021.utils.import_utils import get_objs_in_csv
from d_2021.utils.math_utils import get_box_plot
from d_2021.utils.common_utils import *

In [5]:
def data_pre_threat():
    md_names, md_df = get_objs_in_csv("../dataset/train/Molecular_Descriptor.csv")
    print('总特征数:', len(md_names) - 1)
    error_names = []
    for md_name in md_names[1:]:
        row = md_df[md_name]
        row = str_list_2_int(row)
        min_num, Q1, Q2, Q3, max_num = get_box_plot(row, md_name, False)
        # 计算异常值数量
        error_count = 0
        for n in row:
            if n < min_num or n > max_num:
                error_count += 1
        if error_count > 100:
            error_names.append(md_name)
    print('异常值数量:', len(error_names))
    md_df_2 = remove_names(md_df, md_names, error_names)
    # 含0大于0.9的列
    zero_names = []
    for md_name in md_names[1:]:
        row = md_df_2[md_name]
        p = count_zero(row)
        if p > 0.9:
            zero_names.append(md_name)
    print('0值大于90%数量:', len(zero_names))
    md_df_3 = remove_names(md_df, md_names, zero_names)
    print('筛选后总数', len(md_names) - 1)
    return md_df_3, md_names

In [6]:
pre_threat_df, pre_threat_names = data_pre_threat()
pre_threat_df, pre_threat_names

总特征数: 729
异常值数量: 125
0值大于90%数量: 327
筛选后总数 277


(                                                 SMILES nAcid    ALogP  \
 0     Oc1ccc2O[C@H]([C@H](Sc2c1)C3CCCC3)c4ccc(OCCN5C...     0   -0.286   
 1     Oc1ccc2O[C@H]([C@H](Sc2c1)C3CCCCCC3)c4ccc(OCCN...     0   -0.862   
 2     Oc1ccc(cc1)[C@H]2Sc3cc(O)ccc3O[C@H]2c4ccc(OCCN...     0   0.7296   
 3     Oc1ccc2O[C@H]([C@@H](CC3CCCCC3)Sc2c1)c4ccc(OCC...     0  -0.3184   
 4     Oc1ccc2O[C@H]([C@@H](Cc3ccccc3)Sc2c1)c4ccc(OCC...     0   1.3551   
 ...                                                 ...   ...      ...   
 1969  COc1cc(OC)cc(\C=C\c2cccc(OS(=O)(=O)C3CC4OC3C(=...     0   1.8193   
 1970  Oc1ccc(cc1)C2=C(C3OC2CC3S(=O)(=O)Oc4cccc(\C=C\...     0   1.6903   
 1971  Oc1ccc(cc1)C2=C(C3OC2CC3S(=O)(=O)Oc4ccc(\C=C\c...     0   1.6903   
 1972  Oc1ccc(cc1)C2=C([C@H]3O[C@H]2C[C@@H]3S(=O)(=O)...     0   1.3365   
 1973  COc1cc(OC)cc(\C=C\c2ccc(OS(=O)(=O)[C@H]3C[C@H]...     0   1.8193   
 
           ALogp2       AMR       apol naAromAtom nAromBond nAtom nHeavyAtom  \
 0       0.081796 

In [7]:
era_names, era_df = get_objs_in_csv("../dataset/train/ERa_activity.csv")

In [8]:
def to_float_df(m_list: list):
    t_list = []
    for x in m_list:
        t_list.append(float(x))
    return pd.DataFrame(t_list)

In [39]:
from tqdm import tqdm


def get_sequence_divide_mean(m_df: pd.DataFrame, names: list):
    """
    计算每个值和当前列平均值的商
    :param m_df: 数据表
    :param names: 名称
    :return: 算好的表格
    """
    t_df = m_df.copy()
    for name in tqdm(names):
        t1_df = t_df[name]
        t_df_mean = to_float_df(t1_df.to_list()).mean()[0]
        for i in range(len(t1_df)):
            if type(t1_df[i]) == str:
                t1_df[i] = float(t1_df[i].strip())
            t1_df[i] = t1_df[i] / t_df_mean
    return t_df[names]

In [26]:
import os


def get_sequence(m_df, names, csv_path):
    """
    得到计算好的序列
    :param m_df: 数据表
    :param names: 名称
    :param csv_path: 路径
    :return: 处理好的序列
    """
    if os.path.exists(csv_path):
        sequence = pd.read_csv(csv_path)
    else:
        sequence = get_sequence_divide_mean(m_df, names)
        sequence.to_csv(csv_path, index=False)
    return sequence

In [27]:
# 得到母序列均值商
csv1_path = '../results/parent_sequence.csv'
parent_sequence = get_sequence(era_df, ['pIC50'], csv1_path)
# 得到子序列均值商
csv2_path = '../results/child_sequence.csv'
child_names = pre_threat_names[1:]
child_sequence = get_sequence(pre_threat_df, child_names, csv2_path)

In [35]:
parent_sequence['pIC50'][0]

1.3060669875566957

In [49]:
local_min = 100
local_max = 0
for name in child_names:
    # 对每一个子序列，计算对应值
    sequence = child_sequence[name]
    for i in range(len(sequence)):
        # 每一行的数相减
        t1 = sequence[i]
        t2 = parent_sequence['pIC50'][i]
        t = abs(t1 - t2)
        if t < local_min:
            local_min = t
        if t > local_max:
            local_max = t
a, b = local_min, local_max
a, b

KeyboardInterrupt: 

In [46]:
child_sequence

Unnamed: 0,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,...,MW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,-0.257620,0.024873,1.082034,1.223396,0.776860,0.741223,1.260788,1.102714,1.456989,1.150040,...,1.123157,1.121186,1.017490,0.988184,1.014120,0.706576,1.111227,1.015521,1.573647,1.101407
1,-0.776462,0.225953,1.131994,1.325450,0.776860,0.741223,1.378987,1.173857,1.633594,1.238505,...,1.194838,1.193685,1.017631,0.989174,1.014677,0.706580,1.297601,1.166769,1.957447,1.154487
2,0.657200,0.161872,1.200531,1.221661,1.165289,1.111834,1.221388,1.173857,1.280385,1.194273,...,1.184436,1.190031,1.014516,1.150397,1.303913,0.706579,1.307196,1.123555,0.999633,1.167757
3,-0.286805,0.030828,1.145209,1.325450,0.776860,0.741223,1.378987,1.173857,1.633594,1.238505,...,1.194838,1.192364,1.016505,0.987988,1.014009,0.706575,1.326387,1.080342,2.028608,1.154487
4,1.220631,0.558400,1.228499,1.259459,1.165289,1.111834,1.260788,1.173857,1.368687,1.238505,...,1.179375,1.192364,1.016505,0.987988,1.014009,0.706575,1.326387,1.080342,1.504846,1.154487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1969,1.638767,1.006495,1.524418,1.470641,1.553719,1.482446,1.438086,1.529571,1.324536,1.503899,...,1.529615,1.535554,1.004641,1.626838,2.512373,0.000000,2.628047,1.512478,0.851914,1.565856
1970,1.522568,0.868821,1.437971,1.368587,1.553719,1.482446,1.319887,1.458428,1.147931,1.415434,...,1.457934,1.465485,1.005569,1.592250,2.450396,0.000000,2.277444,1.426051,0.635394,1.512776
1971,1.522568,0.868821,1.437971,1.368587,1.553719,1.482446,1.319887,1.458428,1.147931,1.415434,...,1.457934,1.465460,1.005552,1.591875,2.449857,0.000000,2.369708,1.426051,0.635394,1.512776
1972,1.203877,0.543176,1.077244,1.043898,1.165289,1.111834,1.004690,1.102714,0.883024,1.061576,...,1.115179,1.110794,1.008060,1.267333,1.869012,0.000000,0.953271,1.080342,0.263736,1.154487


In [47]:
parent_sequence

Unnamed: 0,pIC50
0,1.306067
1,1.233643
2,1.291947
3,1.276763
4,1.234554
...,...
1969,0.931040
1970,0.888375
1971,1.172302
1972,1.197355


In [38]:
from tqdm import *

copy_child = child_sequence.copy()
p = 0.5
for name in tqdm(child_names):
    # 对每一个子序列，计算对应值
    sequence = copy_child[name]
    for i in range(len(sequence)):
        # 计算关联度
        sequence[i] = (a + p * b) / (abs(sequence[i] - parent_sequence['pIC50'][i]) + p * b)
copy_child

100%|██████████| 277/277 [02:18<00:00,  2.00it/s]


Unnamed: 0,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,...,MW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,0.998565,0.998824,0.999794,0.999924,0.999514,0.999481,0.999958,0.999813,0.999861,0.999857,...,0.999832,0.999830,0.999735,0.999708,0.999732,0.999449,0.999821,0.999733,0.999754,0.999812
1,0.998156,0.999075,0.999907,0.999916,0.999580,0.999548,0.999866,0.999945,0.999633,0.999996,...,0.999964,0.999963,0.999802,0.999775,0.999799,0.999516,0.999941,0.999939,0.999335,0.999927
2,0.999417,0.998962,0.999916,0.999935,0.999884,0.999834,0.999935,0.999891,0.999989,0.999910,...,0.999901,0.999906,0.999745,0.999870,0.999989,0.999462,0.999986,0.999845,0.999731,0.999886
3,0.998565,0.998856,0.999879,0.999955,0.999541,0.999508,0.999906,0.999905,0.999672,0.999965,...,0.999925,0.999922,0.999761,0.999735,0.999759,0.999476,0.999954,0.999820,0.999309,0.999888
4,0.999987,0.999379,0.999994,0.999977,0.999936,0.999887,0.999976,0.999944,0.999877,0.999996,...,0.999949,0.999961,0.999800,0.999773,0.999797,0.999515,0.999916,0.999858,0.999752,0.999926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1969,0.999350,0.999931,0.999455,0.999504,0.999428,0.999493,0.999534,0.999450,0.999638,0.999474,...,0.999450,0.999445,0.999932,0.999361,0.998549,0.999145,0.998443,0.999466,0.999927,0.999417
1970,0.999417,0.999982,0.999495,0.999559,0.999389,0.999454,0.999604,0.999476,0.999761,0.999516,...,0.999477,0.999470,0.999892,0.999353,0.998566,0.999184,0.998725,0.999506,0.999768,0.999426
1971,0.999678,0.999721,0.999756,0.999820,0.999650,0.999715,0.999864,0.999737,0.999978,0.999777,...,0.999738,0.999731,0.999847,0.999615,0.998827,0.998924,0.998901,0.999767,0.999507,0.999687
1972,0.999994,0.999399,0.999890,0.999859,0.999971,0.999921,0.999823,0.999913,0.999711,0.999875,...,0.999924,0.999920,0.999826,0.999936,0.999383,0.998901,0.999776,0.999892,0.999143,0.999961


In [40]:
def get_sequence_mean(m_df: pd.DataFrame):
    """
    计算只有一列的DataFrame均值
    :param m_df:
    :return:
    """
    return m_df.sum() / len(m_df)

In [45]:
sort_list = []
for name in tqdm(child_names):
    t = get_sequence_mean(child_sequence[name])
    sort_list.append({name: t})
sort_list = sorted(sort_list, key=lambda x: x[0])
sort_list

100%|██████████| 277/277 [00:00<00:00, 33450.10it/s]


KeyError: 0