In [1]:
import pandas as pd
import numpy as np

# Descriptor-distribution summary for one dataset: bin each descriptor, print
# the per-bin counts, and print every distribution as a plain list literal that
# can be pasted into the front-end JS chart data.

# Load the descriptor table (pass sep=... to read_csv if the file is not comma-delimited).
df = pd.read_csv('/Users/qiushifeng/Desktop/2025JantoJun_NewData/Unique/Overall.csv')

print("数据基本信息:")
print(f"总记录数: {len(df)}")
print("\n" + "="*50)

# 1. Molecular weight. right=False makes every bin left-closed: [lo, hi).
print("1. 分子量 (Molecular_Weight) 分布:")
mw_bins = [0, 200, 300, 400, 500, 600, 700, 800, 900, 1000, float('inf')]
mw_labels = ['<200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '800-900', '900-1000', '>1000']
df['MW_category'] = pd.cut(df['Molecular_Weight'], bins=mw_bins, labels=mw_labels, right=False)
mw_counts = df['MW_category'].value_counts().sort_index()
# tolist() yields native Python ints, so the printed list stays a valid JS
# array even when NumPy prints its scalars as np.int64(...).
mw_js_data = mw_counts.tolist()
print("分子量分布:")
for category, count in mw_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {mw_js_data}")

print("\n" + "="*50)

# 2. H-bond acceptors: one bucket per value 2..15 plus a pooled ">=16" bucket.
print("2. 氢键受体 (Num_H_Acceptors) 分布:")
hba_counts = df['Num_H_Acceptors'].value_counts().sort_index()
hba_tail = int(hba_counts[hba_counts.index >= 16].sum())
print("氢键受体分布:")
for hba, count in hba_counts[hba_counts.index < 16].items():
    print(f"{hba}: {count}")
# Always print the pooled bucket (even when it is 0) so the listing agrees
# with the array printed below.
print(f"≥16: {hba_tail}")

# JS array for HBA (2-15, >=16).
hba_js_data = [int(hba_counts.get(i, 0)) for i in range(2, 16)]
hba_js_data.append(hba_tail)
print(f"JS数组格式: {hba_js_data}")
# Values below 2 have no slot in the array; report them instead of dropping
# them silently.
hba_below = int(hba_counts[hba_counts.index < 2].sum())
if hba_below:
    print(f"警告: {hba_below} 条记录的 Num_H_Acceptors < 2, 未计入JS数组")

print("\n" + "="*50)

# 3. H-bond donors: one bucket per value 0..11 plus a pooled ">=12" bucket.
print("3. 氢键供体 (Num_H_Donors) 分布:")
hbd_counts = df['Num_H_Donors'].value_counts().sort_index()
hbd_tail = int(hbd_counts[hbd_counts.index >= 12].sum())
print("氢键供体分布:")
for hbd, count in hbd_counts[hbd_counts.index < 12].items():
    print(f"{hbd}: {count}")
print(f"≥12: {hbd_tail}")

# JS array for HBD (0-11, >=12).
hbd_js_data = [int(hbd_counts.get(i, 0)) for i in range(0, 12)]
hbd_js_data.append(hbd_tail)
print(f"JS数组格式: {hbd_js_data}")

print("\n" + "="*50)

# 4. cLogP: open-ended on both sides, so every finite value lands in a bin.
print("4. cLogP 分布:")
clogp_bins = [-float('inf'), -1, 0, 1, 2, 3, 4, 5, 6, 7, float('inf')]
clogp_labels = ['<-1', '-1-0', '0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '>7']
df['cLogP_category'] = pd.cut(df['cLogP'], bins=clogp_bins, labels=clogp_labels, right=False)
clogp_counts = df['cLogP_category'].value_counts().sort_index()
clogp_js_data = clogp_counts.tolist()
print("cLogP分布:")
for category, count in clogp_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {clogp_js_data}")

print("\n" + "="*50)

# 5. TPSA.
print("5. TPSA 分布:")
tpsa_bins = [0, 20, 50, 80, 110, 140, 170, 200, 230, 260, 290, float('inf')]
tpsa_labels = ['<20', '20-50', '50-80', '80-110', '110-140', '140-170', '170-200', '200-230', '230-260', '260-290', '>290']
df['TPSA_category'] = pd.cut(df['TPSA'], bins=tpsa_bins, labels=tpsa_labels, right=False)
tpsa_counts = df['TPSA_category'].value_counts().sort_index()
tpsa_js_data = tpsa_counts.tolist()
print("TPSA分布:")
for category, count in tpsa_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {tpsa_js_data}")

print("\n" + "="*50)

# 6. Macrocycle ring size: one bucket per value 12..23 plus a pooled ">=24" bucket.
print("6. 大环环大小 (Macrocycle_Ring_Size) 分布:")
mrs_counts = df['Macrocycle_Ring_Size'].value_counts().sort_index()
mrs_tail = int(mrs_counts[mrs_counts.index >= 24].sum())
print("大环环大小分布:")
for mrs, count in mrs_counts[mrs_counts.index < 24].items():
    print(f"{mrs}: {count}")
print(f"≥24: {mrs_tail}")

# JS array for ring size (12-23, >=24).
mrs_js_data = [int(mrs_counts.get(i, 0)) for i in range(12, 24)]
mrs_js_data.append(mrs_tail)
print(f"JS数组格式: {mrs_js_data}")
# Ring sizes below 12 have no slot in the array; report them instead of
# dropping them silently.
mrs_below = int(mrs_counts[mrs_counts.index < 12].sum())
if mrs_below:
    print(f"警告: {mrs_below} 条记录的 Macrocycle_Ring_Size < 12, 未计入JS数组")

print("\n" + "="*50)

# 7. Amide ratio.
print("7. 酰胺比例 (Amide_Ratio) 分布:")
ar_bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, float('inf')]
ar_labels = ['0-0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7', '>0.7']
df['AR_category'] = pd.cut(df['Amide_Ratio'], bins=ar_bins, labels=ar_labels, right=False)
ar_counts = df['AR_category'].value_counts().sort_index()
ar_js_data = ar_counts.tolist()
print("酰胺比例分布:")
for category, count in ar_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ar_js_data}")

print("\n" + "="*50)

# 8. Kier flexibility index.
print("8. Kier柔性指数 (Kier_index) 分布:")
ki_bins = [0, 4, 8, 12, 16, 20, 24, 28, float('inf')]
ki_labels = ['<4', '4-8', '8-12', '12-16', '16-20', '20-24', '24-28', '>28']
df['KI_category'] = pd.cut(df['Kier_index'], bins=ki_bins, labels=ki_labels, right=False)
ki_counts = df['KI_category'].value_counts().sort_index()
ki_js_data = ki_counts.tolist()
print("Kier柔性指数分布:")
for category, count in ki_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ki_js_data}")

print("\n" + "="*50)
print("所有统计完成!")

# Recap of every JS array in one place for copy-paste.
print("\n" + "="*50)
print("汇总 - 所有JS更新数据:")
print(f"分子量: {mw_js_data}")
print(f"氢键受体: {hba_js_data}")
print(f"氢键供体: {hbd_js_data}")
print(f"cLogP: {clogp_js_data}")
print(f"TPSA: {tpsa_js_data}")
print(f"大环环大小: {mrs_js_data}")
print(f"酰胺比例: {ar_js_data}")
print(f"Kier指数: {ki_js_data}")

数据基本信息:
总记录数: 4553

1. 分子量 (Molecular_Weight) 分布:
分子量分布:
<200: 0
200-300: 6
300-400: 130
400-500: 2163
500-600: 1482
600-700: 409
700-800: 146
800-900: 118
900-1000: 59
>1000: 40
JS数组格式: [0, 6, 130, 2163, 1482, 409, 146, 118, 59, 40]

2. 氢键受体 (Num_H_Acceptors) 分布:
氢键受体分布:
2: 5
3: 171
4: 612
5: 1037
6: 1224
7: 697
8: 324
9: 166
10: 83
11: 50
12: 58
13: 23
14: 27
15: 26
≥16: 50
JS数组格式: [5, 171, 612, 1037, 1224, 697, 324, 166, 83, 50, 58, 23, 27, 26, 50]

3. 氢键供体 (Num_H_Donors) 分布:
氢键供体分布:
0: 882
1: 1219
2: 1079
3: 723
4: 394
5: 125
6: 90
7: 18
8: 16
9: 3
10: 1
≥12: 3
JS数组格式: [882, 1219, 1079, 723, 394, 125, 90, 18, 16, 3, 1, 0, 3]

4. cLogP 分布:
cLogP分布:
<-1: 36
-1-0: 73
0-1: 237
1-2: 586
2-3: 993
3-4: 1200
4-5: 854
5-6: 431
6-7: 110
>7: 33
JS数组格式: [36, 73, 237, 586, 993, 1200, 854, 431, 110, 33]

5. TPSA 分布:
TPSA分布:
<20: 0
20-50: 170
50-80: 1274
80-110: 1324
110-140: 1057
140-170: 373
170-200: 182
200-230: 80
230-260: 34
260-290: 16
>290: 43
JS数组格式: [0, 170, 1274, 1324, 1057, 373, 182, 8

In [2]:

# Descriptor-distribution summary for one dataset: bin each descriptor, print
# the per-bin counts, and print every distribution as a plain list literal that
# can be pasted into the front-end JS chart data.

# Load the descriptor table (pass sep=... to read_csv if the file is not comma-delimited).
df = pd.read_csv('/Users/qiushifeng/Desktop/2025JantoJun_NewData/Unique/Caco-2.csv')

print("数据基本信息:")
print(f"总记录数: {len(df)}")
print("\n" + "="*50)

# 1. Molecular weight. right=False makes every bin left-closed: [lo, hi).
print("1. 分子量 (Molecular_Weight) 分布:")
mw_bins = [0, 200, 300, 400, 500, 600, 700, 800, 900, 1000, float('inf')]
mw_labels = ['<200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '800-900', '900-1000', '>1000']
df['MW_category'] = pd.cut(df['Molecular_Weight'], bins=mw_bins, labels=mw_labels, right=False)
mw_counts = df['MW_category'].value_counts().sort_index()
# tolist() yields native Python ints, so the printed list stays a valid JS
# array even when NumPy prints its scalars as np.int64(...).
mw_js_data = mw_counts.tolist()
print("分子量分布:")
for category, count in mw_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {mw_js_data}")

print("\n" + "="*50)

# 2. H-bond acceptors: one bucket per value 2..15 plus a pooled ">=16" bucket.
print("2. 氢键受体 (Num_H_Acceptors) 分布:")
hba_counts = df['Num_H_Acceptors'].value_counts().sort_index()
hba_tail = int(hba_counts[hba_counts.index >= 16].sum())
print("氢键受体分布:")
for hba, count in hba_counts[hba_counts.index < 16].items():
    print(f"{hba}: {count}")
# Always print the pooled bucket (even when it is 0) so the listing agrees
# with the array printed below.
print(f"≥16: {hba_tail}")

# JS array for HBA (2-15, >=16).
hba_js_data = [int(hba_counts.get(i, 0)) for i in range(2, 16)]
hba_js_data.append(hba_tail)
print(f"JS数组格式: {hba_js_data}")
# Values below 2 have no slot in the array; report them instead of dropping
# them silently.
hba_below = int(hba_counts[hba_counts.index < 2].sum())
if hba_below:
    print(f"警告: {hba_below} 条记录的 Num_H_Acceptors < 2, 未计入JS数组")

print("\n" + "="*50)

# 3. H-bond donors: one bucket per value 0..11 plus a pooled ">=12" bucket.
print("3. 氢键供体 (Num_H_Donors) 分布:")
hbd_counts = df['Num_H_Donors'].value_counts().sort_index()
hbd_tail = int(hbd_counts[hbd_counts.index >= 12].sum())
print("氢键供体分布:")
for hbd, count in hbd_counts[hbd_counts.index < 12].items():
    print(f"{hbd}: {count}")
print(f"≥12: {hbd_tail}")

# JS array for HBD (0-11, >=12).
hbd_js_data = [int(hbd_counts.get(i, 0)) for i in range(0, 12)]
hbd_js_data.append(hbd_tail)
print(f"JS数组格式: {hbd_js_data}")

print("\n" + "="*50)

# 4. cLogP: open-ended on both sides, so every finite value lands in a bin.
print("4. cLogP 分布:")
clogp_bins = [-float('inf'), -1, 0, 1, 2, 3, 4, 5, 6, 7, float('inf')]
clogp_labels = ['<-1', '-1-0', '0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '>7']
df['cLogP_category'] = pd.cut(df['cLogP'], bins=clogp_bins, labels=clogp_labels, right=False)
clogp_counts = df['cLogP_category'].value_counts().sort_index()
clogp_js_data = clogp_counts.tolist()
print("cLogP分布:")
for category, count in clogp_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {clogp_js_data}")

print("\n" + "="*50)

# 5. TPSA.
print("5. TPSA 分布:")
tpsa_bins = [0, 20, 50, 80, 110, 140, 170, 200, 230, 260, 290, float('inf')]
tpsa_labels = ['<20', '20-50', '50-80', '80-110', '110-140', '140-170', '170-200', '200-230', '230-260', '260-290', '>290']
df['TPSA_category'] = pd.cut(df['TPSA'], bins=tpsa_bins, labels=tpsa_labels, right=False)
tpsa_counts = df['TPSA_category'].value_counts().sort_index()
tpsa_js_data = tpsa_counts.tolist()
print("TPSA分布:")
for category, count in tpsa_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {tpsa_js_data}")

print("\n" + "="*50)

# 6. Macrocycle ring size: one bucket per value 12..23 plus a pooled ">=24" bucket.
print("6. 大环环大小 (Macrocycle_Ring_Size) 分布:")
mrs_counts = df['Macrocycle_Ring_Size'].value_counts().sort_index()
mrs_tail = int(mrs_counts[mrs_counts.index >= 24].sum())
print("大环环大小分布:")
for mrs, count in mrs_counts[mrs_counts.index < 24].items():
    print(f"{mrs}: {count}")
print(f"≥24: {mrs_tail}")

# JS array for ring size (12-23, >=24).
mrs_js_data = [int(mrs_counts.get(i, 0)) for i in range(12, 24)]
mrs_js_data.append(mrs_tail)
print(f"JS数组格式: {mrs_js_data}")
# Ring sizes below 12 have no slot in the array; report them instead of
# dropping them silently.
mrs_below = int(mrs_counts[mrs_counts.index < 12].sum())
if mrs_below:
    print(f"警告: {mrs_below} 条记录的 Macrocycle_Ring_Size < 12, 未计入JS数组")

print("\n" + "="*50)

# 7. Amide ratio.
print("7. 酰胺比例 (Amide_Ratio) 分布:")
ar_bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, float('inf')]
ar_labels = ['0-0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7', '>0.7']
df['AR_category'] = pd.cut(df['Amide_Ratio'], bins=ar_bins, labels=ar_labels, right=False)
ar_counts = df['AR_category'].value_counts().sort_index()
ar_js_data = ar_counts.tolist()
print("酰胺比例分布:")
for category, count in ar_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ar_js_data}")

print("\n" + "="*50)

# 8. Kier flexibility index.
print("8. Kier柔性指数 (Kier_index) 分布:")
ki_bins = [0, 4, 8, 12, 16, 20, 24, 28, float('inf')]
ki_labels = ['<4', '4-8', '8-12', '12-16', '16-20', '20-24', '24-28', '>28']
df['KI_category'] = pd.cut(df['Kier_index'], bins=ki_bins, labels=ki_labels, right=False)
ki_counts = df['KI_category'].value_counts().sort_index()
ki_js_data = ki_counts.tolist()
print("Kier柔性指数分布:")
for category, count in ki_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ki_js_data}")

print("\n" + "="*50)
print("所有统计完成!")

# Recap of every JS array in one place for copy-paste.
print("\n" + "="*50)
print("汇总 - 所有JS更新数据:")
print(f"分子量: {mw_js_data}")
print(f"氢键受体: {hba_js_data}")
print(f"氢键供体: {hbd_js_data}")
print(f"cLogP: {clogp_js_data}")
print(f"TPSA: {tpsa_js_data}")
print(f"大环环大小: {mrs_js_data}")
print(f"酰胺比例: {ar_js_data}")
print(f"Kier指数: {ki_js_data}")

数据基本信息:
总记录数: 555

1. 分子量 (Molecular_Weight) 分布:
分子量分布:
<200: 0
200-300: 3
300-400: 17
400-500: 163
500-600: 155
600-700: 94
700-800: 83
800-900: 35
900-1000: 4
>1000: 1
JS数组格式: [0, 3, 17, 163, 155, 94, 83, 35, 4, 1]

2. 氢键受体 (Num_H_Acceptors) 分布:
氢键受体分布:
2: 4
4: 44
5: 42
6: 97
7: 135
8: 79
9: 57
10: 36
11: 30
12: 14
13: 8
14: 5
15: 1
≥16: 3
JS数组格式: [4, 0, 44, 42, 97, 135, 79, 57, 36, 30, 14, 8, 5, 1, 3]

3. 氢键供体 (Num_H_Donors) 分布:
氢键供体分布:
0: 32
1: 87
2: 169
3: 156
4: 88
5: 18
6: 5
JS数组格式: [32, 87, 169, 156, 88, 18, 5, 0, 0, 0, 0, 0, 0]

4. cLogP 分布:
cLogP分布:
<-1: 0
-1-0: 8
0-1: 27
1-2: 72
2-3: 124
3-4: 102
4-5: 79
5-6: 108
6-7: 26
>7: 9
JS数组格式: [0, 8, 27, 72, 124, 102, 79, 108, 26, 9]

5. TPSA 分布:
TPSA分布:
<20: 0
20-50: 1
50-80: 34
80-110: 161
110-140: 162
140-170: 96
170-200: 65
200-230: 35
230-260: 1
260-290: 0
>290: 0
JS数组格式: [0, 1, 34, 161, 162, 96, 65, 35, 1, 0, 0]

6. 大环环大小 (Macrocycle_Ring_Size) 分布:
大环环大小分布:
12: 115
13: 50
14: 148
15: 62
16: 60
17: 5
18: 62
19: 15
20: 3
21: 19
2

In [3]:

# Descriptor-distribution summary for one dataset: bin each descriptor, print
# the per-bin counts, and print every distribution as a plain list literal that
# can be pasted into the front-end JS chart data.

# Load the descriptor table (pass sep=... to read_csv if the file is not comma-delimited).
df = pd.read_csv('/Users/qiushifeng/Desktop/2025JantoJun_NewData/Unique/MDCK.csv')

print("数据基本信息:")
print(f"总记录数: {len(df)}")
print("\n" + "="*50)

# 1. Molecular weight. right=False makes every bin left-closed: [lo, hi).
print("1. 分子量 (Molecular_Weight) 分布:")
mw_bins = [0, 200, 300, 400, 500, 600, 700, 800, 900, 1000, float('inf')]
mw_labels = ['<200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '800-900', '900-1000', '>1000']
df['MW_category'] = pd.cut(df['Molecular_Weight'], bins=mw_bins, labels=mw_labels, right=False)
mw_counts = df['MW_category'].value_counts().sort_index()
# tolist() yields native Python ints, so the printed list stays a valid JS
# array even when NumPy prints its scalars as np.int64(...).
mw_js_data = mw_counts.tolist()
print("分子量分布:")
for category, count in mw_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {mw_js_data}")

print("\n" + "="*50)

# 2. H-bond acceptors: one bucket per value 2..15 plus a pooled ">=16" bucket.
print("2. 氢键受体 (Num_H_Acceptors) 分布:")
hba_counts = df['Num_H_Acceptors'].value_counts().sort_index()
hba_tail = int(hba_counts[hba_counts.index >= 16].sum())
print("氢键受体分布:")
for hba, count in hba_counts[hba_counts.index < 16].items():
    print(f"{hba}: {count}")
# Always print the pooled bucket (even when it is 0) so the listing agrees
# with the array printed below.
print(f"≥16: {hba_tail}")

# JS array for HBA (2-15, >=16).
hba_js_data = [int(hba_counts.get(i, 0)) for i in range(2, 16)]
hba_js_data.append(hba_tail)
print(f"JS数组格式: {hba_js_data}")
# Values below 2 have no slot in the array; report them instead of dropping
# them silently.
hba_below = int(hba_counts[hba_counts.index < 2].sum())
if hba_below:
    print(f"警告: {hba_below} 条记录的 Num_H_Acceptors < 2, 未计入JS数组")

print("\n" + "="*50)

# 3. H-bond donors: one bucket per value 0..11 plus a pooled ">=12" bucket.
print("3. 氢键供体 (Num_H_Donors) 分布:")
hbd_counts = df['Num_H_Donors'].value_counts().sort_index()
hbd_tail = int(hbd_counts[hbd_counts.index >= 12].sum())
print("氢键供体分布:")
for hbd, count in hbd_counts[hbd_counts.index < 12].items():
    print(f"{hbd}: {count}")
print(f"≥12: {hbd_tail}")

# JS array for HBD (0-11, >=12).
hbd_js_data = [int(hbd_counts.get(i, 0)) for i in range(0, 12)]
hbd_js_data.append(hbd_tail)
print(f"JS数组格式: {hbd_js_data}")

print("\n" + "="*50)

# 4. cLogP: open-ended on both sides, so every finite value lands in a bin.
print("4. cLogP 分布:")
clogp_bins = [-float('inf'), -1, 0, 1, 2, 3, 4, 5, 6, 7, float('inf')]
clogp_labels = ['<-1', '-1-0', '0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '>7']
df['cLogP_category'] = pd.cut(df['cLogP'], bins=clogp_bins, labels=clogp_labels, right=False)
clogp_counts = df['cLogP_category'].value_counts().sort_index()
clogp_js_data = clogp_counts.tolist()
print("cLogP分布:")
for category, count in clogp_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {clogp_js_data}")

print("\n" + "="*50)

# 5. TPSA.
print("5. TPSA 分布:")
tpsa_bins = [0, 20, 50, 80, 110, 140, 170, 200, 230, 260, 290, float('inf')]
tpsa_labels = ['<20', '20-50', '50-80', '80-110', '110-140', '140-170', '170-200', '200-230', '230-260', '260-290', '>290']
df['TPSA_category'] = pd.cut(df['TPSA'], bins=tpsa_bins, labels=tpsa_labels, right=False)
tpsa_counts = df['TPSA_category'].value_counts().sort_index()
tpsa_js_data = tpsa_counts.tolist()
print("TPSA分布:")
for category, count in tpsa_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {tpsa_js_data}")

print("\n" + "="*50)

# 6. Macrocycle ring size: one bucket per value 12..23 plus a pooled ">=24" bucket.
print("6. 大环环大小 (Macrocycle_Ring_Size) 分布:")
mrs_counts = df['Macrocycle_Ring_Size'].value_counts().sort_index()
mrs_tail = int(mrs_counts[mrs_counts.index >= 24].sum())
print("大环环大小分布:")
for mrs, count in mrs_counts[mrs_counts.index < 24].items():
    print(f"{mrs}: {count}")
print(f"≥24: {mrs_tail}")

# JS array for ring size (12-23, >=24).
mrs_js_data = [int(mrs_counts.get(i, 0)) for i in range(12, 24)]
mrs_js_data.append(mrs_tail)
print(f"JS数组格式: {mrs_js_data}")
# Ring sizes below 12 have no slot in the array; report them instead of
# dropping them silently.
mrs_below = int(mrs_counts[mrs_counts.index < 12].sum())
if mrs_below:
    print(f"警告: {mrs_below} 条记录的 Macrocycle_Ring_Size < 12, 未计入JS数组")

print("\n" + "="*50)

# 7. Amide ratio.
print("7. 酰胺比例 (Amide_Ratio) 分布:")
ar_bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, float('inf')]
ar_labels = ['0-0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7', '>0.7']
df['AR_category'] = pd.cut(df['Amide_Ratio'], bins=ar_bins, labels=ar_labels, right=False)
ar_counts = df['AR_category'].value_counts().sort_index()
ar_js_data = ar_counts.tolist()
print("酰胺比例分布:")
for category, count in ar_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ar_js_data}")

print("\n" + "="*50)

# 8. Kier flexibility index.
print("8. Kier柔性指数 (Kier_index) 分布:")
ki_bins = [0, 4, 8, 12, 16, 20, 24, 28, float('inf')]
ki_labels = ['<4', '4-8', '8-12', '12-16', '16-20', '20-24', '24-28', '>28']
df['KI_category'] = pd.cut(df['Kier_index'], bins=ki_bins, labels=ki_labels, right=False)
ki_counts = df['KI_category'].value_counts().sort_index()
ki_js_data = ki_counts.tolist()
print("Kier柔性指数分布:")
for category, count in ki_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ki_js_data}")

print("\n" + "="*50)
print("所有统计完成!")

# Recap of every JS array in one place for copy-paste.
print("\n" + "="*50)
print("汇总 - 所有JS更新数据:")
print(f"分子量: {mw_js_data}")
print(f"氢键受体: {hba_js_data}")
print(f"氢键供体: {hbd_js_data}")
print(f"cLogP: {clogp_js_data}")
print(f"TPSA: {tpsa_js_data}")
print(f"大环环大小: {mrs_js_data}")
print(f"酰胺比例: {ar_js_data}")
print(f"Kier指数: {ki_js_data}")

数据基本信息:
总记录数: 305

1. 分子量 (Molecular_Weight) 分布:
分子量分布:
<200: 0
200-300: 2
300-400: 28
400-500: 211
500-600: 27
600-700: 21
700-800: 5
800-900: 4
900-1000: 2
>1000: 5
JS数组格式: [0, 2, 28, 211, 27, 21, 5, 4, 2, 5]

2. 氢键受体 (Num_H_Acceptors) 分布:
氢键受体分布:
4: 9
5: 129
6: 81
7: 41
8: 19
9: 13
11: 1
14: 8
≥16: 4
JS数组格式: [0, 0, 9, 129, 81, 41, 19, 13, 0, 1, 0, 0, 8, 0, 4]

3. 氢键供体 (Num_H_Donors) 分布:
氢键供体分布:
0: 4
1: 227
2: 21
3: 14
4: 27
5: 5
6: 2
7: 2
8: 2
≥12: 1
JS数组格式: [4, 227, 21, 14, 27, 5, 2, 2, 2, 0, 0, 0, 1]

4. cLogP 分布:
cLogP分布:
<-1: 1
-1-0: 0
0-1: 10
1-2: 59
2-3: 130
3-4: 64
4-5: 18
5-6: 18
6-7: 5
>7: 0
JS数组格式: [1, 0, 10, 59, 130, 64, 18, 18, 5, 0]

5. TPSA 分布:
TPSA分布:
<20: 0
20-50: 0
50-80: 45
80-110: 190
110-140: 41
140-170: 12
170-200: 6
200-230: 4
230-260: 5
260-290: 0
>290: 2
JS数组格式: [0, 0, 45, 190, 41, 12, 6, 4, 5, 0, 2]

6. 大环环大小 (Macrocycle_Ring_Size) 分布:
大环环大小分布:
12: 17
13: 187
14: 35
15: 9
16: 23
17: 5
18: 18
19: 5
20: 3
22: 1
≥24: 2
JS数组格式: [17, 187, 35, 9, 23, 5, 18, 5, 3, 

In [4]:

# Descriptor-distribution summary for one dataset: bin each descriptor, print
# the per-bin counts, and print every distribution as a plain list literal that
# can be pasted into the front-end JS chart data.

# Load the descriptor table (pass sep=... to read_csv if the file is not comma-delimited).
df = pd.read_csv('/Users/qiushifeng/Desktop/2025JantoJun_NewData/Unique/PAMPA.csv')

print("数据基本信息:")
print(f"总记录数: {len(df)}")
print("\n" + "="*50)

# 1. Molecular weight. right=False makes every bin left-closed: [lo, hi).
print("1. 分子量 (Molecular_Weight) 分布:")
mw_bins = [0, 200, 300, 400, 500, 600, 700, 800, 900, 1000, float('inf')]
mw_labels = ['<200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '800-900', '900-1000', '>1000']
df['MW_category'] = pd.cut(df['Molecular_Weight'], bins=mw_bins, labels=mw_labels, right=False)
mw_counts = df['MW_category'].value_counts().sort_index()
# tolist() yields native Python ints, so the printed list stays a valid JS
# array even when NumPy prints its scalars as np.int64(...).
mw_js_data = mw_counts.tolist()
print("分子量分布:")
for category, count in mw_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {mw_js_data}")

print("\n" + "="*50)

# 2. H-bond acceptors: one bucket per value 2..15 plus a pooled ">=16" bucket.
print("2. 氢键受体 (Num_H_Acceptors) 分布:")
hba_counts = df['Num_H_Acceptors'].value_counts().sort_index()
hba_tail = int(hba_counts[hba_counts.index >= 16].sum())
print("氢键受体分布:")
for hba, count in hba_counts[hba_counts.index < 16].items():
    print(f"{hba}: {count}")
# Always print the pooled bucket (even when it is 0) so the listing agrees
# with the array printed below.
print(f"≥16: {hba_tail}")

# JS array for HBA (2-15, >=16).
hba_js_data = [int(hba_counts.get(i, 0)) for i in range(2, 16)]
hba_js_data.append(hba_tail)
print(f"JS数组格式: {hba_js_data}")
# Values below 2 have no slot in the array; report them instead of dropping
# them silently.
hba_below = int(hba_counts[hba_counts.index < 2].sum())
if hba_below:
    print(f"警告: {hba_below} 条记录的 Num_H_Acceptors < 2, 未计入JS数组")

print("\n" + "="*50)

# 3. H-bond donors: one bucket per value 0..11 plus a pooled ">=12" bucket.
print("3. 氢键供体 (Num_H_Donors) 分布:")
hbd_counts = df['Num_H_Donors'].value_counts().sort_index()
hbd_tail = int(hbd_counts[hbd_counts.index >= 12].sum())
print("氢键供体分布:")
for hbd, count in hbd_counts[hbd_counts.index < 12].items():
    print(f"{hbd}: {count}")
print(f"≥12: {hbd_tail}")

# JS array for HBD (0-11, >=12).
hbd_js_data = [int(hbd_counts.get(i, 0)) for i in range(0, 12)]
hbd_js_data.append(hbd_tail)
print(f"JS数组格式: {hbd_js_data}")

print("\n" + "="*50)

# 4. cLogP: open-ended on both sides, so every finite value lands in a bin.
print("4. cLogP 分布:")
clogp_bins = [-float('inf'), -1, 0, 1, 2, 3, 4, 5, 6, 7, float('inf')]
clogp_labels = ['<-1', '-1-0', '0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '>7']
df['cLogP_category'] = pd.cut(df['cLogP'], bins=clogp_bins, labels=clogp_labels, right=False)
clogp_counts = df['cLogP_category'].value_counts().sort_index()
clogp_js_data = clogp_counts.tolist()
print("cLogP分布:")
for category, count in clogp_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {clogp_js_data}")

print("\n" + "="*50)

# 5. TPSA.
print("5. TPSA 分布:")
tpsa_bins = [0, 20, 50, 80, 110, 140, 170, 200, 230, 260, 290, float('inf')]
tpsa_labels = ['<20', '20-50', '50-80', '80-110', '110-140', '140-170', '170-200', '200-230', '230-260', '260-290', '>290']
df['TPSA_category'] = pd.cut(df['TPSA'], bins=tpsa_bins, labels=tpsa_labels, right=False)
tpsa_counts = df['TPSA_category'].value_counts().sort_index()
tpsa_js_data = tpsa_counts.tolist()
print("TPSA分布:")
for category, count in tpsa_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {tpsa_js_data}")

print("\n" + "="*50)

# 6. Macrocycle ring size: one bucket per value 12..23 plus a pooled ">=24" bucket.
print("6. 大环环大小 (Macrocycle_Ring_Size) 分布:")
mrs_counts = df['Macrocycle_Ring_Size'].value_counts().sort_index()
mrs_tail = int(mrs_counts[mrs_counts.index >= 24].sum())
print("大环环大小分布:")
for mrs, count in mrs_counts[mrs_counts.index < 24].items():
    print(f"{mrs}: {count}")
print(f"≥24: {mrs_tail}")

# JS array for ring size (12-23, >=24).
mrs_js_data = [int(mrs_counts.get(i, 0)) for i in range(12, 24)]
mrs_js_data.append(mrs_tail)
print(f"JS数组格式: {mrs_js_data}")
# Ring sizes below 12 have no slot in the array; report them instead of
# dropping them silently.
mrs_below = int(mrs_counts[mrs_counts.index < 12].sum())
if mrs_below:
    print(f"警告: {mrs_below} 条记录的 Macrocycle_Ring_Size < 12, 未计入JS数组")

print("\n" + "="*50)

# 7. Amide ratio.
print("7. 酰胺比例 (Amide_Ratio) 分布:")
ar_bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, float('inf')]
ar_labels = ['0-0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7', '>0.7']
df['AR_category'] = pd.cut(df['Amide_Ratio'], bins=ar_bins, labels=ar_labels, right=False)
ar_counts = df['AR_category'].value_counts().sort_index()
ar_js_data = ar_counts.tolist()
print("酰胺比例分布:")
for category, count in ar_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ar_js_data}")

print("\n" + "="*50)

# 8. Kier flexibility index.
print("8. Kier柔性指数 (Kier_index) 分布:")
ki_bins = [0, 4, 8, 12, 16, 20, 24, 28, float('inf')]
ki_labels = ['<4', '4-8', '8-12', '12-16', '16-20', '20-24', '24-28', '>28']
df['KI_category'] = pd.cut(df['Kier_index'], bins=ki_bins, labels=ki_labels, right=False)
ki_counts = df['KI_category'].value_counts().sort_index()
ki_js_data = ki_counts.tolist()
print("Kier柔性指数分布:")
for category, count in ki_counts.items():
    print(f"{category}: {count}")
print(f"JS数组格式: {ki_js_data}")

print("\n" + "="*50)
print("所有统计完成!")

# Recap of every JS array in one place for copy-paste.
print("\n" + "="*50)
print("汇总 - 所有JS更新数据:")
print(f"分子量: {mw_js_data}")
print(f"氢键受体: {hba_js_data}")
print(f"氢键供体: {hbd_js_data}")
print(f"cLogP: {clogp_js_data}")
print(f"TPSA: {tpsa_js_data}")
print(f"大环环大小: {mrs_js_data}")
print(f"酰胺比例: {ar_js_data}")
print(f"Kier指数: {ki_js_data}")

数据基本信息:
总记录数: 3724

1. 分子量 (Molecular_Weight) 分布:
分子量分布:
<200: 0
200-300: 3
300-400: 87
400-500: 1823
500-600: 1282
600-700: 290
700-800: 65
800-900: 87
900-1000: 53
>1000: 34
JS数组格式: [0, 3, 87, 1823, 1282, 290, 65, 87, 53, 34]

2. 氢键受体 (Num_H_Acceptors) 分布:
氢键受体分布:
2: 1
3: 171
4: 593
5: 859
6: 1035
7: 517
8: 230
9: 99
10: 47
11: 19
12: 45
13: 15
14: 22
15: 26
≥16: 45
JS数组格式: [1, 171, 593, 859, 1035, 517, 230, 99, 47, 19, 45, 15, 22, 26, 45]

3. 氢键供体 (Num_H_Donors) 分布:
氢键供体分布:
0: 846
1: 904
2: 880
3: 564
4: 302
5: 109
6: 84
7: 15
8: 14
9: 3
10: 1
≥12: 2
JS数组格式: [846, 904, 880, 564, 302, 109, 84, 15, 14, 3, 1, 0, 2]

4. cLogP 分布:
cLogP分布:
<-1: 35
-1-0: 66
0-1: 202
1-2: 475
2-3: 762
3-4: 1029
4-5: 751
5-6: 302
6-7: 78
>7: 24
JS数组格式: [35, 66, 202, 475, 762, 1029, 751, 302, 78, 24]

5. TPSA 分布:
TPSA分布:
<20: 0
20-50: 170
50-80: 1200
80-110: 999
110-140: 839
140-170: 265
170-200: 122
200-230: 45
230-260: 27
260-290: 16
>290: 41
JS数组格式: [0, 170, 1200, 999, 839, 265, 122, 45, 27, 16, 41]

6. 大