In [1]:
# module Import

# %matplotlib inline
import matplotlib.pyplot as plt # pip install matplotlib
from CIMtools.preprocessing import Fragmentor # pip install pillow==9.4.0
from CIMtools.preprocessing import solvent
from os import environ

## environ['PATH']+=":/home/pavel/envs/cgr/bin"
## needs to be reassigned if ISIDA descriptor generation is needed

from CGRtools import RDFRead, MoleculeContainer, ReactionContainer, SDFRead, SMILESRead, smiles, CGRContainer  # pip install CGRtools
import pandas as pd  # pip install pandas
from pandas import DataFrame 
import numpy as np
import pickle 

from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score, KFold, cross_val_predict, GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVR
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error as mae

from scipy.optimize import curve_fit

from ChemInfoTools.cheminfotools.chem_features import Augmentor, ComplexFragmentor, PassThrough, Pruner

import itertools
import xgboost # pip install xgboost
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor # pip install catboost

import torch
import torch.nn as nn
import torch.optim as optim

from rdkit import Chem
from rdkit.Chem import EState, Descriptors
from rdkit.Chem import GetPeriodicTable

%config IPCompleter.use_jedi = False
cm = 1./2.54

In [2]:
!pip install pubchempy




### Visualize

In [3]:
# VS plot
def vs_visualize(predicted_train, predicted_test, target_train, target_test, saveName=None):
    """Draw a predicted-vs-actual scatter plot for the train and test sets.

    Train points are drawn in black and test points in red. Every test point
    is annotated with its 1-based position, and RMSE, MAE and R2 of the test
    set plus R2 of the train set are written into the plot.

    Parameters
    ----------
    predicted_train, predicted_test : sequence or torch.Tensor
        Model predictions for the train / test samples.
    target_train, target_test : sequence or torch.Tensor
        Reference values aligned with the predictions.
    saveName : str, optional
        When given, the figure is also saved as ``VSplot_{target}_{saveName}``
        and closed afterwards.

    Notes
    -----
    Reads the module-level variable ``target`` for the axis labels, the title
    and the file name.
    NOTE(review): ``rmse`` and ``r2`` are not defined in the visible cells
    (only ``mae`` is imported); presumably aliases of sklearn's
    ``mean_squared_error`` and ``r2_score`` — confirm.
    """
    def _to_numpy(values):
        # Tensors are detached and moved to the CPU so numpy/matplotlib can use them.
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return values

    predicted_train = _to_numpy(predicted_train)
    predicted_test = _to_numpy(predicted_test)
    target_train = _to_numpy(target_train)
    target_test = _to_numpy(target_test)

    # Compare actual and predicted values in a single square panel.
    fig, box = plt.subplots(figsize=(4, 4), dpi=200, facecolor="w")

    # Label each test point with its 1-based position.
    for i in range(len(target_test)):
        box.annotate(str(i + 1), (predicted_test[i], target_test[i]), fontsize=6, ha='right')

    # Metric text. The square root is applied to the result of ``rmse``, so
    # ``rmse`` is expected to return the mean squared error.
    rmse_test = np.sqrt(rmse(target_test, predicted_test))
    mae_test = mae(target_test, predicted_test)
    r2_test = r2(target_test, predicted_test)
    r2_train = r2(target_train, predicted_train)
    textstr = '\n'.join((
            'RMSE=%.3f' % rmse_test,
            'MAE=%.3f' % mae_test,
            'R2=%.3f' % r2_test,
            'R2(train)=%.3f' % r2_train
        ))

    # Data points, identity line, metric text, labels and title.
    box.plot(predicted_train, target_train, "ko", markersize=3)
    box.plot(predicted_test, target_test, "ro", markersize=3)
    plt_min = min(min(target_train), min(target_test))
    plt_max = max(max(target_train), max(target_test))
    box.plot([plt_min, plt_max], [plt_min, plt_max], color='red')  # diagonal (ideal prediction)
    box.text(plt_min, plt_max, textstr, fontsize=7,
        horizontalalignment='left', verticalalignment='top')
    box.set_xlabel('Predicted ' + target)
    box.set_ylabel('Actual ' + target)
    plt.title(f'Pred vs. Actual {target}')
    plt.tight_layout()

    # Save BEFORE showing and through the figure object: after plt.show() the
    # current pyplot figure may already be gone, which yields an empty image.
    if saveName:
        fig.savefig(f"VSplot_{target}_{saveName}")
    plt.show()
    if saveName:
        plt.close(fig)
    



In [4]:
def vs_visualize_4grid(predicted_train, predicted_test, target_train, target_test, boundary, saveName=None):
    """Predicted-vs-actual scatter plot with a four-quadrant analysis of the test set.

    Each test point is assigned to one of four regions by comparing its
    predicted and actual value with ``boundary``:

    * ``region[0]``: predicted <  boundary and actual <  boundary
    * ``region[1]``: predicted >= boundary and actual <  boundary
    * ``region[2]``: predicted <  boundary and actual >= boundary
    * ``region[3]``: predicted >= boundary and actual >= boundary

    The counts are passed to ``div_accuracy``, ``div_recall`` and
    ``div_f1_score``; their results and R2 of the train set are written into
    the plot.

    Parameters
    ----------
    predicted_train, predicted_test : sequence
        Model predictions for the train / test samples.
    target_train, target_test : sequence
        Reference values aligned with the predictions.
    boundary : number
        Threshold separating the quadrants. It used to be overwritten with a
        hard-coded 90 inside the function; the caller's value is now honoured.
    saveName : str, optional
        When given, the figure is also saved as ``VSplot_{target}_{saveName}``
        and closed afterwards.

    Notes
    -----
    Reads the module-level variable ``target`` for labels, title and file name.
    NOTE(review): ``r2``, ``div_accuracy``, ``div_recall`` and ``div_f1_score``
    are not defined in the visible cells — confirm they exist before calling.
    """
    fig, box = plt.subplots(figsize=(4, 4), dpi=200, facecolor="w")

    # Annotate the test points and count them per quadrant.
    region=[0,0,0,0]
    print(region,len(target_test) )
    for i in range(len(target_test)):
        box.annotate(str(i + 1), (predicted_test[i], target_test[i]), fontsize=6, ha='right')
        if predicted_test[i]<boundary:
            if target_test[i]<boundary:
                region[0]+=1
            else:
                region[2]+=1
        else :
            if target_test[i]<boundary:
                region[1]+=1
            else:
                region[3]+=1

    # Metric text. The test-set RMSE/MAE/R2 that were computed here before
    # were never displayed, so they are no longer computed.
    r2_train = r2(target_train, predicted_train)
    acc = div_accuracy(region)
    recall = div_recall(region)
    f1 = div_f1_score(region)

    textstr = '\n'.join((
            'R2(train)=%.3f' % r2_train,
            'Acc.=%f'% acc,
            'Recall=%f'% recall,
            'F1-score=%f'% f1

        ))

    # Data points, identity line, metric text, labels and title.
    box.plot(predicted_train, target_train, "ko", markersize=3)
    box.plot(predicted_test, target_test, "ro", markersize=3)
    plt_min = min(min(target_train), min(target_test))
    plt_max = max(max(target_train), max(target_test))
    box.plot([plt_min, plt_max], [plt_min, plt_max], color='red')  # diagonal (ideal prediction)
    box.text(plt_min, plt_max, textstr, fontsize=7,
        horizontalalignment='left', verticalalignment='top')
    box.set_xlabel('Predicted ' + target)
    box.set_ylabel('Actual ' + target)
    plt.title(f'Pred vs. Actual {target}')
    plt.tight_layout()

    # Save BEFORE showing and through the figure object: after plt.show() the
    # current pyplot figure may already be gone, which yields an empty image.
    if saveName:
        fig.savefig(f"VSplot_{target}_{saveName}")
    plt.show()
    if saveName:
        plt.close(fig)
    


# Data preparation

### Load Data

In [5]:
# Load the dataset from a CSV file.
FileName = "final_data.csv"
data_molecules = pd.read_csv(FileName)
# The former `data_molecules.fillna('')` call was removed: its return value was
# discarded, so it never modified the frame.
data_molecules = data_molecules.dropna()  # drop every row that contains a missing value

In [6]:
# Display the loaded frame (rich notebook output).
data_molecules

Unnamed: 0,smiles,BBB
0,OCc1ccccc1,1
1,CC(NC(C)(C)C)C(=O)c1cccc(c1)Cl,1
2,NCCc1ccc(c(c1)O)O DOPAMINE,0
3,NC(=O)c1cccnc1,1
4,CN1CCCC1c1cccnc1,1
...,...,...
3045,CC(C)c1ccc(C)cc1OCC2=NCCN2,0
3046,CC[C@]1(O)C[C@H]2CN(CCc3c([nH]c4ccccc34)[C@@](...,0
3047,CC(C)C1OC(=O)C2=CCCN2C(=O)c3coc(CC(=O)CC(O)\C=...,0
3048,Oc1ccc(cc1)/C=C([N+]#[C-])/C(=C/c2ccc(O)cc2)[N...,0


### Drop NAs

In [7]:
# Remove the rows that have no value for the quantity to be predicted.
# The row count is printed before and after so the effect is visible.
target = 'BBB'  # target column; other options named by the author: ee, yield, ddG (free energy difference), regio

rows_before = len(data_molecules)
print(rows_before)
data_molecules.dropna(subset=[target], inplace=True)
rows_after = len(data_molecules)
print(rows_after)


3050
3050


# Models

In [8]:
# Seed constant.
# NOTE(review): not referenced by the visible cells, which hard-code 42 — confirm intended use.
random_seed = 42

## Model pipelines (아래 3개의 파이프라인은 GNN과 무관함)

# Before Training

### Data split

In [9]:
# Separate the label column from the remaining columns.
target_column = "BBB"
target_df = data_molecules.loc[:, [target_column]]      # one-column frame holding the labels
train_df = data_molecules.drop(target_column, axis=1)   # everything except the labels

In [10]:
# Check whether any SMILES string occurs more than once.
# The column is selected by name: the frame displayed earlier has "Unnamed: 0"
# as its first column, so `train_df.columns[0]` checked that column instead of
# the SMILES strings.
# Note: this is an exact text comparison; differently written SMILES of the
# same molecule are not recognised as duplicates.
smiles_column = "smiles"
smiles_list = train_df[smiles_column]

# Select every occurrence of a repeated value, then count occurrences per value.
duplicate_smiles = smiles_list[smiles_list.duplicated(keep=False)]
duplicate_counts = duplicate_smiles.value_counts()

# Report the result.
if not duplicate_counts.empty:
    print(f"🔍 총 {len(duplicate_counts)}개의 중복된 SMILES가 발견되었습니다.")
    print("\n📌 중복된 SMILES 목록 (등장 횟수):")
    print(duplicate_counts)
else:
    print("✅ 중복된 SMILES가 없습니다.")

✅ 중복된 SMILES가 없습니다.


In [11]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import BRICS
from collections import Counter

# Work on a copy of the loaded frame and pull out SMILES strings and labels as plain lists.
# Note: `smiles_list` is rebound here to a Python list (the duplicate-check cell
# binds the same name to a pandas Series).
df = data_molecules.copy()
smiles_list = df['smiles'].tolist()
labels = df['BBB'].tolist()

# Run a BRICS decomposition on each SMILES string.
def get_fragments(smiles):
    """Return the BRICS fragments of one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    list
        The items produced by ``BRICS.BRICSDecompose`` for the molecule; an
        empty list when the SMILES cannot be parsed or the decomposition
        raises an exception.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []
    try:
        return list(BRICS.BRICSDecompose(mol))
    except Exception:
        # Best effort: a molecule that cannot be decomposed contributes no
        # fragments. `Exception` (not a bare `except`) keeps KeyboardInterrupt
        # and SystemExit working.
        return []

# Decompose every molecule into its fragments.
fragmented_data = list(map(get_fragments, smiles_list))

# Vocabulary: every distinct fragment, in sorted order.
all_fragments = sorted({frag for frags in fragmented_data for frag in frags})

# Map each fragment to its column index.
frag2idx = dict(zip(all_fragments, range(len(all_fragments))))

# Binary presence matrix: one row per molecule, one column per fragment.
import numpy as np
X = np.zeros((len(fragmented_data), len(all_fragments)))

for row, frags in enumerate(fragmented_data):
    for frag in frags:
        col = frag2idx.get(frag)
        if col is not None:
            X[row, col] = 1

# Labels as a numpy array, aligned with the rows of X.
y = np.array(labels)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Stratified train/test split of the fragment matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit a random forest on the training part (`fit` returns the estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Importance of each fragment column.
importances = model.feature_importances_

# Pair every fragment with its importance, most important first.
frag_importances = sorted(
    zip(all_fragments, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the leading entries.
print("Top 50 important fragments:")
for frag, score in frag_importances[:50]:
    print(f"{frag}: {score:.4f}")


[20:20:17] Explicit valence for atom # 12 N, 4, is greater than permitted
[20:20:17] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:20:17] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:20:17] Explicit valence for atom # 5 N, 4, is greater than permitted


Top 50 important fragments:
[5*]N[5*]: 0.0432
[6*]C(=O)O: 0.0399
[13*][C@@H]1N2C(=O)[C@@H]([15*])[C@H]2SC1(C)C: 0.0289
[4*]CC1=C(C(=O)O)N2C(=O)[C@@H]([15*])[C@H]2SC1: 0.0168
[16*]c1ccc(O)cc1: 0.0137
[3*]O[3*]: 0.0114
[16*]c1ccc(O)c(O)c1: 0.0108
[3*]OC: 0.0103
[4*]CC(=O)O: 0.0100
[11*]S[11*]: 0.0091
[1*]C(=O)[C@@H]([8*])N: 0.0089
[1*]C(=O)C[8*]: 0.0084
[16*]c1ccc([16*])cc1: 0.0080
[4*]C[8*]: 0.0080
[16*]c1ccccc1: 0.0073
[1*]C(C)=O: 0.0072
[8*]CO: 0.0066
[8*][C@@H](C)O: 0.0065
[15*][C@@H]1C(=O)N2C(C(=O)O)=C(C)CS[C@H]12: 0.0065
[1*]C(=O)/C(=N\OC)c1csc(N)n1: 0.0064
[3*]O[C@H]1C[C@@]([15*])(O)Cc2c(O)c3c(c(O)c21)C(=O)c1c([16*])cccc1C3=O: 0.0055
[8*]C[8*]: 0.0055
[5*]N(C)C: 0.0053
[13*][C@H]1C[C@H](N)[C@H](O)[C@H](C)O1: 0.0052
[5*]N1CCN([5*])[C@@H]([13*])C1: 0.0049
[1*]C([6*])=O: 0.0048
[4*]CCC[4*]: 0.0047
[5*]N1CC[C@@H](O)C1: 0.0045
[1*]C(=O)C([8*])([8*])O: 0.0045
[4*][C@]1(C)C[C@H]([13*])O[C@@H](C)[C@@H]1O: 0.0044
[1*]C(=O)[C@H]([4*])[8*]: 0.0044
[1*]C(=O)C1=C(C)NC(C)=C(C([1*])=O)C1[15*]: 0

In [12]:
# Shape of the fragment presence matrix: (number of molecules, number of fragments).
print(X.shape)

(3050, 2604)


In [13]:
# For every fragment, compare the mean BBB label of the molecules that contain
# it with the mean label of the molecules that do not.
frag_effects = []

for frag in all_fragments:
    column = X[:, frag2idx[frag]]
    present = column == 1
    absent = column == 0

    # A fragment found in no molecule, or in every molecule, has no comparison group.
    if not (present.any() and absent.any()):
        continue

    mean_with = y[present].mean()      # mean label when the fragment is present
    mean_without = y[absent].mean()    # mean label when the fragment is absent
    frag_effects.append((frag, mean_with - mean_without, mean_with, mean_without))

# Largest absolute difference first.
frag_effects.sort(key=lambda effect: abs(effect[1]), reverse=True)

# Show the ten leading entries.
for frag, diff, r_with, r_without in frag_effects[:10]:
    if diff > 0:
        direction = "긍정적"
    else:
        direction = "부정적"
    print(f"{frag}: 영향력 {direction} (투과율 차이 {diff:.3f}, with={r_with:.3f}, without={r_without:.3f})")


[13*][C@@H]1N2C(=O)[C@@H]([15*])[C@H]2SC1(C)C: 영향력 부정적 (투과율 차이 -0.695, with=0.000, without=0.695)
[4*]CC1=C(C(=O)O)N2C(=O)[C@@H]([15*])[C@H]2SC1: 영향력 부정적 (투과율 차이 -0.688, with=0.000, without=0.688)
[1*]C(=O)[C@@H]([8*])N: 영향력 부정적 (투과율 차이 -0.682, with=0.000, without=0.682)
[1*]C(=O)[C@H]([4*])[8*]: 영향력 부정적 (투과율 차이 -0.681, with=0.000, without=0.681)
[14*]c1nnnn1C: 영향력 부정적 (투과율 차이 -0.680, with=0.000, without=0.680)
[13*][C@H]1C[C@]([15*])(C)[C@@H](O)[C@H](C)O1: 영향력 부정적 (투과율 차이 -0.680, with=0.000, without=0.680)
[4*][C@]1(C)C[C@H]([13*])O[C@@H](C)[C@@H]1O: 영향력 부정적 (투과율 차이 -0.680, with=0.000, without=0.680)
[13*][C@@H]1O[C@H](C)C[C@H]([15*])[C@H]1O: 영향력 부정적 (투과율 차이 -0.679, with=0.000, without=0.679)
[13*]C1N2C(=O)C([15*])C2SC1(C)C: 영향력 부정적 (투과율 차이 -0.679, with=0.000, without=0.679)
[15*][C@@H]1C(=O)N2C(C(=O)O)=C(C)CS[C@H]12: 영향력 부정적 (투과율 차이 -0.679, with=0.000, without=0.679)


In [14]:
# Collect the fragments with a positive difference whose carriers all have label 1.
cnt1 = 0
positive_fragments = []
for frag, diff, r_with, r_without in frag_effects:
    if not (diff > 0 and r_with == 1):
        continue
    direction = "긍정적"
    print(f"{frag}: 영향력 {direction} (투과율 차이 {diff:.3f}, with={r_with:.3f}, without={r_without:.3f})")
    positive_fragments.append(frag)
cnt1 = len(positive_fragments)


print(f"긍정적 영향력 있는 fragment 개수: {cnt1}")
print(f"긍정적 fragment 목록: {positive_fragments}")

[4*]CCC[4*]: 영향력 긍정적 (투과율 차이 0.333, with=1.000, without=0.667)
[5*]N1c2ccccc2Sc2ccc([16*])cc21: 영향력 긍정적 (투과율 차이 0.329, with=1.000, without=0.671)
[15*]C1([15*])C(=O)NC(=O)NC1=O: 영향력 긍정적 (투과율 차이 0.327, with=1.000, without=0.673)
[4*]CCC[7*]: 영향력 긍정적 (투과율 차이 0.327, with=1.000, without=0.673)
[13*][C@@H]1NCCC[C@@H]1[15*]: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[4*]CC(C)C[4*]: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[16*]c1cccc(Cl)c1: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[5*]N1CCOCC1: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[16*]c1cnc(C)nc1N: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[5*]N(C=O)C([7*])C: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[14*]c1ncccn1: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[4*]C1([15*])CCN([5*])CC1: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[8*]C([8*])C: 영향력 긍정적 (투과율 차이 0.326, with=1.000, without=0.674)
[16*]c1ccc2c(c1)OCO2: 영향력 긍정적 (투과율 차이 0.325, with=1.000, without=0.675)
[5*]N1c2ccccc

In [35]:
# Same procedure for the negative side: fragments with a negative difference
# whose carriers all have label 0.
cnt2 = 0
negative_fragments = []
for frag, diff, r_with, r_without in frag_effects:
    if not (diff < 0 and r_with == 0):
        continue
    direction = "부정적"
    print(f"{frag}: 영향력 {direction} (투과율 차이 {diff:.3f}, with={r_with:.3f}, without={r_without:.3f})")
    negative_fragments.append(frag)
cnt2 = len(negative_fragments)

print(f"부정적 영향력 있는 fragment 개수: {cnt2}")
print(f"부정적 fragment 목록: {negative_fragments}")

[13*][C@@H]1N2C(=O)[C@@H]([15*])[C@H]2SC1(C)C: 영향력 부정적 (투과율 차이 -0.695, with=0.000, without=0.695)
[4*]CC1=C(C(=O)O)N2C(=O)[C@@H]([15*])[C@H]2SC1: 영향력 부정적 (투과율 차이 -0.688, with=0.000, without=0.688)
[1*]C(=O)[C@@H]([8*])N: 영향력 부정적 (투과율 차이 -0.682, with=0.000, without=0.682)
[1*]C(=O)[C@H]([4*])[8*]: 영향력 부정적 (투과율 차이 -0.681, with=0.000, without=0.681)
[14*]c1nnnn1C: 영향력 부정적 (투과율 차이 -0.680, with=0.000, without=0.680)
[13*][C@H]1C[C@]([15*])(C)[C@@H](O)[C@H](C)O1: 영향력 부정적 (투과율 차이 -0.680, with=0.000, without=0.680)
[4*][C@]1(C)C[C@H]([13*])O[C@@H](C)[C@@H]1O: 영향력 부정적 (투과율 차이 -0.680, with=0.000, without=0.680)
[13*][C@@H]1O[C@H](C)C[C@H]([15*])[C@H]1O: 영향력 부정적 (투과율 차이 -0.679, with=0.000, without=0.679)
[13*]C1N2C(=O)C([15*])C2SC1(C)C: 영향력 부정적 (투과율 차이 -0.679, with=0.000, without=0.679)
[15*][C@@H]1C(=O)N2C(C(=O)O)=C(C)CS[C@H]12: 영향력 부정적 (투과율 차이 -0.679, with=0.000, without=0.679)
[8*][C@@H](C)O: 영향력 부정적 (투과율 차이 -0.678, with=0.000, without=0.678)
[14*]c1nnc(C)s1: 영향력 부정적 (투과율 차이 -0.678, with=0.000

In [30]:
def find_molecule_indices_with_fragment(fragment):
    """Return the row indices of the molecules that contain ``fragment``.

    The lookup uses the module-level ``frag2idx`` mapping and the presence
    matrix ``X``. When the fragment is not a key of ``frag2idx`` a message is
    printed and an empty list is returned.
    """
    if fragment not in frag2idx:
        print(f"Fragment '{fragment}' not found.")
        return []

    # Rows whose entry in the fragment's column equals 1.
    column = X[:, frag2idx[fragment]]
    return list(np.flatnonzero(column == 1))

# For ten selected fragments, print the indices of the compounds that contain each of them.

important_10 = [
    "[6*]C(=O)O",
    "[5*]N[5*]",
    "[13*][C@@H]1N2C(=O)[C@@H]([15*])[C@H]2SC1(C)C",
    "[4*]CC1=C(C(=O)O)N2C(=O)[C@@H]([15*])[C@H]2SC1",
    "[1*]C(=O)[C@@H]([8*])N",
    "[11*]S[11*]",
    "[16*]c1ccc(O)c(O)c1",
    "[1*]C(=O)/C(=N\\OC)c1csc(N)n1",
    "[4*]CC(=O)O",
    "[16*]c1ccc(O)cc1",
]

for frag in important_10:
    indices = find_molecule_indices_with_fragment(frag)
    print(f"\n▶ Fragment: {frag}", f"  포함된 화합물 인덱스: {indices}", sep="\n")




▶ Fragment: [6*]C(=O)O
  포함된 화합물 인덱스: [18, 26, 59, 89, 106, 161, 182, 194, 241, 249, 307, 321, 322, 326, 328, 369, 372, 458, 463, 488, 492, 599, 601, 620, 624, 633, 667, 731, 894, 927, 929, 945, 957, 969, 985, 995, 996, 1000, 1034, 1056, 1130, 1152, 1184, 1217, 1225, 1241, 1252, 1403, 1448, 1474, 1478, 1488, 1492, 1516, 1528, 1529, 1568, 1582, 1598, 1600, 1619, 1623, 1655, 1670, 1693, 1711, 1712, 1723, 1725, 1726, 1737, 1744, 1845, 1852, 1857, 1859, 1871, 1887, 2013, 2069, 2083, 2102, 2177, 2268, 2360, 2587, 2588, 2592, 2597, 2605, 2652, 2654, 2659, 2683, 2684, 2715, 2718, 2734, 2765, 2771, 2773, 2774, 2788, 2794, 2795, 2799, 2807, 2811, 2822, 2828, 2831, 2835, 2839, 2843, 2853, 2854, 2910, 2912, 2918, 2928, 2934, 2937, 2941, 2942, 2947, 2949, 2956, 2966, 2970, 2973, 2974, 2989, 2992, 2994, 2995, 2996, 2999, 3001, 3004, 3007, 3009, 3020, 3028, 3030, 3034, 3036, 3038]

▶ Fragment: [5*]N[5*]
  포함된 화합물 인덱스: [1, 10, 11, 12, 13, 21, 27, 37, 38, 44, 46, 49, 50, 56, 62, 65, 98, 121, 131, 142

### dummy atom 제거

In [31]:
import re
def remove_brics_dummies(s):
    """Strip BRICS attachment-point labels such as ``[5*]`` from a fragment string.

    Labels with one or two digits are removed by plain text substitution. No
    chemistry-aware processing is done, so the result is not checked for
    validity.
    """
    # Labels wrapped in a branch, e.g. "([16*])", are removed first so that no
    # empty "()" is left behind; bare labels follow.
    for pattern in (r'\(\[\d{1,2}\*\]\)', r'\[\d{1,2}\*\]'):
        s = re.sub(pattern, '', s)
    return s

# Strip the BRICS dummy atoms from the positive and negative fragments and
# drop duplicates. `sorted` is used instead of `list(set(...))` because the
# iteration order of a set of strings changes between kernel restarts, which
# made the order of these lists (and of files written from them) irreproducible.
cleaned_pfragments = sorted({remove_brics_dummies(frag) for frag in positive_fragments})
cleaned_nfragments = sorted({remove_brics_dummies(frag) for frag in negative_fragments})


In [32]:
# Number of distinct cleaned positive / negative fragments.
len(cleaned_pfragments), len(cleaned_nfragments)

(2155, 681)

In [33]:
def find_fragment_matches(smiles_list, fragment_smiles, min_matches=100):
    """Return the fragments that occur as a substructure in many molecules.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings of all molecules. Entries that RDKit cannot parse are
        skipped.
    fragment_smiles : iterable of str, or str
        Fragments to look for. BRICS dummy atoms are stripped with
        ``remove_brics_dummies`` and the result is parsed with
        ``Chem.MolFromSmarts``. A single string is treated as one fragment.
    min_matches : int, optional
        Minimum number of matching molecules for a fragment to be kept
        (default 100, the previously hard-coded threshold).

    Returns
    -------
    list of str
        The cleaned fragment strings (dummy atoms removed) that match at least
        ``min_matches`` molecules, in input order.

    Raises
    ------
    ValueError
        If a cleaned fragment cannot be parsed by ``Chem.MolFromSmarts``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(fragment_smiles, str):
        fragment_smiles = [fragment_smiles]

    # Parse every molecule exactly once. Parsing inside the fragment loop
    # repeated the same work (and the same RDKit warnings) for every fragment.
    molecules = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:  # invalid molecule, skip
            molecules.append(mol)

    more_than_100_fragments = []
    for fragment in fragment_smiles:
        fragment = remove_brics_dummies(fragment)  # strip BRICS dummy atoms
        frag_mol = Chem.MolFromSmarts(fragment)
        if frag_mol is None:
            raise ValueError(f"Invalid fragment SMILES/SMARTS: {fragment}")

        match_cnt = sum(1 for mol in molecules if mol.HasSubstructMatch(frag_mol))
        if match_cnt >= min_matches:
            more_than_100_fragments.append(fragment)

    return more_than_100_fragments

"""
more_than_100_pf = find_fragment_matches(data_molecules['smiles'].tolist(), cleaned_pfragments) # 26번째 fragment의 인덱스 찾기
mt100pf_df = pd.DataFrame(more_than_100_pf, columns=['Fragment'])
mt100pf_df.to_csv('cleaned_positive_fragments2.csv', index=False)
"""
# Keep the cleaned negative fragments returned by find_fragment_matches
# (those matching at least 100 molecules) and write them to a CSV file.
more_than_100_nf = find_fragment_matches(data_molecules['smiles'].tolist(), cleaned_nfragments) # negative fragments that are frequent substructures
mt100nf_df = pd.DataFrame(more_than_100_nf, columns=['Fragment'])
mt100nf_df.to_csv('cleaned_negative_fragments2.csv', index=False)



[20:27:31] Explicit valence for atom # 12 N, 4, is greater than permitted
[20:27:31] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:31] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:31] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 12 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 12 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:32] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:27:33] Explicit valence for atom # 12 N, 4, is greater than permitted
[20:27:33] Explicit valence for atom # 5 N, 4, 

In [None]:
# NOTE(review): bare name with no visible definition — presumably a deliberate
# NameError used to halt "Run All" at this point; confirm.
stop


In [34]:
# Number of negative fragments kept by find_fragment_matches.
len(more_than_100_nf)

99

In [None]:
remove_brics_dummies(positive_fragments[26]) # strip the BRICS dummy atoms from the positive fragment at index 26

'n1cnc2c(=O)[nH]c(N)nc21'

In [None]:
# Read the summary CSV and keep only the rows whose total_count is at least 100.
import pandas as pd
fss_df = pd.read_csv('fg_sasa_summary.csv').loc[lambda frame: frame['total_count'] >= 100]
# Save the filtered rows as a new CSV file (without the index column).
fss_df.to_csv('fg_sasa_summary_100.csv', index=False)

In [None]:
# Parse the positive fragment at index 26 without sanitization; the string is
# passed as is, i.e. with its BRICS dummy atoms.
frag_mol = Chem.MolFromSmiles(positive_fragments[26], sanitize=False)
# Render the fragment as an image
# NOTE(review): img.show() presumably opens an external viewer rather than
# rendering inline in the notebook — confirm this is intended.
from rdkit.Chem import Draw
img = Draw.MolToImage(frag_mol, size=(300, 300))
img.show()

In [None]:
# Display every SMILES string as a Python list.
# NOTE(review): this prints the whole column; a `.head()` slice may be enough.
data_molecules['smiles'].tolist()


['OCc1ccccc1',
 'CC(NC(C)(C)C)C(=O)c1cccc(c1)Cl',
 'NCCc1ccc(c(c1)O)O DOPAMINE',
 'NC(=O)c1cccnc1',
 'CN1CCCC1c1cccnc1',
 '[N-]=[N+]=O',
 'CC12NC(Cc3ccccc13)c1ccccc21',
 'CCC12CCN(CC3(O)CC3)C(Cc3ccc(cc31)O)C2(C)C',
 'CN1CCN(CC1)C1Cc2ccccc2Sc2ccc(cc12)Cl',
 'O=C1NC(=O)C(N1)(c1ccccc1)c1ccccc1',
 'CCCC(=O)Nc1ccc(c(c1)C(C)=O)OCC(O)CNC(C)C',
 'CC(=O)Nc1ccc(cc1)O',
 'CC(=O)Nc1ccc(cc1)OC(=O)c1ccccc1O',
 'CC(=O)Nc1sc(nn1)S(N)(=O)=O',
 'CC(N)C(=O)OC(C)(C)Cc1ccc(cc1)Cl',
 'Cc1nnc2n1c1ccc(cc1C(=NC2)c1ccccc1)Cl',
 'OC(=O)CN1C(=O)c2cccc3cccc(c23)C1=O',
 'NC12CC3CC(CC(C3)C1)C2',
 'CCN1C=C(C(O)=O)C(=O)c2ccc(nc12)Cc1ccccc1',
 'CCC1(CCC(=O)NC1=O)c1ccc(cc1)N',
 'CCCCc1oc2ccccc2c1C(=O)c1cc(c(c(c1)[I])OCCN(CC)CC)[I]',
 'CCN1CCCC1CNC(=O)c1cc(c(cc1OC)N)S(=O)(=O)CC',
 'CN(C)CCC=C1c2ccccc2CCc2ccccc12',
 'CCC1(CCC(C)C)C(=O)NC(=O)NC1=O',
 'Clc1ccc2c(c1)C(=Nc1ccccc1O2)N1CCNCC1',
 'COc1ccc(cc1)C(=O)N1CCCC1=O',
 'CC(=O)Oc1ccccc1C(O)=O',
 'COc1ccc(cc1)CCN1CCC(CC1)Nc1nc2ccccc2n1Cc1ccc(cc1)F',
 'CCC1(CC)C(=O)NC(=O)NC

In [None]:
# Visualize a single molecule given as a SMILES string
def visualize_smiles(smiles):
    """Render one molecule as a 300x300 image and show it.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. When RDKit cannot parse it, a message
        is printed and nothing is drawn.
    """
    # Imported locally so the function does not depend on another cell having
    # imported `Draw` beforehand.
    from rdkit.Chem import Draw

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("Invalid SMILES:", smiles)
        return

    img = Draw.MolToImage(mol, size=(300, 300))
    img.show()

# Example: visualize one hard-coded SMILES string
visualize_smiles("OCCOP(=O)(OCCN)OC(COCCOCCON)CO")