0. binds=1과 동일한 숫자로 랜덤추출
다음은 8:2로 추출해보자

In [4]:
import duckdb
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import gc
from tqdm import tqdm
import random

def query_and_save_parquet(train_path, output_path):
    # DuckDB 연결
    con = duckdb.connect()

    # 데이터 쿼리 실행
    df = con.execute(f"""
    (SELECT * FROM parquet_scan('{train_path}') WHERE binds = 1 ORDER BY RANDOM() LIMIT 1589906)
    """).fetchdf()

    # 데이터 쿼리 실행
    # df = con.execute(f"""
    # (SELECT * FROM parquet_scan('{train_path}') WHERE binds = 0 ORDER BY RANDOM() LIMIT 1589906)
    # """).fetchdf()

#                         
    
    # DataFrame을 Parquet 파일로 저장
    table = pa.Table.from_pandas(df)
    pq.write_table(table, output_path)
    print(f"Data saved to {output_path}")
    
    # 메모리 해제
    del df
    gc.collect()

# 파일 경로 및 설정
train_path = './train.parquet'
output_parquet_path = './queried_data.parquet'

# 데이터 쿼리 및 저장 실행
query_and_save_parquet(train_path, output_parquet_path)

Data saved to ./queried_data.parquet


1. bind=0을 추출

In [3]:
import duckdb
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import gc
import os

def query_and_save_chunks(train_path, output_prefix, total_records=1589906, chunk_size=100000):
    # DuckDB 연결
    con = duckdb.connect()
    
    remaining_records = total_records
    offset = 0
    chunk_id = 1
    
    while remaining_records > 0:
        fetch_size = min(chunk_size, remaining_records)
        
        # 데이터 쿼리
        df = con.execute(f"""
        SELECT * FROM parquet_scan('{train_path}') WHERE binds = 0 ORDER BY RANDOM() LIMIT {fetch_size} OFFSET {offset}
        """).fetchdf()
        
        if df.empty:
            break
        
        # 청크 파일로 저장
        chunk_path = f"{output_prefix}_chunk_{chunk_id}.parquet"
        table = pa.Table.from_pandas(df)
        pq.write_table(table, chunk_path)
        print(f"Data saved to {chunk_path}, fetched: {fetch_size}, remaining: {remaining_records}")
        
        # 메모리 해제
        del df, table
        gc.collect()
        
        offset += fetch_size
        remaining_records -= fetch_size
        chunk_id += 1

# 파일 경로 및 설정
train_path = './train.parquet'
output_prefix = './0_queried_data'
final_output_path = './final_queried_data.parquet'
total_records = 1589906
chunk_size = 100000

# 데이터 쿼리 및 청크 저장 실행
query_and_save_chunks(train_path, output_prefix, total_records, chunk_size)

# 청크 파일 병합 실행
# total_chunks = (total_records + chunk_size - 1) // chunk_size
# merge_chunks(output_prefix, final_output_path, total_chunks)

# print("Data querying and merging completed.")


Data saved to ./0_queried_data_chunk_1.parquet, fetched: 100000, remaining: 1589906
Data saved to ./0_queried_data_chunk_2.parquet, fetched: 100000, remaining: 1489906
Data saved to ./0_queried_data_chunk_3.parquet, fetched: 100000, remaining: 1389906
Data saved to ./0_queried_data_chunk_4.parquet, fetched: 100000, remaining: 1289906
Data saved to ./0_queried_data_chunk_5.parquet, fetched: 100000, remaining: 1189906
Data saved to ./0_queried_data_chunk_6.parquet, fetched: 100000, remaining: 1089906


RuntimeError: Query interrupted

binds=0을 병합

In [3]:
import duckdb
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import gc
import os

def merge_chunks(output_prefix, final_output_path, total_chunks):
    schema = None
    unique_data = pd.DataFrame()
    
    for chunk_id in range(total_chunks):
        chunk_path = f"{output_prefix}_chunk_{chunk_id}.parquet"
        table = pq.read_table(chunk_path)
        df = table.to_pandas()
        
        # 중복 제거 및 데이터 합치기
        unique_data = pd.concat([unique_data, df]).drop_duplicates().reset_index(drop=True)
        
        # 메모리 해제 및 청크 파일 삭제
        del table, df
        os.remove(chunk_path)
        gc.collect()
        print(f"Chunk {chunk_id} processed and merged.")
    
    # 최종 파일로 저장
    final_table = pa.Table.from_pandas(unique_data)
    pq.write_table(final_table, final_output_path)
    print(f"Final data saved to {final_output_path}")
    
    # 메모리 해제
    del unique_data, final_table
    gc.collect()

# 파일 경로 및 설정
train_path = './train.parquet'
output_prefix = './0_queried_data'
final_output_path = './final_queried_data3.parquet'
total_records = 1600000#1589906
chunk_size = 100000

# 청크 파일 병합 실행
total_chunks = (total_records + chunk_size - 1) // chunk_size
merge_chunks(output_prefix, final_output_path, total_chunks)

print("Data querying and merging completed.")


Chunk 0 processed and merged.
Chunk 1 processed and merged.
Chunk 2 processed and merged.
Chunk 3 processed and merged.
Chunk 4 processed and merged.
Chunk 5 processed and merged.
Chunk 6 processed and merged.
Chunk 7 processed and merged.
Chunk 8 processed and merged.
Chunk 9 processed and merged.
Chunk 10 processed and merged.
Chunk 11 processed and merged.
Chunk 12 processed and merged.
Chunk 13 processed and merged.
Chunk 14 processed and merged.
Chunk 15 processed and merged.
Final data saved to ./final_queried_data2.parquet
Data querying and merging completed.


binds=0과 binds=1을 합치고 순서를 다 섞기

In [4]:
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import gc

def merge_and_shuffle_parquets(input_path1, input_path2, output_path):
    # 첫 번째 Parquet 파일 읽기
    table1 = pq.read_table(input_path1)
    df1 = table1.to_pandas()
    
    # 두 번째 Parquet 파일 읽기
    table2 = pq.read_table(input_path2)
    df2 = table2.to_pandas()
    
    # 데이터 프레임 병합
    combined_df = pd.concat([df1, df2], ignore_index=True)
    
    # 데이터 프레임 셔플
    shuffled_df = combined_df.sample(frac=1).reset_index(drop=True)
    
    # 결과를 Parquet 파일로 저장
    shuffled_table = pa.Table.from_pandas(shuffled_df)
    pq.write_table(shuffled_table, output_path)
    
    print(f"Data from {input_path1} and {input_path2} merged and shuffled, saved to {output_path}")
    
    # 메모리 해제
    del df1, df2, combined_df, shuffled_df, table1, table2, shuffled_table
    gc.collect()

# 파일 경로 설정
input_path1 = './final_queried_data2.parquet'
input_path2 = './merged_shuffled_data.parquet'#'./queried_data.parquet'
output_path = './merged_shuffled_data2.parquet'

# 데이터 병합 및 셔플 실행
merge_and_shuffle_parquets(input_path1, input_path2, output_path)

print("Data merging and shuffling completed.")


Data from ./final_queried_data2.parquet and ./merged_shuffled_data.parquet merged and shuffled, saved to ./merged_shuffled_data2.parquet
Data merging and shuffling completed.


잘 섞였는지 검토

In [14]:
import duckdb
con = duckdb.connect()

# df = con.query(f"""
# SELECT * FROM parquet_scan('./merged_shuffled_data.parquet') LIMIT 1000       
# """).df()

df = con.query(f"""
SELECT * FROM parquet_scan('./train_enc_BRD4.parquet') LIMIT 100       
""").df()

df


Unnamed: 0,enc0,enc1,enc2,enc3,enc4,enc5,enc6,enc7,enc8,enc9,...,enc133,enc134,enc135,enc136,enc137,enc138,enc139,enc140,enc141,bind
0,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
1,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
2,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
3,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
4,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
96,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
97,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0
98,8,22,8,8,28,12,27,12,12,12,...,0,0,0,0,0,0,0,0,0,0


몇대 몇인지 비율 검토

In [4]:
import duckdb
con = duckdb.connect()

# 전체 데이터 중에서 binds=1 및 binds=0의 갯수를 계산
query = """
SELECT 
    SUM(CASE WHEN binds = 1 THEN 1 ELSE 0 END) AS binds_1_count,
    SUM(CASE WHEN binds = 0 THEN 1 ELSE 0 END) AS binds_0_count,
    COUNT(*) AS total_count
FROM parquet_scan('./merged_shuffled_data.parquet')
"""

df = con.execute(query).df()

# 전체 행의 수를 사용하여 비율 계산
df['ratio_binds_1'] = df['binds_1_count'] / df['total_count']
df['ratio_binds_0'] = df['binds_0_count'] / df['total_count']

df


Unnamed: 0,binds_1_count,binds_0_count,total_count,ratio_binds_1,ratio_binds_0
0,1589906.0,3191877.0,4781783,0.332492,0.667508


ecfp 적용 (id와 protein_name 검토할 것)

In [11]:
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import os
import gc
from tqdm import tqdm

# ECFP 생성 함수
def generate_ecfp(smiles, radius=2, bits=1024):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return [0] * bits
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

def process_batch(df):
    # 각 SMILES 열에 대해 ECFP 생성
    df['molecule_ecfp'] = df['molecule_smiles'].apply(generate_ecfp)
    df['buildingblock1_ecfp'] = df['buildingblock1_smiles'].apply(generate_ecfp)
    df['buildingblock2_ecfp'] = df['buildingblock2_smiles'].apply(generate_ecfp)
    df['buildingblock3_ecfp'] = df['buildingblock3_smiles'].apply(generate_ecfp)
    
    # 필요한 열만 포함된 DataFrame 반환
    return df[['id', 'protein_name', 'molecule_ecfp', 'buildingblock1_ecfp', 'buildingblock2_ecfp', 'buildingblock3_ecfp', 'binds']]

def preprocess_and_save_ecfp(input_path, output_path, batch_size=32768):
    reader = pq.ParquetFile(input_path)
    
    # 적절한 스키마로 Parquet writer 초기화
    schema = pa.schema([
        ('id', pa.int32()),
        ('molecule_ecfp', pa.list_(pa.int32())),
        ('buildingblock1_ecfp', pa.list_(pa.int32())),
        ('buildingblock2_ecfp', pa.list_(pa.int32())),
        ('buildingblock3_ecfp', pa.list_(pa.int32())),
        ('protein_name', pa.string()),
        ('binds', pa.int32())  # test할 때는 제외
    ])
    
    with pq.ParquetWriter(output_path, schema) as writer:
        total_batches = reader.metadata.num_row_groups
        
        with tqdm(total=total_batches, desc="Processing", unit="batch", leave=True) as pbar:
            for batch in reader.iter_batches(batch_size=batch_size):
                df_batch = batch.to_pandas()
                
                processed_batch = process_batch(df_batch)
                
                # 처리된 DataFrame을 Arrow Table로 변환하여 파일에 작성
                table = pa.Table.from_pandas(processed_batch, schema=schema)
                writer.write_table(table)
                
                # 진행 상황 업데이트
                pbar.update(1)

                # 주기적으로 가비지 컬렉션 호출
                gc.collect()

# 파일 경로 설정
input_parquet_path = './merged_shuffled_data.parquet'
output_parquet_path = './processed_merged_queried_data.parquet'
# 테스트 파일 경로
# input_parquet_path = './test.parquet'
# output_parquet_path = './test_processed_data.parquet'

# 데이터 전처리 및 저장 실행
preprocess_and_save_ecfp(input_parquet_path, output_parquet_path)

print("ECFP generation and saving completed.")


Processing: 146batch [2:11:32, 54.06s/batch]                


ECFP generation and saving completed.


train 진행

In [4]:
# import duckdb
# import pandas as pd
# import numpy as np
# from rdkit import Chem
# from rdkit.Chem import AllChem
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score
# from sklearn.ensemble import RandomForestClassifier
# import gc
# import joblib
# import cupy as cp
# from tqdm import tqdm
# import psutil
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.losses import BinaryCrossentropy
# from tensorflow.keras.regularizers import L1, L2, L1L2
# from xgboost import XGBClassifier
# from sklearn.model_selection import KFold

# # Check if TensorFlow is using the GPU
# physical_devices = tf.config.list_physical_devices('GPU')
# if physical_devices:
#     try:
#         tf.config.experimental.set_memory_growth(physical_devices[0], True)
#         print("TensorFlow GPU memory growth enabled")
#     except RuntimeError as e:
#         print(e)
# else:
#     print("TensorFlow GPU not available")

# # 데이터 로드 및 전처리
# input_parquet_path = 'processed_merged_queried_data.parquet'

# con = duckdb.connect()

# # GPU 설정을 위한 XGBClassifier 옵션
# xgb_params = {
#     'n_estimators': 100,
#     'device': 'cuda',
#     'eta': 0.1,
#     'max_depth': 14,
#     'updater': 'grow_gpu_hist',
#     'refresh_leaf': 1,
#     'process_type': 'default',
#     'use_label_encoder': False,
#     'objective': 'binary:logistic', #'rank:map', 
#     'eval_metric': 'error'#'auc' #'map'
# }

# # RandomForestClassifier 옵션
# rf_params = {
#     'n_estimators': 100,
#     'n_jobs': -1
# }

# # 메모리 정리 함수
# def clear_memory():
#     gc.collect()
#     print("Memory cleared.")

# # 모델 학습 및 평가 함수 정의
# def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, iteration):
#     X_train_gpu = cp.array(X_train)
#     X_test_gpu = cp.array(X_test)
#     model.fit(X_train_gpu, y_train)
#     y_pred_proba = model.predict_proba(X_test_gpu)[:, 1]
#     y_pred = model.predict(X_test_gpu)
#     y_pred_proba_np = cp.asnumpy(y_pred_proba)
#     y_pred_np = cp.asnumpy(y_pred)

#     map_score = average_precision_score(y_test, y_pred_proba_np)
#     accuracy = accuracy_score(y_test, y_pred_np)
#     precision = precision_score(y_test, y_pred_np)
#     recall = recall_score(y_test, y_pred_np)
#     f1 = f1_score(y_test, y_pred_np)
#     print(f"{model_name} - Iteration {iteration} - Mean Average Precision (mAP): {map_score:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Precision: {precision:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Recall: {recall:.5f}")
#     print(f"{model_name} - Iteration {iteration} - F1 Score: {f1:.5f}")

#     # Delete GPU arrays to free up memory
#     del X_train_gpu, X_test_gpu, y_pred_proba, y_pred
#     cp._default_memory_pool.free_all_blocks()
#     clear_memory()
#     return model

# def train_and_evaluate_model_rf(model, X_train, X_test, y_train, y_test, model_name, iteration):
#     # CuPy 배열을 NumPy 배열로 변환
#     X_train_np = cp.asnumpy(X_train)
#     X_test_np = cp.asnumpy(X_test)

#     model.fit(X_train_np, y_train)
#     y_pred_proba = model.predict_proba(X_test_np)[:, 1]
#     y_pred = model.predict(X_test_np)
#     y_pred_proba_np = cp.asnumpy(y_pred_proba)
#     y_pred_np = cp.asnumpy(y_pred)

#     map_score = average_precision_score(y_test, y_pred_proba_np)
#     accuracy = accuracy_score(y_test, y_pred_np)
#     precision = precision_score(y_test, y_pred_np)
#     recall = recall_score(y_test, y_pred_np)
#     f1 = f1_score(y_test, y_pred_np)
#     print(f"{model_name} - Iteration {iteration} - Mean Average Precision (mAP): {map_score:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Precision: {precision:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Recall: {recall:.5f}")
#     print(f"{model_name} - Iteration {iteration} - F1 Score: {f1:.5f}")

#     # Delete GPU arrays to free up memory
#     del X_train_np, X_test_np, y_pred_proba, y_pred
#     cp._default_memory_pool.free_all_blocks()
#     clear_memory()
#     return model

# # LSTM 모델 학습 및 평가 함수 정의 (TensorFlow)
# def train_and_evaluate_lstm_tf(model, X_train, X_test, y_train, y_test, model_name, iteration, batch_size, num_epochs):
#     # TensorFlow 텐서로 변환
#     X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
#     X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
#     y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
#     y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

#     # 모델 컴파일
#     model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(),
#                   metrics=[tf.keras.metrics.Recall()])

#     # 학습
#     model.fit(X_train_tensor, y_train_tensor, epochs=num_epochs, batch_size=batch_size, verbose=1)

#     # 예측
#     y_pred_proba = model.predict(X_test_tensor)
#     y_pred = (y_pred_proba >= 0.5).astype(int)  # 정밀도 수정함!!!!!!!!!!!! 기본이 0.5

#     map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
#     accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
#     precision = precision_score(y_test_tensor.numpy(), y_pred)
#     recall = recall_score(y_test_tensor.numpy(), y_pred)
#     f1 = f1_score(y_test_tensor.numpy(), y_pred)
#     print(f"{model_name} - Iteration {iteration} - Mean Average Precision (mAP): {map_score:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Precision: {precision:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Recall: {recall:.5f}")
#     print(f"{model_name} - Iteration {iteration} - F1 Score: {f1:.5f}")

#     clear_memory()
#     return model, precision, recall, f1, map_score

# protein_names = ['BRD4', 'HSA', 'sEH']
# # protein_names = ['HSA']  # 'BRD4', 'HSA', 'sEH'
# # 각 protein에 대한 최적의 조합을 설정
# best_combinations = {
#     'BRD4': [('elu', 'relu')],  # 확정
#     'HSA': [('relu', 'elu')],
# #    'HSA': [('elu', 'relu')],
#     # 'HSA': [('elu','tanh')]
#     'sEH': [('elu', 'relu')]  # 확정
# }

# # 사용할 모델 종류 지정: 'lstm'
# model_types = ['lstm']  # 여기서 원하는 모델 종류를 리스트로 선택하세요
# #model_types = ['randomforest']  # 여기서 원하는 모델 종류를 리스트로 선택하세요 # randomforest
# #model_types = ['xgboost']  # 여기서 원하는 모델 종류를 리스트로 선택하세요 # randomforest

# num_iterations = 1

# # Precision 기록을 위한 리스트
# precision_records = []
# total_records = []

# # RAM usage monitoring
# def monitor_memory(threshold=0.80):
#     memory_info = psutil.virtual_memory()
#     return memory_info.percent / 100 >= threshold

# for protein in protein_names:
#     # 전체 행 수 계산
#     total_rows = con.execute(f"SELECT COUNT(*) FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]
#     print(total_rows)
#     # 배치 크기 설정
#     batch_size = 100000  # 적절한 배치 크기로 설정

#     # 데이터 배치로 읽어와서 처리
#     for offset in range(0, total_rows, batch_size):
#         filtered_df = con.execute(f"""
#         SELECT * FROM parquet_scan('{input_parquet_path}') 
#         WHERE protein_name = '{protein}'
#         LIMIT {batch_size} OFFSET {offset}
#         """).df()

#         print(f"Processing {protein} batch starting at offset {offset}...")

#         X = np.concatenate([
#             np.array(filtered_df['molecule_ecfp'].tolist(), dtype=np.float32),
#             np.array(filtered_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
#             np.array(filtered_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
#             np.array(filtered_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
#         ], axis=1)
#         y = filtered_df['binds'].tolist()

#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#         print(f"Processing {protein} train test split done...")

#         for model_type in model_types:
#             print(f"Processing {protein} with {model_type.upper()}...")
#             if model_type == 'lstm':
#                 for activation_1, activation_2 in best_combinations[protein]:
#                     input_dim = 1024 * 4  # ECFP 길이에 맞게 조정
#                     hidden_dim = 128
#                     output_dim = 1
#                     num_layers = 2
#                     num_epochs = 5
#                     batch_size = 1024  # 2048

#                     model = Sequential([
# #                        Input(shape=(input_dim)),
#                         Input(shape=(None, input_dim)),  # 여기를 수정했습니다.
#                         LSTM(hidden_dim*8, return_sequences=True, activation=activation_1, dropout=0.05),
#                         LSTM(hidden_dim*4, return_sequences=True, activation=activation_2, dropout=0.05),
#                         LSTM(hidden_dim*1, return_sequences=False, activation=activation_1, dropout=0.05),
#                         Dense(output_dim, activation='sigmoid')
#                     ])

#             # if model_type == 'lstm':
#             #     for activation_1, activation_2 in best_combinations[protein]:
#             #         input_dim = 1024 * 4  # ECFP 길이에 맞게 조정
#             #         hidden_dim = 128
#             #         output_dim = 1
#             #         num_layers = 2
#             #         num_epochs = 1
#             #         batch_size = 1024  # 2048

#             #         model = Sequential([
#             #             Input(shape=(1, input_dim)),
#             #             LSTM(hidden_dim, return_sequences=True, activation=activation_1, dropout=0.1),
#             #             LSTM(hidden_dim, return_sequences=False, activation=activation_2, dropout=0.1),
#             #             Dense(output_dim, activation='sigmoid')
#             #         ])
#                     model_filename = f"{protein}_lstm_model_{activation_1}_{activation_2}_{activation_1}_iteration_1.weights.h5"

#                     # LSTM 모델 로드 또는 초기화
#                     try:
#                         model.load_weights(model_filename)
#                         print(f"Loaded existing LSTM model for {protein} with {activation_1} and {activation_2}")
#                     except FileNotFoundError:
#                         print(f"Initialized new LSTM model for {protein} with {activation_1} and {activation_2}")

#                     for i in range(num_iterations):
#                         print(f"Iteration {i + 1} for {protein} with LSTM batch {offset}...")

#                         X_train_lstm = np.expand_dims(X_train, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
#                         X_test_lstm = np.expand_dims(X_test, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
#                         model, precision, recall, f1, mAP = train_and_evaluate_lstm_tf(model, X_train_lstm, X_test_lstm, y_train, y_test, f"{protein} - LSTM", i + 1, batch_size, num_epochs)

#                         # Precision 기록
#                         precision_records.append((protein, activation_1, activation_2, precision, recall, f1, mAP, offset))

#                         if monitor_memory():
#                             print("Memory usage exceeded threshold. Clearing memory...")
#                             clear_memory()

#                         # 모델 저장
#                         model.save_weights(model_filename)
#                         print(f"Model saved: {model_filename}")
#                 total_records.extend(precision_records)
#                 precision_records = []

#             else:
#                 if model_type == 'xgboost':
#                     model = XGBClassifier(**xgb_params)  # 초기 XGB 모델 생성
#                     model_filename = f"{protein}_xgb_model_iteration_1.pkl"
#                 elif model_type == 'randomforest':
#                     model = RandomForestClassifier(**rf_params)  # 초기 RF 모델 생성
#                     model_filename = f"{protein}_rf_model_iteration_1.pkl"

#                 # 모델 로드 또는 초기화
#                 try:
#                     model = joblib.load(model_filename)
#                     print(f"Loaded existing {model_type.upper()} model for {protein}")
#                 except FileNotFoundError:
#                     print(f"Initialized new {model_type.upper()} model for {protein}")

#                 for i in range(num_iterations):
#                     print(f"Iteration {i + 1} for {protein} with {model_type.upper()} batch {offset}...")

#                     if model_type == 'xgboost':
#                         model = train_and_evaluate_model(model, X_train, X_test, y_train, y_test, f"{protein} - {model_type.upper()}", i + 1)
#                     elif model_type == 'randomforest':
#                         model = train_and_evaluate_model_rf(model, X_train, X_test, y_train, y_test, f"{protein} - {model_type.upper()}", i + 1)

#                     if monitor_memory():
#                         print("Memory usage exceeded threshold. Clearing memory...")
#                         clear_memory()

#                     # 모델 저장
#                     joblib.dump(model, model_filename)
#                     print(f"Model saved: {model_filename}")

#             # 각 best_combinations마다 average precision과 average recall 계산 및 출력 (LSTM 모델에 대해서만)
#             if model_type == 'lstm':
#                 precision_df = pd.DataFrame(total_records, columns=['Protein', 'Activation 1', 'Activation 2', 'Precision', 'Recall', 'f1-score', 'mAP', 'batch_num'])
#                 avg_precision = precision_df.groupby(['Protein', 'Activation 1', 'Activation 2'])['Precision'].mean().reset_index()
#                 avg_recall = precision_df.groupby(['Protein', 'Activation 1', 'Activation 2'])['Recall'].mean().reset_index()
#                 avg_f1 = precision_df.groupby(['Protein', 'Activation 1', 'Activation 2'])['f1-score'].mean().reset_index()

#                 print(f"Average Recall by Activation Function Combinations (Batch {offset}):")
#                 print(avg_recall)
#                 print(f"Average Precision by Activation Function Combinations (Batch {offset}):")
#                 print(avg_precision)
#                 print(f"Average f1-score by Activation Function Combinations (Batch {offset}):")
#                 print(avg_f1)
# con.close()

# # 모든 기록을 데이터프레임으로 변환하여 저장
# results_df = pd.DataFrame(total_records, columns=['Protein', 'Activation 1', 'Activation 2', 'Precision', 'Recall', 'f1-score', 'mAP', 'batch_num'])
# results_df.to_csv('total_precision_records.csv', index=False)


TensorFlow GPU memory growth enabled
1521460
Processing BRD4 batch starting at offset 0...
Processing BRD4 train test split done...
Processing BRD4 with LSTM...
Initialized new LSTM model for BRD4 with elu and relu
Iteration 1 for BRD4 with LSTM batch 0...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
BRD4 - LSTM - Iteration 1 - Mean Average Precision (mAP): 0.83430
BRD4 - LSTM - Iteration 1 - Precision: 0.89162
BRD4 - LSTM - Iteration 1 - Recall: 0.90364
BRD4 - LSTM - Iteration 1 - F1 Score: 0.89759
Memory cleared.
Model saved: BRD4_lstm_model_elu_relu_elu_iteration_1.weights.h5
Average Recall by Activation Function Combinations (Batch 0):
  Protein Activation 1 Activation 2    Recall
0    BRD4          elu         relu  0.903639
Average Precision by Activation Function Combinations (Batch 0):
  Protein Activation 1 Activation 2  Precision
0    BRD4          elu         relu   0.891622
Average f1-score by Activation Function Combinations (Batch 0):
  Protein Activation 1 Activatio

catboost 추가한 학습

In [39]:
# import duckdb
# import pandas as pd
# import numpy as np
# import joblib
# import cupy as cp
# import gc
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Input
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from catboost import CatBoostClassifier
# from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score

# # GPU 설정 확인 및 메모리 사용량 조절
# physical_devices = tf.config.list_physical_devices('GPU')
# if physical_devices:
#     try:
#         tf.config.experimental.set_memory_growth(physical_devices[0], True)
#         print("TensorFlow GPU memory growth enabled")
#     except RuntimeError as e:
#         print(e)
# else:
#     print("TensorFlow GPU not available")

# # 데이터 로드 및 전처리
# input_parquet_path = 'processed_merged_queried_data.parquet'

# con = duckdb.connect()

# # GPU 설정을 위한 XGBClassifier 옵션
# xgb_params = {
#     'n_estimators': 100,
#     'device': 'cuda',
#     'eta': 0.1,
#     'max_depth': 14,
#     'updater': 'grow_gpu_hist',
#     'refresh_leaf': 1,
#     'process_type': 'default',
#     'use_label_encoder': False,
#     'objective': 'binary:logistic', #'rank:map', 
#     'eval_metric': 'error'#'auc' #'map'
# }

# # RandomForestClassifier 옵션
# rf_params = {
#     'n_estimators': 100,
#     'n_jobs': -1
# }

# # CatBoostClassifier 옵션
# catboost_params = {
#     'iterations': 1000,
#     'learning_rate': 0.1,
#     'depth': 10,
#     'task_type': 'GPU',
#     'verbose': 100
# }

# # 메모리 정리 함수
# def clear_memory():
#     gc.collect()
#     print("Memory cleared.")

# # 모델 학습 및 평가 함수 정의
# def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, iteration, model_type):
#     if model_type == 'catboost':
#         X_train = cp.asnumpy(X_train)
#         X_test = cp.asnumpy(X_test)
#     else:
#         X_train = cp.asnumpy(X_train)
#         X_test = cp.asnumpy(X_test)

#     model.fit(X_train, y_train)
#     y_pred_proba = model.predict_proba(X_test)[:, 1]
#     y_pred = model.predict(X_test)
    
#     map_score = average_precision_score(y_test, y_pred_proba)
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     print(f"{model_name} - Iteration {iteration} - Mean Average Precision (mAP): {map_score:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Precision: {precision:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Recall: {recall:.5f}")
#     print(f"{model_name} - Iteration {iteration} - F1 Score: {f1:.5f}")

#     clear_memory()
#     return model

# def train_and_evaluate_model_rf(model, X_train, X_test, y_train, y_test, model_name, iteration):
#     # CuPy 배열을 NumPy 배열로 변환
#     X_train_np = cp.asnumpy(X_train)
#     X_test_np = cp.asnumpy(X_test)

#     model.fit(X_train_np, y_train)
#     y_pred_proba = model.predict_proba(X_test_np)[:, 1]
#     y_pred = model.predict(X_test_np)
#     y_pred_proba_np = cp.asnumpy(y_pred_proba)
#     y_pred_np = cp.asnumpy(y_pred)

#     map_score = average_precision_score(y_test, y_pred_proba_np)
#     accuracy = accuracy_score(y_test, y_pred_np)
#     precision = precision_score(y_test, y_pred_np)
#     recall = recall_score(y_test, y_pred_np)
#     f1 = f1_score(y_test, y_pred_np)
#     print(f"{model_name} - Iteration {iteration} - Mean Average Precision (mAP): {map_score:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Precision: {precision:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Recall: {recall:.5f}")
#     print(f"{model_name} - Iteration {iteration} - F1 Score: {f1:.5f}")

#     # Delete GPU arrays to free up memory
#     del X_train_np, X_test_np, y_pred_proba, y_pred
#     cp._default_memory_pool.free_all_blocks()
#     clear_memory()
#     return model

# # LSTM 모델 학습 및 평가 함수 정의 (TensorFlow)
# def train_and_evaluate_lstm_tf(model, X_train, X_test, y_train, y_test, model_name, iteration, batch_size, num_epochs):
#     # TensorFlow 텐서로 변환
#     X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
#     X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
#     y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
#     y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

#     # 모델 컴파일
#     model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(),
#                   metrics=[tf.keras.metrics.Recall()])

#     # 학습
#     model.fit(X_train_tensor, y_train_tensor, epochs=num_epochs, batch_size=batch_size, verbose=1)

#     # 예측
#     y_pred_proba = model.predict(X_test_tensor)
#     y_pred = (y_pred_proba >= 0.5).astype(int)  # 정밀도 수정함!!!!!!!!!!!! 기본이 0.5

#     map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
#     accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
#     precision = precision_score(y_test_tensor.numpy(), y_pred)
#     recall = recall_score(y_test_tensor.numpy(), y_pred)
#     f1 = f1_score(y_test_tensor.numpy(), y_pred)
#     print(f"{model_name} - Iteration {iteration} - Mean Average Precision (mAP): {map_score:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Precision: {precision:.5f}")
#     print(f"{model_name} - Iteration {iteration} - Recall: {recall:.5f}")
#     print(f"{model_name} - Iteration {iteration} - F1 Score: {f1:.5f}")

#     clear_memory()
#     return model, precision, recall, f1, map_score

# def get_model(model_type):
#     if model_type == 'random_forest':
#         return RandomForestClassifier(**rf_params)
#     elif model_type == 'xgboost':
#         return XGBClassifier(**xgb_params)
#     elif model_type == 'catboost':
#         return CatBoostClassifier(**catboost_params)
#     else:
#         raise ValueError(f"Unsupported model type: {model_type}")

# protein_names = ['BRD4', 'HSA', 'sEH']
# # protein_names = ['HSA']  # 'BRD4', 'HSA', 'sEH'
# # 각 protein에 대한 최적의 조합을 설정
# best_combinations = {
#     'BRD4': [('elu', 'relu')],  # 확정
#     'HSA': [('relu', 'elu')],
# #    'HSA': [('elu', 'relu')],
#     # 'HSA': [('elu','tanh')]
#     'sEH': [('elu', 'relu')]  # 확정
# }

# # 사용할 모델 종류 지정: 'lstm', 'randomforest', 'xgboost', 'catboost'
# #model_types = ['lstm', 'randomforest', 'xgboost', 'catboost']  # 여기서 원하는 모델 종류를 리스트로 선택하세요
# model_types = ['catboost']  # 여기서 원하는 모델 종류를 리스트로 선택하세요
# num_iterations = 1

# # Precision 기록을 위한 리스트
# precision_records = []
# total_records = []

# # RAM usage monitoring
# def monitor_memory(threshold=0.80):
#     memory_info = psutil.virtual_memory()
#     return memory_info.percent / 100 >= threshold

# for protein in protein_names:
#     # 전체 행 수 계산
#     total_rows = con.execute(f"SELECT COUNT(*) FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]
#     print(total_rows)
#     # 배치 크기 설정
#     batch_size = 100000  # 적절한 배치 크기로 설정

#     # 데이터 배치로 읽어와서 처리
#     for offset in range(0, total_rows, batch_size):
#         filtered_df = con.execute(f"""
#         SELECT * FROM parquet_scan('{input_parquet_path}') 
#         WHERE protein_name = '{protein}'
#         LIMIT {batch_size} OFFSET {offset}
#         """).df()

#         print(f"Processing {protein} batch starting at offset {offset}...")

#         X = np.concatenate([
#             np.array(filtered_df['molecule_ecfp'].tolist(), dtype=np.float32),
#             np.array(filtered_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
#             np.array(filtered_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
#             np.array(filtered_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
#         ], axis=1)
#         y = filtered_df['binds'].tolist()

#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#         print(f"Processing {protein} train test split done...")

#         for model_type in model_types:
#             print(f"Processing {protein} with {model_type.upper()}...")
#             if model_type == 'lstm':
#                 for activation_1, activation_2 in best_combinations[protein]:
#                     input_dim = 1024 * 4  # ECFP 길이에 맞게 조정
#                     hidden_dim = 128
#                     output_dim = 1
#                     num_layers = 2
#                     num_epochs = 5
#                     batch_size = 1024  # 2048

#                     model = Sequential([
#                         Input(shape=(None, input_dim)),  # 여기를 수정했습니다.
#                         LSTM(hidden_dim*8, return_sequences=True, activation=activation_1, dropout=0.05),
#                         LSTM(hidden_dim*4, return_sequences=True, activation=activation_2, dropout=0.05),
#                         LSTM(hidden_dim*1, return_sequences=False, activation=activation_1, dropout=0.05),
#                         Dense(output_dim, activation='sigmoid')
#                     ])

#                     model_filename = f"{protein}_lstm_model_{activation_1}_{activation_2}_{activation_1}_iteration_1.weights.h5"

#                     # LSTM 모델 로드 또는 초기화
#                     try:
#                         model.load_weights(model_filename)
#                         print(f"Loaded existing LSTM model for {protein} with {activation_1} and {activation_2}")
#                     except FileNotFoundError:
#                         print(f"Initialized new LSTM model for {protein} with {activation_1} and {activation_2}")

#                     for i in range(num_iterations):
#                         print(f"Iteration {i + 1} for {protein} with LSTM batch {offset}...")

#                         X_train_lstm = np.expand_dims(X_train, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
#                         X_test_lstm = np.expand_dims(X_test, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
#                         model, precision, recall, f1, mAP = train_and_evaluate_lstm_tf(model, X_train_lstm, X_test_lstm, y_train, y_test, f"{protein} - LSTM", i + 1, batch_size, num_epochs)

#                         # Precision 기록
#                         precision_records.append((protein, activation_1, activation_2, precision, recall, f1, mAP, offset))

#                         if monitor_memory():
#                             print("Memory usage exceeded threshold. Clearing memory...")
#                             clear_memory()

#                         # 모델 저장
#                         model.save_weights(model_filename)
#                         print(f"Model saved: {model_filename}")
#                 total_records.extend(precision_records)
#                 precision_records = []

#             else:
#                 model = get_model(model_type)
#                 model_filename = f"{protein}_{model_type}_model_iteration_1.pkl"

#                 # 모델 로드 또는 초기화
#                 try:
#                     model = joblib.load(model_filename)
#                     print(f"Loaded existing {model_type.upper()} model for {protein}")
#                 except FileNotFoundError:
#                     print(f"Initialized new {model_type.upper()} model for {protein}")

#                 for i in range(num_iterations):
#                     print(f"Iteration {i + 1} for {protein} with {model_type.upper()} batch {offset}...")

#                     if model_type in ['xgboost', 'catboost']:
#                         model = train_and_evaluate_model(model, X_train, X_test, y_train, y_test, f"{protein} - {model_type.upper()}", i + 1, model_type)
#                     elif model_type == 'randomforest':
#                         model = train_and_evaluate_model_rf(model, X_train, X_test, y_train, y_test, f"{protein} - {model_type.upper()}", i + 1)

#                     if monitor_memory():
#                         print("Memory usage exceeded threshold. Clearing memory...")
#                         clear_memory()

#                     # 모델 저장
#                     joblib.dump(model, model_filename)
#                     print(f"Model saved: {model_filename}")

#             # 각 best_combinations마다 average precision과 average recall 계산 및 출력 (LSTM 모델에 대해서만)
#             if model_type == 'lstm':
#                 precision_df = pd.DataFrame(total_records, columns=['Protein', 'Activation 1', 'Activation 2', 'Precision', 'Recall', 'f1-score', 'mAP', 'batch_num'])
#                 avg_precision = precision_df.groupby(['Protein', 'Activation 1', 'Activation 2'])['Precision'].mean().reset_index()
#                 avg_recall = precision_df.groupby(['Protein', 'Activation 1', 'Activation 2'])['Recall'].mean().reset_index()
#                 avg_f1 = precision_df.groupby(['Protein', 'Activation 1', 'Activation 2'])['f1-score'].mean().reset_index()

#                 print(f"Average Recall by Activation Function Combinations (Batch {offset}):")
#                 print(avg_recall)
#                 print(f"Average Precision by Activation Function Combinations (Batch {offset}):")
#                 print(avg_precision)
#                 print(f"Average f1-score by Activation Function Combinations (Batch {offset}):")
#                 print(avg_f1)
# con.close()

# # 모든 기록을 데이터프레임으로 변환하여 저장
# results_df = pd.DataFrame(total_records, columns=['Protein', 'Activation 1', 'Activation 2', 'Precision', 'Recall', 'f1-score', 'mAP', 'batch_num'])
# results_df.to_csv('total_precision_records.csv', index=False)


TensorFlow GPU memory growth enabled
1521460
Processing BRD4 batch starting at offset 0...
Processing BRD4 train test split done...
Processing BRD4 with CATBOOST...
Initialized new CATBOOST model for BRD4
Iteration 1 for BRD4 with CATBOOST batch 0...
0:	learn: 0.6219602	total: 38.5ms	remaining: 38.5s
100:	learn: 0.1965713	total: 3.6s	remaining: 32.1s
200:	learn: 0.1571676	total: 8.29s	remaining: 33s
300:	learn: 0.1361743	total: 12.8s	remaining: 29.8s
400:	learn: 0.1216822	total: 17.5s	remaining: 26.1s
500:	learn: 0.1103377	total: 22.1s	remaining: 22s
600:	learn: 0.1012215	total: 25.2s	remaining: 16.7s
700:	learn: 0.0934380	total: 26.9s	remaining: 11.5s
800:	learn: 0.0863934	total: 28.9s	remaining: 7.19s
900:	learn: 0.0804095	total: 31.2s	remaining: 3.42s
999:	learn: 0.0752043	total: 33.4s	remaining: 0us
BRD4 - CATBOOST - Iteration 1 - Mean Average Precision (mAP): 0.96000
BRD4 - CATBOOST - Iteration 1 - Precision: 0.91305
BRD4 - CATBOOST - Iteration 1 - Recall: 0.88629
BRD4 - CATBOOST 

test 이걸로 하면 됨 240629

In [40]:
import duckdb
import pandas as pd
import numpy as np
import joblib
import cupy as cp
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# GPU 설정 확인 및 메모리 사용량 조절
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        print("TensorFlow GPU memory growth enabled")
    except RuntimeError as e:
        print(e)
else:
    print("TensorFlow GPU not available")

# 데이터베이스 연결
con = duckdb.connect(database=':memory:')

# 테스트 데이터 경로 설정
test_parquet_path = 'test_processed_data.parquet'

# 각 protein에 대한 최적의 조합을 설정
best_combinations = {
    'BRD4': [('elu', 'relu')],
    'HSA': [('relu', 'elu')],
    'sEH': [('elu', 'relu')]
}

# 사용할 모델 종류 지정
#model_types = ['lstm', 'xgboost', 'randomforest', 'catboost']  # 여기서 원하는 모델 종류를 리스트로 선택하세요
model_types = ['catboost']  # 여기서 원하는 모델 종류를 리스트로 선택하세요

# 결과 저장을 위한 리스트
results = []

batch_size = 20000  # 적절한 배치 크기로 설정

# 메모리 정리 함수
def clear_memory():
    gc.collect()
    cp._default_memory_pool.free_all_blocks()
    print("Memory cleared.")

for protein in best_combinations.keys():
    # 총 행 수 계산
    total_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{test_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]

    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch_df = con.execute(f"""
            SELECT * FROM read_parquet('{test_parquet_path}')
            WHERE protein_name = '{protein}'
            LIMIT {batch_size} OFFSET {start}
        """).df()

        # 테스트 데이터 전처리
        X_test = np.concatenate([
            np.array(batch_df['molecule_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
        ], axis=1)

        for model_type in model_types:
            for activation_1, activation_2 in best_combinations[protein]:
                if model_type == 'xgboost':
                    model = XGBClassifier()
                    model_filename = f"{protein}_xgb_model_iteration_1.pkl"
                elif model_type == 'randomforest':
                    model = RandomForestClassifier()
                    model_filename = f"{protein}_rf_model_iteration_1.pkl"
                elif model_type == 'catboost':
                    model = CatBoostClassifier()
                    model_filename = f"{protein}_catboost_model_iteration_1.pkl"
                elif model_type == 'lstm':
                    input_dim = 1024 * 4  # ECFP 길이에 맞게 조정
                    hidden_dim = 128
                    output_dim = 1

                    model = Sequential([
                        Input(shape=(None, input_dim)),
                        LSTM(hidden_dim * 8, return_sequences=True, activation=activation_1, dropout=0.05),
                        LSTM(hidden_dim * 4, return_sequences=True, activation=activation_2, dropout=0.05),
                        LSTM(hidden_dim * 1, return_sequences=False, activation=activation_1, dropout=0.05),
                        Dense(output_dim, activation='sigmoid')
                    ])
                    model_filename = f"{protein}_lstm_model_{activation_1}_{activation_2}_{activation_1}_iteration_1.weights.h5"

                # 모델 로드
                if model_type == 'lstm':
                    model.load_weights(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein} with {activation_1} and {activation_2}")
                else:
                    model = joblib.load(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein}")

                # 예측 수행
                if model_type == 'lstm':
                    X_test_lstm = np.expand_dims(X_test, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
                    y_pred = model.predict(X_test_lstm)
                elif model_type == 'randomforest':
                    y_pred = model.predict_proba(X_test)[:, 1]
                elif model_type == 'xgboost':
                    X_test_gpu = cp.array(X_test)
                    y_pred = model.predict_proba(X_test_gpu)[:, 1]
                    y_pred = cp.asnumpy(y_pred)
                elif model_type == 'catboost':
                    y_pred = model.predict_proba(X_test)[:, 1]

                # 결과 저장
                test_results = pd.DataFrame({
                    'id': batch_df['id'],
                    'binds': y_pred.flatten()
                })
                results.append(test_results)

                # 메모리 정리
                clear_memory()

# 모든 결과를 하나의 DataFrame으로 결합
final_results_df = pd.concat(results, ignore_index=True)

# 결과 저장
final_results_df.to_csv('test_results.csv', index=False)
print("Test results saved to 'test_results.csv'")

con.close()


TensorFlow GPU memory growth enabled
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model for BRD4
Memory cleared.
Loaded CATBOOST model f

train파일검토코드-캐글

In [30]:
import pandas as pd
import pyarrow.parquet as pq

# 새로운 Parquet 파일에서 buildingblock1_smiles 컬럼으로 특정 값과 일치하는 행 추출
final_parquet_file = './final_train_enc.parquet'
final_parquet_file = './test.parquet'
final_parquet_file = './train.parquet'

# 특정 값
target_smiles = 'O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21'

# Parquet 파일을 읽기
parquet_file = pq.ParquetFile(final_parquet_file)

# 청크 단위로 데이터 읽기 및 필터링
matched_smiles_list = []

# 각 row group을 청크 단위로 처리
for i in range(parquet_file.num_row_groups):
    df_chunk = parquet_file.read_row_group(i).to_pandas()
    matched_smiles_df_chunk = df_chunk[df_chunk['buildingblock1_smiles'].str.startswith(target_smiles)]
    matched_smiles_list.append(matched_smiles_df_chunk)

# 모든 청크를 합쳐서 하나의 DataFrame으로 만들기
matched_smiles_df = pd.concat(matched_smiles_list, ignore_index=True)

# 일치하는 데이터 목록 확인
print("Shape of data:", matched_smiles_df.shape)
print(matched_smiles_df.head(20))

# DataFrame 저장 또는 다른 방식으로 활용
matched_smiles_df.to_csv('./matched_smiles_data3.csv', index=False)


Shape of data: (1089744, 7)
           id                             buildingblock1_smiles  \
0   200462670  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
1   200462671  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
2   200462672  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
3   200462673  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
4   200462674  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
5   200462675  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
6   200462676  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
7   200462677  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
8   200462678  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
9   200462679  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
10  200462680  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
11  200462681  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
12  200462682  O=C(Nc1ccc(F)c(C(=O)O)c1F)OCC1c2ccccc2-c2ccccc21   
13  200462683  O=C(Nc1ccc(F)c(C(=O

train & catboost & 1DCNN 학습모델 & 
fold 개선& 모듈화 + ECFP 추가

In [None]:
import duckdb
import pandas as pd
import numpy as np
import joblib
import cupy as cp
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Conv1D, GlobalMaxPooling1D, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import psutil
import pyarrow.parquet as pq
import pyarrow as pa
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm

# 파라미터 설정
xgb_params = {
    'n_estimators': 100,
    'device': 'cuda',
    'eta': 0.1,
    'max_depth': 14,
    'updater': 'grow_gpu_hist',
    'refresh_leaf': 1,
    'process_type': 'default',
    'use_label_encoder': False,
    'objective': 'binary:logistic',
    'eval_metric': 'error'
}

rf_params = {
    'n_estimators': 100,
    'n_jobs': -1
}

catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 10,
    'task_type': 'GPU',
    'verbose': 100
}

def setup_gpu():
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        try:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            print("TensorFlow GPU memory growth enabled")
        except RuntimeError as e:
            print(e)
    else:
        print("TensorFlow GPU not available")

def clear_memory():
    gc.collect()
    print("Memory cleared.")

def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, fold, model_type):
    X_train = cp.asnumpy(X_train)
    X_test = cp.asnumpy(X_test)

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    map_score = average_precision_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, map_score, accuracy, precision, recall, f1

def train_and_evaluate_model_rf(model, X_train, X_test, y_train, y_test, model_name, fold):
    X_train_np = cp.asnumpy(X_train)
    X_test_np = cp.asnumpy(X_test)

    model.fit(X_train_np, y_train)
    y_pred_proba = model.predict_proba(X_test_np)[:, 1]
    y_pred = model.predict(X_test_np)
    y_pred_proba_np = cp.asnumpy(y_pred_proba)
    y_pred_np = cp.asnumpy(y_pred)

    map_score = average_precision_score(y_test, y_pred_proba_np)
    accuracy = accuracy_score(y_test, y_pred_np)
    precision = precision_score(y_test, y_pred_np)
    recall = recall_score(y_test, y_pred_np)
    f1 = f1_score(y_test, y_pred_np)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    del X_train_np, X_test_np, y_pred_proba, y_pred
    cp._default_memory_pool.free_all_blocks()
    clear_memory()
    return model, map_score, accuracy, precision, recall, f1

def train_and_evaluate_lstm_tf(model, X_train, X_test, y_train, y_test, model_name, fold, batch_size, num_epochs):
    X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
    X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
    y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
    y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

    model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.Recall()])

    model.fit(X_train_tensor, y_train_tensor, epochs=num_epochs, batch_size=batch_size, verbose=1)

    y_pred_proba = model.predict(X_test_tensor)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
    accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
    precision = precision_score(y_test_tensor.numpy(), y_pred)
    recall = recall_score(y_test_tensor.numpy(), y_pred)
    f1 = f1_score(y_test_tensor.numpy(), y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, precision, recall, f1, map_score

def OneDCNN_model(input_len, num_classes):
    hidden_dim = 128
    num_filters = 32

    inputs = tf.keras.layers.Input(shape=(input_len,))
    x = Embedding(input_dim=36, output_dim=hidden_dim, input_length=input_len, mask_zero=True)(inputs)
    x = Conv1D(filters=num_filters, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = Conv1D(filters=num_filters*2, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = Conv1D(filters=num_filters*3, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = GlobalMaxPooling1D()(x)

    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_classes, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(learning_rate=0.001)
    loss = 'binary_crossentropy'
    weighted_metrics = [AUC(curve='PR', name='avg_precision')]
    model.compile(
        loss=loss,
        optimizer=optimizer,
        weighted_metrics=weighted_metrics,
    )
    return model

def train_and_evaluate_cnn_tf(model, X_train, X_test, y_train, y_test, model_name, fold, batch_size, num_epochs):
    X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
    X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
    y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
    y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.Recall()])

    es = EarlyStopping(patience=5, monitor="val_loss", mode='min', verbose=1)
    checkpoint = ModelCheckpoint(monitor='val_loss', filepath=f"{model_name}.h5",
                                 save_best_only=True, save_weights_only=True, mode='min')
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=5, verbose=1)

    model.fit(X_train_tensor, y_train_tensor, validation_data=(X_test_tensor, y_test_tensor),
              epochs=num_epochs, callbacks=[checkpoint, reduce_lr_loss, es],
              batch_size=batch_size, verbose=1)

    y_pred_proba = model.predict(X_test_tensor)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
    accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
    precision = precision_score(y_test_tensor.numpy(), y_pred)
    recall = recall_score(y_test_tensor.numpy(), y_pred)
    f1 = f1_score(y_test_tensor.numpy(), y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, precision, recall, f1, map_score

def get_model(model_type, input_len=None, num_classes=None):
    if model_type == 'random_forest':
        return RandomForestClassifier(**rf_params)
    elif model_type == 'xgboost':
        return XGBClassifier(**xgb_params)
    elif model_type == 'catboost':
        return CatBoostClassifier(**catboost_params)
    elif model_type == 'cnn':
        return OneDCNN_model(input_len, num_classes)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

def monitor_memory(threshold=0.80):
    memory_info = psutil.virtual_memory()
    return memory_info.percent / 100 >= threshold

# ECFP 생성 함수
def generate_ecfp(smiles, radius=2, bits=1024):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return [0] * bits
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

def process_batch(df):
    # 각 SMILES 열에 대해 ECFP 생성
    df['molecule_ecfp'] = df['molecule_smiles'].apply(generate_ecfp)
    df['buildingblock1_ecfp'] = df['buildingblock1_smiles'].apply(generate_ecfp)
    df['buildingblock2_ecfp'] = df['buildingblock2_smiles'].apply(generate_ecfp)
    df['buildingblock3_ecfp'] = df['buildingblock3_smiles'].apply(generate_ecfp)
    
    # 필요한 열만 포함된 DataFrame 반환
    return df[['id', 'protein_name', 'molecule_ecfp', 'buildingblock1_ecfp', 'buildingblock2_ecfp', 'buildingblock3_ecfp', 'binds']]

def preprocess_and_save_ecfp(input_path, output_path, batch_size=32768):
    reader = pq.ParquetFile(input_path)
    
    # 적절한 스키마로 Parquet writer 초기화
    schema = pa.schema([
        ('id', pa.int32()),
        ('molecule_ecfp', pa.list_(pa.int32())),
        ('buildingblock1_ecfp', pa.list_(pa.int32())),
        ('buildingblock2_ecfp', pa.list_(pa.int32())),
        ('buildingblock3_ecfp', pa.list_(pa.int32())),
        ('protein_name', pa.string()),
        ('binds', pa.int32())  # test할 때는 제외
    ])
    
    with pq.ParquetWriter(output_path, schema) as writer:
        total_batches = reader.metadata.num_row_groups
        
        with tqdm(total=total_batches, desc="Processing", unit="batch", leave=True) as pbar:
            for batch in reader.iter_batches(batch_size=batch_size):
                df_batch = batch.to_pandas()
                
                processed_batch = process_batch(df_batch)
                
                # 처리된 DataFrame을 Arrow Table로 변환하여 파일에 작성
                table = pa.Table.from_pandas(processed_batch, schema=schema)
                writer.write_table(table)
                
                # 진행 상황 업데이트
                pbar.update(1)

                # 주기적으로 가비지 컬렉션 호출
                gc.collect()

def process_protein_batches(protein, model_types, input_parquet_path, num_folds, ecfp=False):
    con = duckdb.connect()
    total_rows = con.execute(f"SELECT COUNT(*) FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]
    print(total_rows)
    batch_size = 100000

    for offset in range(0, total_rows, batch_size):
        filtered_df = con.execute(f"""
        SELECT * FROM parquet_scan('{input_parquet_path}') 
        WHERE protein_name = '{protein}'
        LIMIT {batch_size} OFFSET {offset}
        """).df()

        print(f"Processing {protein} batch starting at offset {offset}...")

        if ecfp:
            filtered_df = process_batch(filtered_df)

        X = np.concatenate([
            np.array(filtered_df['molecule_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
        ], axis=1)
        y = filtered_df['binds'].tolist()

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

        for model_type in model_types:
            print(f"Processing {protein} with {model_type.upper()}...")
            process_model_type(model_type, skf, X, y, protein, offset)

    con.close()

def process_model_type(model_type, skf, X, y, protein, offset):
    precision_records = []
    best_map_score = 0
    best_model = None
    best_fold = None

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Fold {fold + 1} for {protein} with {model_type.upper()} batch {offset}...")

        model = get_model(model_type)
        model_filename = f"{protein}_{model_type}_model_fold_{fold}.pkl"

        try:
            model = joblib.load(model_filename)
            print(f"Loaded existing {model_type.upper()} model for {protein}")
        except FileNotFoundError:
            print(f"Initialized new {model_type.upper()} model for {protein}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = np.array(y)[train_idx], np.array(y)[val_idx]

        if model_type in ['xgboost', 'catboost']:
            model, map_score, accuracy, precision, recall, f1 = train_and_evaluate_model(model, X_train, X_val, y_train, y_val, f"{protein} - {model_type.upper()}", fold + 1, model_type)
        elif model_type == 'random_forest':
            model, map_score, accuracy, precision, recall, f1 = train_and_evaluate_model_rf(model, X_train, X_val, y_train, y_val, f"{protein} - {model_type.upper()}", fold + 1)

        precision_records.append((protein, model_type, 'N/A', precision, recall, f1, map_score, offset, fold + 1))

        if map_score > best_map_score:
            best_map_score = map_score
            best_model = model
            best_fold = fold

        if monitor_memory():
            print("Memory usage exceeded threshold. Clearing memory...")
            clear_memory()

        joblib.dump(model, model_filename)
        print(f"Model saved: {model_filename}")

    if best_model is not None:
        best_model_filename = f"{protein}_{model_type}_best_model.pkl"
        joblib.dump(best_model, best_model_filename)
        print(f"Best model saved: {best_model_filename}")

def main():
    setup_gpu()
    protein_names = ['BRD4', 'HSA', 'sEH']
    model_types = ['catboost']
    num_folds = 5
    input_parquet_path = 'processed_merged_queried_data.parquet'
    ecfp = True  # ECFP 생성 여부 설정

    if ecfp:
        output_parquet_path = 'processed_merged_queried_ecfp_data.parquet'
        preprocess_and_save_ecfp(input_parquet_path, output_parquet_path)
        input_parquet_path = output_parquet_path

    for protein in protein_names:
        process_protein_batches(protein, model_types, input_parquet_path, num_folds, ecfp=ecfp)

if __name__ == "__main__":
    main()


train & catboost & 1DCNN 학습모델 & 
fold 개선& 모듈화 - 240702실행 성공

In [6]:
import duckdb
import pandas as pd
import numpy as np
import joblib
import cupy as cp
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Conv1D, GlobalMaxPooling1D, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import psutil

# 파라미터 설정
xgb_params = {
    'n_estimators': 100,
    'device': 'cuda',
    'eta': 0.1,
    'max_depth': 14,
    'updater': 'grow_gpu_hist',
    'refresh_leaf': 1,
    'process_type': 'default',
    'use_label_encoder': False,
    'objective': 'binary:logistic',
    'eval_metric': 'error'
}

rf_params = {
    'n_estimators': 100,
    'n_jobs': -1
}

catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 10,
    'task_type': 'GPU',
    'verbose': 100
}

def setup_gpu():
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        try:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            print("TensorFlow GPU memory growth enabled")
        except RuntimeError as e:
            print(e)
    else:
        print("TensorFlow GPU not available")

def clear_memory():
    gc.collect()
    print("Memory cleared.")

def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, fold, model_type):
    X_train = cp.asnumpy(X_train)
    X_test = cp.asnumpy(X_test)

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    map_score = average_precision_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, map_score, accuracy, precision, recall, f1

def train_and_evaluate_model_rf(model, X_train, X_test, y_train, y_test, model_name, fold):
    X_train_np = cp.asnumpy(X_train)
    X_test_np = cp.asnumpy(X_test)

    model.fit(X_train_np, y_train)
    y_pred_proba = model.predict_proba(X_test_np)[:, 1]
    y_pred = model.predict(X_test_np)
    y_pred_proba_np = cp.asnumpy(y_pred_proba)
    y_pred_np = cp.asnumpy(y_pred)

    map_score = average_precision_score(y_test, y_pred_proba_np)
    accuracy = accuracy_score(y_test, y_pred_np)
    precision = precision_score(y_test, y_pred_np)
    recall = recall_score(y_test, y_pred_np)
    f1 = f1_score(y_test, y_pred_np)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    del X_train_np, X_test_np, y_pred_proba, y_pred
    cp._default_memory_pool.free_all_blocks()
    clear_memory()
    return model, map_score, accuracy, precision, recall, f1

def train_and_evaluate_lstm_tf(model, X_train, X_test, y_train, y_test, model_name, fold, batch_size, num_epochs):
    X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
    X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
    y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
    y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

    model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.Recall()])

    model.fit(X_train_tensor, y_train_tensor, epochs=num_epochs, batch_size=batch_size, verbose=1)

    y_pred_proba = model.predict(X_test_tensor)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
    accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
    precision = precision_score(y_test_tensor.numpy(), y_pred)
    recall = recall_score(y_test_tensor.numpy(), y_pred)
    f1 = f1_score(y_test_tensor.numpy(), y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, precision, recall, f1, map_score

def OneDCNN_model(input_len, num_classes):
    hidden_dim = 128
    num_filters = 32

    inputs = tf.keras.layers.Input(shape=(input_len,))
    x = Embedding(input_dim=36, output_dim=hidden_dim, input_length=input_len, mask_zero=True)(inputs)
    x = Conv1D(filters=num_filters, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = Conv1D(filters=num_filters*2, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = Conv1D(filters=num_filters*3, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = GlobalMaxPooling1D()(x)

    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_classes, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(learning_rate=0.001)
    loss = 'binary_crossentropy'
    weighted_metrics = [AUC(curve='PR', name='avg_precision')]
    model.compile(
        loss=loss,
        optimizer=optimizer,
        weighted_metrics=weighted_metrics,
    )
    return model

def train_and_evaluate_cnn_tf(model, X_train, X_test, y_train, y_test, model_name, fold, batch_size, num_epochs):
    X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
    X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
    y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
    y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.Recall()])

    es = EarlyStopping(patience=5, monitor="val_loss", mode='min', verbose=1)
    checkpoint = ModelCheckpoint(monitor='val_loss', filepath=f"{model_name}.h5",
                                 save_best_only=True, save_weights_only=True, mode='min')
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=5, verbose=1)

    model.fit(X_train_tensor, y_train_tensor, validation_data=(X_test_tensor, y_test_tensor),
              epochs=num_epochs, callbacks=[checkpoint, reduce_lr_loss, es],
              batch_size=batch_size, verbose=1)

    y_pred_proba = model.predict(X_test_tensor)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
    accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
    precision = precision_score(y_test_tensor.numpy(), y_pred)
    recall = recall_score(y_test_tensor.numpy(), y_pred)
    f1 = f1_score(y_test_tensor.numpy(), y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, precision, recall, f1, map_score

def get_model(model_type, input_len=None, num_classes=None):
    if model_type == 'random_forest':
        return RandomForestClassifier(**rf_params)
    elif model_type == 'xgboost':
        return XGBClassifier(**xgb_params)
    elif model_type == 'catboost':
        return CatBoostClassifier(**catboost_params)
    elif model_type == 'cnn':
        return OneDCNN_model(input_len, num_classes)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

def monitor_memory(threshold=0.80):
    memory_info = psutil.virtual_memory()
    return memory_info.percent / 100 >= threshold

def process_protein_batches(protein, model_types, input_parquet_path, num_folds):
    con = duckdb.connect()
    total_rows = con.execute(f"SELECT COUNT(*) FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]
    print(total_rows)
    batch_size = 100000

    for offset in range(0, total_rows, batch_size):
        filtered_df = con.execute(f"""
        SELECT * FROM parquet_scan('{input_parquet_path}') 
        WHERE protein_name = '{protein}'
        LIMIT {batch_size} OFFSET {offset}
        """).df()

        print(f"Processing {protein} batch starting at offset {offset}...")

        X = np.concatenate([
            np.array(filtered_df['molecule_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
        ], axis=1)
        y = filtered_df['binds'].tolist()

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

        for model_type in model_types:
            print(f"Processing {protein} with {model_type.upper()}...")
            process_model_type(model_type, skf, X, y, protein, offset)

    con.close()

def process_protein_batches_type2(protein, model_types, input_parquet_path, num_folds):
    con = duckdb.connect()
    total_rows = con.execute(f"SELECT COUNT(*) FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]
    print(total_rows)
    batch_size = 99999

    for offset in range(0, total_rows, batch_size):
        filtered_df = con.execute(f"""
        SELECT * FROM parquet_scan('{input_parquet_path}') 
        LIMIT {batch_size} OFFSET {offset}
        """).df()

        print(f"Processing {protein} batch starting at offset {offset}...")
        
        binds_1 = filtered_df[filtered_df['bind'] == 1]
        binds_0 = filtered_df[filtered_df['bind'] == 0].sample(n=len(binds_1) * 2, random_state=42)

        sampled_df = pd.concat([binds_1, binds_0]).sample(frac=1, random_state=42).reset_index(drop=True)
        
        X = np.array(sampled_df[[f'enc{i}' for i in range(142)]].tolist(), dtype=np.float32)
        y = sampled_df['bind'].tolist()

        # 명시적으로 메모리 해제
        del filtered_df, binds_1, binds_0, sampled_df
        clear_memory()

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

        for model_type in model_types:
            print(f"Processing {protein} with {model_type.upper()}...")
            process_model_type(model_type, skf, X, y, protein, offset)
    con.close()

def process_model_type(model_type, skf, X, y, protein, offset):
    precision_records = []
    best_map_score = 0
    best_model = None
    best_fold = None

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Fold {fold + 1} for {protein} with {model_type.upper()} batch {offset}...")

        model = get_model(model_type)
        model_filename = f"{protein}_{model_type}_model_fold_{fold}.pkl"

        try:
            model = joblib.load(model_filename)
            print(f"Loaded existing {model_type.upper()} model for {protein}")
        except FileNotFoundError:
            print(f"Initialized new {model_type.upper()} model for {protein}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = np.array(y)[train_idx], np.array(y)[val_idx]

        if model_type in ['xgboost', 'catboost']:
            model, map_score, accuracy, precision, recall, f1 = train_and_evaluate_model(model, X_train, X_val, y_train, y_val, f"{protein} - {model_type.upper()}", fold + 1, model_type)
        elif model_type == 'random_forest':
            model, map_score, accuracy, precision, recall, f1 = train_and_evaluate_model_rf(model, X_train, X_val, y_train, y_val, f"{protein} - {model_type.upper()}", fold + 1)

        precision_records.append((protein, model_type, 'N/A', precision, recall, f1, map_score, offset, fold + 1))

        if map_score > best_map_score:
            best_map_score = map_score
            best_model = model
            best_fold = fold

        if monitor_memory():
            print("Memory usage exceeded threshold. Clearing memory...")
            clear_memory()

        joblib.dump(model, model_filename)
        print(f"Model saved: {model_filename}")

    if best_model is not None:
        best_model_filename = f"{protein}_{model_type}_best_model.pkl"
        joblib.dump(best_model, best_model_filename)
        print(f"Best model saved: {best_model_filename}")

def main():
    setup_gpu()
    protein_names = ['BRD4', 'HSA', 'sEH']
    model_types = ['catboost']
    num_folds = 5
    input_parquet_path = 'processed_merged_queried_data.parquet'

    for protein in protein_names:
        # process_protein_batches(protein, model_types, input_parquet_path, num_folds)
        if model_types != "transformer":
            process_protein_batches(protein, model_types, input_parquet_path, num_folds)
        else:
            input_parquet_path = f"train_enc_{protein}.parquet"
            process_protein_batches_type2(protein, model_types, input_parquet_path, num_folds)


if __name__ == "__main__":
    main()


TensorFlow GPU memory growth enabled
1521460
Processing BRD4 batch starting at offset 0...
Processing BRD4 with XGBOOST...
Fold 1 for BRD4 with XGBOOST batch 0...
Loaded existing XGBOOST model for BRD4


Parameters: { "use_label_encoder" } are not used.



KeyboardInterrupt: 

train & catboost & 1DCNN 학습모델 & 
fold 개선& 모듈화 - 240702실행 성공 + single fold transformer -> 실패
lightGBM 추가 시도

In [12]:
import duckdb
import pandas as pd
import numpy as np
import joblib
import cupy as cp
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Conv1D, GlobalMaxPooling1D, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
import psutil

# 파라미터 설정
xgb_params = {
    'n_estimators': 100,
    'device': 'cuda',
    'eta': 0.1,
    'max_depth': 14,
    'updater': 'grow_gpu_hist',
    'refresh_leaf': 1,
    'process_type': 'default',
    'use_label_encoder': False,
    'objective': 'binary:logistic',
    'eval_metric': 'error'
}

rf_params = {
    'n_estimators': 100,
    'n_jobs': -1
}

catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 10,
    'task_type': 'GPU',
    'verbose': 100
}

lgbm_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'num_leaves': 31,
    'objective': 'binary',
    'device': 'gpu'
}

def setup_gpu():
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        try:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            print("TensorFlow GPU memory growth enabled")
        except RuntimeError as e:
            print(e)
    else:
        print("TensorFlow GPU not available")

def clear_memory():
    gc.collect()
    print("Memory cleared.")

def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, fold, model_type):
    X_train = cp.asnumpy(X_train)
    X_test = cp.asnumpy(X_test)

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    map_score = average_precision_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, map_score, accuracy, precision, recall, f1

def train_and_evaluate_model_rf(model, X_train, X_test, y_train, y_test, model_name, fold):
    X_train_np = cp.asnumpy(X_train)
    X_test_np = cp.asnumpy(X_test)

    model.fit(X_train_np, y_train)
    y_pred_proba = model.predict_proba(X_test_np)[:, 1]
    y_pred = model.predict(X_test_np)
    y_pred_proba_np = cp.asnumpy(y_pred_proba)
    y_pred_np = cp.asnumpy(y_pred)

    map_score = average_precision_score(y_test, y_pred_proba_np)
    accuracy = accuracy_score(y_test, y_pred_np)
    precision = precision_score(y_test, y_pred_np)
    recall = recall_score(y_test, y_pred_np)
    f1 = f1_score(y_test, y_pred_np)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    del X_train_np, X_test_np, y_pred_proba, y_pred
    cp._default_memory_pool.free_all_blocks()
    clear_memory()
    return model, map_score, accuracy, precision, recall, f1

def train_and_evaluate_lstm_tf(model, X_train, X_test, y_train, y_test, model_name, fold, batch_size, num_epochs):
    X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
    X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
    y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
    y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

    model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.Recall()])

    model.fit(X_train_tensor, y_train_tensor, epochs=num_epochs, batch_size=batch_size, verbose=1)

    y_pred_proba = model.predict(X_test_tensor)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
    accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
    precision = precision_score(y_test_tensor.numpy(), y_pred)
    recall = recall_score(y_test_tensor.numpy(), y_pred)
    f1 = f1_score(y_test_tensor.numpy(), y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, precision, recall, f1, map_score

def OneDCNN_model(input_len, num_classes):
    hidden_dim = 128
    num_filters = 32

    inputs = tf.keras.layers.Input(shape=(input_len,))
    x = Embedding(input_dim=36, output_dim=hidden_dim, input_length=input_len, mask_zero=True)(inputs)
    x = Conv1D(filters=num_filters, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = Conv1D(filters=num_filters*2, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = Conv1D(filters=num_filters*3, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    
    x = GlobalMaxPooling1D()(x)

    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_classes, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(learning_rate=0.001)
    loss = 'binary_crossentropy'
    weighted_metrics = [AUC(curve='PR', name='avg_precision')]
    model.compile(
        loss=loss,
        optimizer=optimizer,
        weighted_metrics=weighted_metrics,
    )
    return model

def train_and_evaluate_cnn_tf(model, X_train, X_test, y_train, y_test, model_name, fold, batch_size, num_epochs):
    X_train_tensor = tf.convert_to_tensor(X_train, dtype=tf.float32)
    X_test_tensor = tf.convert_to_tensor(X_test, dtype=tf.float32)
    y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
    y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float32)

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.Recall()])

    es = EarlyStopping(patience=5, monitor="val_loss", mode='min', verbose=1)
    checkpoint = ModelCheckpoint(monitor='val_loss', filepath=f"{model_name}.h5",
                                 save_best_only=True, save_weights_only=True, mode='min')
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.05, patience=5, verbose=1)

    model.fit(X_train_tensor, y_train_tensor, validation_data=(X_test_tensor, y_test_tensor),
              epochs=num_epochs, callbacks=[checkpoint, reduce_lr_loss, es],
              batch_size=batch_size, verbose=1)

    y_pred_proba = model.predict(X_test_tensor)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    map_score = average_precision_score(y_test_tensor.numpy(), y_pred)
    accuracy = accuracy_score(y_test_tensor.numpy(), y_pred)
    precision = precision_score(y_test_tensor.numpy(), y_pred)
    recall = recall_score(y_test_tensor.numpy(), y_pred)
    f1 = f1_score(y_test_tensor.numpy(), y_pred)
    print(f"{model_name} - Fold {fold} - Mean Average Precision (mAP): {map_score:.5f}")
    print(f"{model_name} - Fold {fold} - Precision: {precision:.5f}")
    print(f"{model_name} - Fold {fold} - Recall: {recall:.5f}")
    print(f"{model_name} - Fold {fold} - F1 Score: {f1:.5f}")

    clear_memory()
    return model, precision, recall, f1, map_score

def get_model(model_type, input_len=None, num_classes=None):
    if model_type == 'random_forest':
        return RandomForestClassifier(**rf_params)
    elif model_type == 'xgboost':
        return XGBClassifier(**xgb_params)
    elif model_type == 'catboost':
        return CatBoostClassifier(**catboost_params)
    elif model_type == 'cnn':
        return OneDCNN_model(input_len, num_classes)
    elif model_type == 'lgbm':
        return LGBMClassifier(**lgbm_params)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

def monitor_memory(threshold=0.80):
    memory_info = psutil.virtual_memory()
    return memory_info.percent / 100 >= threshold

def process_protein_batches(protein, model_types, input_parquet_path, num_folds):
    con = duckdb.connect()
    total_rows = con.execute(f"SELECT COUNT(*) FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]
    print(total_rows)
    batch_size = 100000

    for offset in range(0, total_rows, batch_size):
        filtered_df = con.execute(f"""
        SELECT * FROM parquet_scan('{input_parquet_path}') 
        WHERE protein_name = '{protein}'
        LIMIT {batch_size} OFFSET {offset}
        """).df()

        print(f"Processing {protein} batch starting at offset {offset}...")

        X = np.concatenate([
            np.array(filtered_df['molecule_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
            np.array(filtered_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
        ], axis=1)
        y = filtered_df['binds'].tolist()

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

        for model_type in model_types:
            print(f"Processing {protein} with {model_type.upper()}...")
            process_model_type(model_type, skf, X, y, protein, offset)

    con.close()

def process_protein_batches_type2(protein, model_types, input_parquet_path, num_folds):
    con = duckdb.connect()
    total_rows = con.execute(f"SELECT COUNT(*) FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]
    print(total_rows)
    batch_size = 99999

    for offset in range(0, total_rows, batch_size):
        filtered_df = con.execute(f"""
        SELECT * FROM parquet_scan('{input_parquet_path}') 
        LIMIT {batch_size} OFFSET {offset}
        """).df()

        print(f"Processing {protein} batch starting at offset {offset}...")
        
        binds_1 = filtered_df[filtered_df['bind'] == 1]
        binds_0 = filtered_df[filtered_df['bind'] == 0].sample(n=len(binds_1) * 2, random_state=42)

        sampled_df = pd.concat([binds_1, binds_0]).sample(frac=1, random_state=42).reset_index(drop=True)
        
        X = np.array(sampled_df[[f'enc{i}' for i in range(142)]].tolist(), dtype=np.float32)
        y = sampled_df['bind'].tolist()

        # 명시적으로 메모리 해제
        del filtered_df, binds_1, binds_0, sampled_df
        clear_memory()

        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

        for model_type in model_types:
            print(f"Processing {protein} with {model_type.upper()}...")
            process_model_type(model_type, skf, X, y, protein, offset)
    con.close()

def process_model_type(model_type, skf, X, y, protein, offset):
    precision_records = []
    best_map_score = 0
    best_model = None
    best_fold = None

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Fold {fold + 1} for {protein} with {model_type.upper()} batch {offset}...")

        model = get_model(model_type)
        model_filename = f"{protein}_{model_type}_model_fold_{fold}.pkl"

        try:
            model = joblib.load(model_filename)
            print(f"Loaded existing {model_type.upper()} model for {protein}")
        except FileNotFoundError:
            print(f"Initialized new {model_type.upper()} model for {protein}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = np.array(y)[train_idx], np.array(y)[val_idx]

        if model_type in ['xgboost', 'catboost', 'lgbm']:
            model, map_score, accuracy, precision, recall, f1 = train_and_evaluate_model(model, X_train, X_val, y_train, y_val, f"{protein} - {model_type.upper()}", fold + 1, model_type)
        elif model_type == 'random_forest':
            model, map_score, accuracy, precision, recall, f1 = train_and_evaluate_model_rf(model, X_train, X_val, y_train, y_val, f"{protein} - {model_type.upper()}", fold + 1)

        precision_records.append((protein, model_type, 'N/A', precision, recall, f1, map_score, offset, fold + 1))

        if map_score > best_map_score:
            best_map_score = map_score
            best_model = model
            best_fold = fold

        if monitor_memory():
            print("Memory usage exceeded threshold. Clearing memory...")
            clear_memory()

        joblib.dump(model, model_filename)
        print(f"Model saved: {model_filename}")

    if best_model is not None:
        best_model_filename = f"{protein}_{model_type}_best_model.pkl"
        joblib.dump(best_model, best_model_filename)
        print(f"Best model saved: {best_model_filename}")

def main():
    setup_gpu()
    protein_names = ['BRD4', 'HSA', 'sEH']
    model_types = ['lgbm']#'catboost', 
    num_folds = 5
    input_parquet_path = 'processed_merged_queried_data.parquet'

    for protein in protein_names:
        # process_protein_batches(protein, model_types, input_parquet_path, num_folds)
        if model_types != "transformer":
            process_protein_batches(protein, model_types, input_parquet_path, num_folds)
        else:
            input_parquet_path = f"train_enc_{protein}.parquet"
            process_protein_batches_type2(protein, model_types, input_parquet_path, num_folds)


if __name__ == "__main__":
    main()


TensorFlow GPU memory growth enabled
1521460
Processing BRD4 batch starting at offset 0...
Processing BRD4 with LGBM...
Fold 1 for BRD4 with LGBM batch 0...
Initialized new LGBM model for BRD4
[LightGBM] [Info] Number of positive: 24015, number of negative: 55985
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 7322
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 3661
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 39 dense feature groups (3.05 MB) transferred to GPU in 0.006602 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.300187 -> initscore=-0.846405
[LightGBM] [Info] Start training from score -0.846405
BRD4 - LGBM - Fold 1 - Mean Average Precision (mAP): 0.94730
BR

test코드 1DCNN 추가 + transformer(취소)
lgbm 추가

In [2]:
import duckdb
import pandas as pd
import numpy as np
import joblib
import cupy as cp
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Conv1D, GlobalMaxPooling1D, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# GPU 설정 확인 및 메모리 사용량 조절
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        print("TensorFlow GPU memory growth enabled")
    except RuntimeError as e:
        print(e)
else:
    print("TensorFlow GPU not available")

# 데이터베이스 연결
con = duckdb.connect(database=':memory:')

# 테스트 데이터 경로 설정
test_parquet_path = 'test_processed_data.parquet'

# 각 protein에 대한 최적의 조합을 설정
best_combinations = {
    'BRD4': [('elu', 'relu')],
    'HSA': [('relu', 'elu')],
    'sEH': [('elu', 'relu')]
}

# 사용할 모델 종류 지정
model_types = ['lgbm']  # 여기에 LGBM 모델을 추가했습니다'catboost', 'cnn', 

# 결과 저장을 위한 리스트
results = []

batch_size = 20000  # 적절한 배치 크기로 설정

# 메모리 정리 함수
def clear_memory():
    gc.collect()
    cp._default_memory_pool.free_all_blocks()
    print("Memory cleared.")

# 1D-CNN 모델 정의 함수
def OneDCNN_model(input_len, num_classes):
    hidden_dim = 128
    num_filters = 32

    inputs = tf.keras.layers.Input(shape=(input_len,))
    x = Embedding(input_dim=36, output_dim=hidden_dim, input_length=input_len, mask_zero=True)(inputs)
    x = Conv1D(filters=num_filters, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    x = Conv1D(filters=num_filters*2, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    x = Conv1D(filters=num_filters*3, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_classes, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(learning_rate=0.001)
    loss = 'binary_crossentropy'
    weighted_metrics = [tf.keras.metrics.AUC(curve='PR', name='avg_precision')]
    model.compile(
        loss=loss,
        optimizer=optimizer,
        weighted_metrics=weighted_metrics,
    )
    return model

for protein in best_combinations.keys():
    # 총 행 수 계산
    total_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{test_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]

    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch_df = con.execute(f"""
            SELECT * FROM read_parquet('{test_parquet_path}')
            WHERE protein_name = '{protein}'
            LIMIT {batch_size} OFFSET {start}
        """).df()

        # 테스트 데이터 전처리
        X_test = np.concatenate([
            np.array(batch_df['molecule_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
        ], axis=1)

        for model_type in model_types:
            for activation_1, activation_2 in best_combinations[protein]:
                if model_type == 'xgboost':
                    model = XGBClassifier()
                    model_filename = f"{protein}_xgboost_model.pkl"
                elif model_type == 'randomforest':
                    model = RandomForestClassifier()
                    model_filename = f"{protein}_rf_model_iteration_1.pkl"
                elif model_type == 'catboost':
                    model = CatBoostClassifier()
                    model_filename = f"{protein}_catboost_model_iteration_1.pkl"
                elif model_type == 'lgbm':
                    model = LGBMClassifier()
                    model_filename = f"{protein}_lgbm_best_model.pkl"
                elif model_type == 'lstm':
                    input_dim = 1024 * 4  # ECFP 길이에 맞게 조정
                    hidden_dim = 128
                    output_dim = 1

                    model = Sequential([
                        Input(shape=(None, input_dim)),
                        LSTM(hidden_dim * 8, return_sequences=True, activation=activation_1, dropout=0.05),
                        LSTM(hidden_dim * 4, return_sequences=True, activation=activation_2, dropout=0.05),
                        LSTM(hidden_dim * 1, return_sequences=False, activation=activation_1, dropout=0.05),
                        Dense(output_dim, activation='sigmoid')
                    ])
                    model_filename = f"{protein}_lstm_model_{activation_1}_{activation_2}_{activation_1}_iteration_1.weights.h5"
                elif model_type == 'cnn':
                    input_len = 1024 * 4
                    num_classes = 1
                    model = OneDCNN_model(input_len, num_classes)
                    # model_filename = f"{protein}_cnn_model_iteration_1.h5"
                    model_filename = f"{protein} - CNN.h5"

                # 모델 로드
                if model_type in ['lstm', 'cnn']:
                    model.load_weights(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein} with {activation_1} and {activation_2}")
                else:
                    model = joblib.load(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein}")

                # 예측 수행
                if model_type == 'lstm':
                    X_test_lstm = np.expand_dims(X_test, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
                    y_pred = model.predict(X_test_lstm)
                elif model_type == 'randomforest':
                    y_pred = model.predict_proba(X_test)[:, 1]
                elif model_type == 'xgboost':
                    X_test_gpu = cp.array(X_test)
                    y_pred = model.predict_proba(X_test_gpu)[:, 1]
                    y_pred = cp.asnumpy(y_pred)
                elif model_type == 'catboost':
                    y_pred = model.predict_proba(X_test)[:, 1]
                elif model_type == 'lgbm':
                    y_pred = model.predict_proba(X_test)[:, 1]
                elif model_type == 'cnn':
                    y_pred = model.predict(X_test)

                # 결과 저장
                test_results = pd.DataFrame({
                    'id': batch_df['id'],
                    'binds': y_pred.flatten()
                })
                results.append(test_results)

                # 메모리 정리
                clear_memory()

# 모든 결과를 하나의 DataFrame으로 결합
final_results_df = pd.concat(results, ignore_index=True)

# 결과 저장
final_results_df.to_csv('test_results.csv', index=False)
print("Test results saved to 'test_results.csv'")

con.close()


TensorFlow GPU memory growth enabled
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model for BRD4
Memory cleared.
Loaded LGBM model

Singlefold transformer 시도

In [1]:
import duckdb
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import os
import gc

from sklearn.model_selection import train_test_split
from datetime import datetime
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

# Conv1dBnRelu 정의
class Conv1dBnRelu(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, is_bn=True):
        super(Conv1dBnRelu, self).__init__()
        layers = [
            nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
        ]
        if is_bn:
            layers.append(nn.BatchNorm1d(out_channels))
        layers.append(nn.ReLU(inplace=True))
        self.conv = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.conv(x)

# Tokenization ====================================
MOLECULE_DICT = {
    'l': 1, 'y': 2, '@': 3, '3': 4, 'H': 5, 'S': 6, 'F': 7, 'C': 8, 'r': 9, 's': 10, '/': 11, 'c': 12, 'o': 13,
    '+': 14, 'I': 15, '5': 16, '(': 17, '2': 18, ')': 19, '9': 20, 'i': 21, '#': 22, '6': 23, '8': 24, '4': 25,
    '=': 26, '1': 27, 'O': 28, '[': 29, 'D': 30, 'B': 31, ']': 32, 'N': 33, '7': 34, 'n': 35, '-': 36
}
MAX_MOLECULE_ID = np.max(list(MOLECULE_DICT.values()))
VOCAB_SIZE = MAX_MOLECULE_ID + 10
UNK = 255  # disallow: will cause error
BOS = MAX_MOLECULE_ID + 1
EOS = MAX_MOLECULE_ID + 2
PAD = 0
MAX_LENGTH = 160

MOLECULE_LUT = np.full(256, fill_value=UNK, dtype=np.uint8)
for k, v in MOLECULE_DICT.items():
    ascii = ord(k)
    MOLECULE_LUT[ascii] = v

def make_token(s):
    t = np.frombuffer(s, np.uint8)
    t = MOLECULE_LUT[t]
    t = t.tolist()
    
    # Ensure token ids are within VOCAB_SIZE
    t = [tok for tok in t if tok < VOCAB_SIZE]
    
    # Truncate if length is greater than MAX_LENGTH - 2
    if len(t) > MAX_LENGTH - 2:
        t = t[:MAX_LENGTH - 2]
    
    L = len(t) + 2
    token_id = [BOS] + t + [EOS] + [PAD] * (MAX_LENGTH - L)
    token_mask = [1] * L + [0] * (MAX_LENGTH - L)
    
    return token_id, token_mask

def load_and_sample_data_batch(filepath, protein_type, ratio=1/3, batch_size=200001, random_state=42):
    conn = duckdb.connect(database=':memory:')
    conn.execute(f"INSTALL 'parquet';")
    conn.execute(f"LOAD 'parquet';")
    
    print(f"Loading data for protein type: {protein_type}, batch size: {batch_size}")
    
    pos_sample_size = int(batch_size * ratio)
    neg_sample_size = batch_size - pos_sample_size
    
    pos_query = f"""
    SELECT * FROM read_parquet('{filepath}')
    WHERE protein_name = '{protein_type}' AND binds = 1
    ORDER BY RANDOM()
    LIMIT {pos_sample_size};
    """
    
    neg_query = f"""
    SELECT * FROM read_parquet('{filepath}')
    WHERE protein_name = '{protein_type}' AND binds = 0
    ORDER BY RANDOM()
    LIMIT {neg_sample_size};
    """
    
    pos_df = conn.execute(pos_query).df()
    neg_df = conn.execute(neg_query).df()
    
    sampled_df = pd.concat([pos_df, neg_df]).sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    del pos_df, neg_df
    gc.collect()
    
    return sampled_df

# 데이터셋 클래스 정의
class SMILESDataSet(Dataset):
    def __init__(self, dataframe, max_length=160):
        self.dataframe = dataframe
        self.max_length = max_length

        # 열 이름을 출력하여 확인
        print("Dataframe columns:", self.dataframe.columns)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # 각각의 SMILES 문자열을 개별적으로 가져옵니다.
        buildingblock1_smiles = row['buildingblock1_smiles']
        buildingblock2_smiles = row['buildingblock2_smiles']
        buildingblock3_smiles = row['buildingblock3_smiles']
        molecule_smiles = row['molecule_smiles']

        # 각각의 SMILES 문자열을 개별적으로 토큰화합니다.
        token_id_1, token_mask_1 = make_token(buildingblock1_smiles.encode('utf-8'))
        token_id_2, token_mask_2 = make_token(buildingblock2_smiles.encode('utf-8'))
        token_id_3, token_mask_3 = make_token(buildingblock3_smiles.encode('utf-8'))
        token_id_molecule, token_mask_molecule = make_token(molecule_smiles.encode('utf-8'))

        target = row['binds']

        return {
            'buildingblock1_smiles_token_id': torch.tensor(token_id_1, dtype=torch.long),
            'buildingblock1_smiles_token_mask': torch.tensor(token_mask_1, dtype=torch.long),
            'buildingblock2_smiles_token_id': torch.tensor(token_id_2, dtype=torch.long),
            'buildingblock2_smiles_token_mask': torch.tensor(token_mask_2, dtype=torch.long),
            'buildingblock3_smiles_token_id': torch.tensor(token_id_3, dtype=torch.long),
            'buildingblock3_smiles_token_mask': torch.tensor(token_mask_3, dtype=torch.long),
            'molecule_smiles_token_id': torch.tensor(token_id_molecule, dtype=torch.long),
            'molecule_smiles_token_mask': torch.tensor(token_mask_molecule, dtype=torch.long),
            'bind': torch.tensor(target, dtype=torch.float)
        }

# 모델 정의
class FlashAttentionTransformerEncoder(nn.Module):
    def __init__(
            self,
            dim_model,
            num_layers,
            num_heads,
            dim_feedforward,
            dropout=0.0,
            norm_first=False,
            activation=F.gelu,
            rotary_emb_dim=0,
    ):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_model,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            batch_first=True,
            norm_first=norm_first
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
    
    def forward(self, x, src_key_padding_mask):
        return self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=256):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        embed_dim = 512
        self.output_type = ['infer', 'loss']
        self.pe = PositionalEncoding(embed_dim, max_len=256)
        self.embedding = nn.Embedding(VOCAB_SIZE, 64, padding_idx=PAD)
        self.conv_embedding = nn.Sequential(
            Conv1dBnRelu(64, embed_dim, kernel_size=3, stride=1, padding=1, is_bn=True),
        )
        self.tx_encoder = FlashAttentionTransformerEncoder(
            dim_model=embed_dim,
            num_heads=8,
            dim_feedforward=embed_dim * 4,
            dropout=0.1,
            norm_first=False,
            activation=F.gelu,
            rotary_emb_dim=0,
            num_layers=7,
        )
        self.bind = nn.Sequential(
            nn.Linear(embed_dim * 4, 1),  # multiply by 4 because we concatenate 4 outputs
        )

    def forward(self, batch):
        combined_output = []
        for key in ['buildingblock1_smiles_token_id', 'buildingblock2_smiles_token_id', 'buildingblock3_smiles_token_id', 'molecule_smiles_token_id']:
            smiles_token_id = batch[key].long()
            smiles_token_mask = batch[key.replace('_id', '_mask')].long()
            B, L = smiles_token_id.shape
            x = self.embedding(smiles_token_id)
            x = x.permute(0, 2, 1).float()
            x = self.conv_embedding(x)
            x = x.permute(0, 2, 1).contiguous()
            x = self.pe(x)
            z = self.tx_encoder(
                x=x,
                src_key_padding_mask=smiles_token_mask == 0,
            )
            m = smiles_token_mask.unsqueeze(-1).float()
            pool = (z * m).sum(1) / m.sum(1)
            combined_output.append(pool)
        
        combined_output = torch.cat(combined_output, dim=1)
        bind = self.bind(combined_output)
        output = {}
        if 'loss' in self.output_type:
            if 'bind' in batch:
                target = batch['bind'].unsqueeze(1)
                output['bce_loss'] = F.binary_cross_entropy_with_logits(bind.float(), target.float(), reduction='mean')
            else:
                output['bce_loss'] = torch.tensor(0.0, requires_grad=True)  # 예외 처리
        if 'infer' in self.output_type:
            output['bind'] = torch.sigmoid(bind)
        return output

# 학습 함수 수정
def train_model(train_loader, val_loader, model, criterion, optimizer, num_epochs=5):
    for epoch in range(num_epochs):
        running_loss = 0.0
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for batch in train_loader_tqdm:
            optimizer.zero_grad()
            outputs = model(batch)
            loss = criterion(outputs['bind'], batch['bind'].unsqueeze(1))
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            train_loader_tqdm.set_postfix(loss=running_loss / (train_loader_tqdm.n + 1))

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader_tqdm)}")

        # 검증
        model.eval()
        val_loss = 0.0
        val_loader_tqdm = tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{num_epochs}")
        with torch.no_grad():
            for batch in val_loader_tqdm:
                outputs = model(batch)
                loss = criterion(outputs['bind'], batch['bind'].unsqueeze(1))
                val_loss += loss.item()
                val_loader_tqdm.set_postfix(val_loss=val_loss / (val_loader_tqdm.n + 1))
                
        print(f"Validation Loss: {val_loss / len(val_loader_tqdm)}")
        model.train()
        
        # 가비지 컬렉션 수행
        gc.collect()

# 전체 학습 과정 함수화
def train_and_save_model(protein_type, data_filepath, model_save_dir, num_epochs=5, ratio=1/2, batch_size=200000):
    for batch_num in tqdm(range(50), desc="Batches"):
        print(f"Processing batch {batch_num + 1} / 50")
        
        # 데이터 샘플링
        sampled_df = load_and_sample_data_batch(data_filepath, protein_type, ratio, batch_size)
        
        print(f"Sampled {len(sampled_df)} data points for batch {batch_num + 1}")
        
        # 데이터셋 생성
        dataset = SMILESDataSet(sampled_df)
        
        # 데이터셋 나누기 (학습/검증)
        train_idx, val_idx = train_test_split(np.arange(len(dataset)), test_size=0.2, random_state=42)
        train_dataset = torch.utils.data.Subset(dataset, train_idx)
        val_dataset = torch.utils.data.Subset(dataset, val_idx)
        
        print(f"Train dataset size: {len(train_dataset)}, Validation dataset size: {len(val_dataset)}")
        
        # DataLoader 생성
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
        
        # 모델, 손실 함수, 최적화 알고리즘 정의
        model = Net()
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        
        # 모델 학습
        print(f"Training {protein_type} model on batch {batch_num + 1} ...")
        train_model(train_loader, val_loader, model, criterion, optimizer, num_epochs)
        
        # 모델 저장
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)
        
        model_save_path = os.path.join(model_save_dir, f'{protein_type}_model_batch{batch_num + 1}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.pth')
        torch.save(model.state_dict(), model_save_path)
        print(f"Model saved to {model_save_path}")
        
        # 가비지 컬렉션 수행
        del sampled_df, dataset, train_dataset, val_dataset, model, criterion, optimizer
        gc.collect()


In [2]:
# 함수 호출 예제
data_filepath = './train.parquet'
model_save_dir = './models'

# 사용 예시
train_and_save_model(
    protein_type='HSA',
    data_filepath = './train.parquet',
    model_save_dir = './models',
    num_epochs=5,
    ratio=1/2,
    batch_size=200000
)

# # BRD4 모델 학습 및 저장
# train_and_save_model('BRD4', data_filepath, model_save_dir)

# # HSA 모델 학습 및 저장
# train_and_save_model('HSA', data_filepath, model_save_dir)

# # sEH 모델 학습 및 저장
# train_and_save_model('sEH', data_filepath, model_save_dir)


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Processing batch 1 / 50
Loading data for protein type: HSA, batch size: 200000
Sampled 200000 data points for batch 1
Dataframe columns: Index(['id', 'buildingblock1_smiles', 'buildingblock2_smiles',
       'buildingblock3_smiles', 'molecule_smiles', 'protein_name', 'binds'],
      dtype='object')
Train dataset size: 160000, Validation dataset size: 40000
Training HSA model on batch 1 ...


Epoch 1/5:  67%|██████▋   | 3326/5000 [9:23:27<4:43:35, 10.16s/it, loss=0.703]
Batches:   0%|          | 0/50 [9:23:39<?, ?it/s]


KeyboardInterrupt: 

test코드 1DCNN 추가 - 실행버전

In [5]:
import duckdb
import pandas as pd
import numpy as np
import joblib
import cupy as cp
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Conv1D, GlobalMaxPooling1D, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# GPU 설정 확인 및 메모리 사용량 조절
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        print("TensorFlow GPU memory growth enabled")
    except RuntimeError as e:
        print(e)
else:
    print("TensorFlow GPU not available")

# 데이터베이스 연결
con = duckdb.connect(database=':memory:')

# 테스트 데이터 경로 설정
test_parquet_path = 'test_processed_data.parquet'

# 각 protein에 대한 최적의 조합을 설정
best_combinations = {
    'BRD4': [('elu', 'relu')],
    'HSA': [('relu', 'elu')],
    'sEH': [('elu', 'relu')]
}

# 사용할 모델 종류 지정
model_types = ['catboost', 'cnn']  # 여기서 원하는 모델 종류를 리스트로 선택하세요
model_types = ['cnn']  # 여기서 원하는 모델 종류를 리스트로 선택하세요

# 결과 저장을 위한 리스트
results = []

batch_size = 20000  # 적절한 배치 크기로 설정

# 메모리 정리 함수
def clear_memory():
    gc.collect()
    cp._default_memory_pool.free_all_blocks()
    print("Memory cleared.")

# 1D-CNN 모델 정의 함수
def OneDCNN_model(input_len, num_classes):
    hidden_dim = 128
    num_filters = 32

    inputs = tf.keras.layers.Input(shape=(input_len,))
    x = Embedding(input_dim=36, output_dim=hidden_dim, input_length=input_len, mask_zero=True)(inputs)
    x = Conv1D(filters=num_filters, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    x = Conv1D(filters=num_filters*2, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    x = Conv1D(filters=num_filters*3, kernel_size=3, activation='relu', padding='valid', strides=1)(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1024, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_classes, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(learning_rate=0.001)
    loss = 'binary_crossentropy'
    weighted_metrics = [tf.keras.metrics.AUC(curve='PR', name='avg_precision')]
    model.compile(
        loss=loss,
        optimizer=optimizer,
        weighted_metrics=weighted_metrics,
    )
    return model

for protein in best_combinations.keys():
    # 총 행 수 계산
    total_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{test_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]

    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch_df = con.execute(f"""
            SELECT * FROM read_parquet('{test_parquet_path}')
            WHERE protein_name = '{protein}'
            LIMIT {batch_size} OFFSET {start}
        """).df()

        # 테스트 데이터 전처리
        X_test = np.concatenate([
            np.array(batch_df['molecule_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
        ], axis=1)

        for model_type in model_types:
            for activation_1, activation_2 in best_combinations[protein]:
                if model_type == 'xgboost':
                    model = XGBClassifier()
                    model_filename = f"{protein}_xgboost_model.pkl"
                elif model_type == 'randomforest':
                    model = RandomForestClassifier()
                    model_filename = f"{protein}_rf_model_iteration_1.pkl"
                elif model_type == 'catboost':
                    model = CatBoostClassifier()
                    model_filename = f"{protein}_catboost_model_iteration_1.pkl"
                elif model_type == 'lstm':
                    input_dim = 1024 * 4  # ECFP 길이에 맞게 조정
                    hidden_dim = 128
                    output_dim = 1

                    model = Sequential([
                        Input(shape=(None, input_dim)),
                        LSTM(hidden_dim * 8, return_sequences=True, activation=activation_1, dropout=0.05),
                        LSTM(hidden_dim * 4, return_sequences=True, activation=activation_2, dropout=0.05),
                        LSTM(hidden_dim * 1, return_sequences=False, activation=activation_1, dropout=0.05),
                        Dense(output_dim, activation='sigmoid')
                    ])
                    model_filename = f"{protein}_lstm_model_{activation_1}_{activation_2}_{activation_1}_iteration_1.weights.h5"
                elif model_type == 'cnn':
                    input_len = 1024 * 4
                    num_classes = 1
                    model = OneDCNN_model(input_len, num_classes)
                    #model_filename = f"{protein}_cnn_model_iteration_1.h5"
                    model_filename = f"{protein} - CNN.h5"

                # 모델 로드
                if model_type in ['lstm', 'cnn']:
                    model.load_weights(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein} with {activation_1} and {activation_2}")
                else:
                    model = joblib.load(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein}")

                # 예측 수행
                if model_type == 'lstm':
                    X_test_lstm = np.expand_dims(X_test, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
                    y_pred = model.predict(X_test_lstm)
                elif model_type == 'randomforest':
                    y_pred = model.predict_proba(X_test)[:, 1]
                elif model_type == 'xgboost':
                    X_test_gpu = cp.array(X_test)
                    y_pred = model.predict_proba(X_test_gpu)[:, 1]
                    y_pred = cp.asnumpy(y_pred)
                elif model_type == 'catboost':
                    y_pred = model.predict_proba(X_test)[:, 1]
                elif model_type == 'cnn':
                    y_pred = model.predict(X_test)

                # 결과 저장
                test_results = pd.DataFrame({
                    'id': batch_df['id'],
                    'binds': y_pred.flatten()
                })
                results.append(test_results)

                # 메모리 정리
                clear_memory()

# 모든 결과를 하나의 DataFrame으로 결합
final_results_df = pd.concat(results, ignore_index=True)

# 결과 저장
final_results_df.to_csv('test_results.csv', index=False)
print("Test results saved to 'test_results.csv'")

con.close()


TensorFlow GPU memory growth enabled
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loaded CNN model for BRD4 with elu and relu
Memory cleared.
Loa

PCA 시도

In [3]:
import gc
import duckdb
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib
import os

# Initialize the IncrementalPCA and StandardScaler
n_components = 142
ipca = IncrementalPCA(n_components=n_components, batch_size=1000)
scaler = StandardScaler()

# Connect to DuckDB and read the data in chunks
con = duckdb.connect()

proteins = ["HSA", "sEH"]# "BRD4", 

for protein in proteins:
    # Determine total number of rows for progress tracking
    total_rows = con.execute(f"SELECT COUNT(*) FROM 'train_enc_{protein}.parquet'").fetchone()[0]

    # First pass: fit the scaler and IPCA
    chunksize = 2097152
    offset = 0
    
    ipca_model_path = f'ipca_model_{protein}.pkl'
    scaler_model_path = f'scaler_model_{protein}.pkl'

    if not os.path.exists(ipca_model_path) and not os.path.exists(scaler_model_path):
        print("Fitting the scaler and IncrementalPCA...")
        for offset in tqdm(range(0, total_rows, chunksize)):
            # Read a chunk of data
            query = f"SELECT * FROM 'train_enc_BRD4.parquet' LIMIT {chunksize} OFFSET {offset}"
            chunk = con.execute(query).fetchdf()
            if chunk.empty:
                break
            
            X_chunk = chunk.drop(columns=['bind'])
            scaler.partial_fit(X_chunk)
            
            X_scaled = scaler.transform(X_chunk)
            ipca.partial_fit(X_scaled)
            
            # Collect garbage to free memory
            del chunk, X_chunk, X_scaled
            gc.collect()
        joblib.dump(ipca, f'ipca_model_{protein}.pkl')
        joblib.dump(scaler, f'scaler_model_{protein}.pkl')
    else:
        # Load the saved models
        ipca = joblib.load(f'ipca_model_{protein}.pkl')
        scaler = joblib.load(f'scaler_model_{protein}.pkl')

    # Directory to save chunk results
    output_dir = f'pca_chunks_{protein}'
    os.makedirs(output_dir, exist_ok=True)

    chunk_files = []

    # Second pass: transform the data
    print("Transforming the data using IncrementalPCA...")
    for offset in tqdm(range(0, total_rows, chunksize)):
        # Read a chunk of data
        query = f"SELECT * FROM 'train_enc_{protein}.parquet' LIMIT {chunksize} OFFSET {offset}"
        chunk = con.execute(query).fetchdf()
        if chunk.empty:
            break
        
        X_chunk = chunk.drop(columns=['bind'])
        y_chunk = chunk['bind']
        
        X_scaled = scaler.transform(X_chunk)
        pcs = ipca.transform(X_scaled)
        
        pca_columns = [f'PC{i}' for i in range(n_components)]
        pca_chunk_df = pd.DataFrame(data=pcs, columns=pca_columns)
        pca_chunk_df['bind'] = y_chunk.values
        
        chunk_file = os.path.join(output_dir, f'pca_chunk_{protein}_{offset}.csv')
        pca_chunk_df.to_csv(chunk_file, index=False)
        chunk_files.append(chunk_file)
        
        # Collect garbage to free memory
        del chunk, X_chunk, y_chunk, X_scaled, pcs, pca_chunk_df
        gc.collect()

    # Merge all chunk files into one DataFrame
    print("Merging all chunks into a single DataFrame...")
    # merged_df = pd.concat([pd.read_csv(file) for file in chunk_files])

        # 병합을 포기함 용량이너무 많음

    # # 파일을 처음 생성할 때 컬럼 헤더를 포함하여 작성
    # is_first_chunk = True
    # merged_output_file = f'transformed_pca_data_{protein}.csv'

    # # 병합할 청크 수 (메모리 상황에 맞게 조절)
    # merge_batch_size = 4

    # # 임시 저장 파일 리스트
    # temp_files = []

    # # tqdm을 사용하여 진행률 표시
    # for i in tqdm(range(0, len(chunk_files), merge_batch_size), desc="Merging chunks in batches"):
    #     batch_files = chunk_files[i:i + merge_batch_size]
    #     batch_df = pd.concat((pd.read_csv(os.path.join(f'pca_chunk_{protein}', file)) for file in batch_files))
    #     temp_file = f'temp_merged_{i // merge_batch_size}.csv'
    #     batch_df.to_csv(temp_file, index=False)
    #     temp_files.append(temp_file)
    #     del batch_df  # 데이터프레임 삭제
    #     gc.collect()  # 가비지 컬렉터 호출

    # # 최종 병합
    # is_first_chunk = True
    # for temp_file in tqdm(temp_files, desc="Final merging"):
    #     temp_df = pd.read_csv(temp_file)
    #     temp_df.to_csv(merged_output_file, mode='a', index=False, header=is_first_chunk)
    #     is_first_chunk = False
    #     del temp_df  # 데이터프레임 삭제
    #     gc.collect()  # 가비지 컬렉터 호출

    # # 임시 파일 삭제
    # for temp_file in temp_files:
    #     os.remove(temp_file)

    # print(f"All PCA results saved to {merged_output_file}")

    # merged_df = pd.read_csv(merged_output_file)
    # # Calculate the correlation between principal components and bind
    # correlation = merged_df.corr()['bind'][:-1]

    # # Plot the correlation of each principal component with bind
    # plt.figure(figsize=(12, 6))
    # plt.bar(correlation.index, correlation.values)
    # plt.xlabel('Principal Components')
    # plt.ylabel('Correlation with bind')
    # plt.title('Correlation of Principal Components with bind')
    # plt.xticks(rotation=90)
    # plt.show()

    # # Display the correlations
    # correlation_df = correlation.reset_index()
    # correlation_df.columns = ['Principal Component', 'Correlation with bind']

    # # Sort by the absolute value of the correlation
    # correlation_df = correlation_df.sort_values(by='Correlation with bind', key=abs, ascending=False)

    # # Output the top 10 principal components most correlated with bind
    # print("Top 10 principal components most correlated with bind:")
    # print(correlation_df.head(10))

    # # Output the top 10 principal components least correlated with bind
    # print("Top 10 principal components least correlated with bind:")
    # print(correlation_df.tail(10))

Fitting the scaler and IncrementalPCA...


100%|██████████| 47/47 [35:00<00:00, 44.68s/it]


Transforming the data using IncrementalPCA...


100%|██████████| 47/47 [3:01:53<00:00, 232.20s/it]  


Merging all chunks into a single DataFrame...
Fitting the scaler and IncrementalPCA...


100%|██████████| 47/47 [35:04<00:00, 44.77s/it]


Transforming the data using IncrementalPCA...


100%|██████████| 47/47 [2:54:22<00:00, 222.61s/it]  

Merging all chunks into a single DataFrame...





로딩 스코어 저장

In [6]:
import gc
import duckdb
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from tqdm import tqdm
import joblib
import os

# IncrementalPCA와 StandardScaler 초기화
n_components = 142

# DuckDB 연결
con = duckdb.connect()

proteins = ["BRD4", "HSA", "sEH"]

# 청크 크기 설정
chunksize = 100000

for protein in proteins:
    # 결과 파일 초기화
    loadings_output_file = f'loadings_values_{protein}.csv'
    scores_output_file = f'scores_values_{protein}.csv'
    if os.path.exists(loadings_output_file):
        os.remove(loadings_output_file)
    if os.path.exists(scores_output_file):
        os.remove(scores_output_file)

    # 저장된 모델 로드
    ipca_model_path = f'ipca_model_{protein}.pkl'
    scaler_model_path = f'scaler_model_{protein}.pkl'

    ipca = joblib.load(ipca_model_path)
    scaler = joblib.load(scaler_model_path)

    # PCA 결과 청크 파일들이 있는 폴더 경로
    pca_chunks_folder = f'pca_chunks_{protein}'

    # 주성분 로딩(loading) 값을 저장
    loadings = pd.DataFrame(ipca.components_.T, columns=[f'PC{i}' for i in range(n_components)])
    loadings['feature'] = [f'feature_{i}' for i in range(loadings.shape[0])]
    loadings = loadings.set_index('feature')
    
    if os.path.exists(loadings_output_file):
        loadings.to_csv(loadings_output_file, mode='a', header=False)
    else:
        loadings.to_csv(loadings_output_file, mode='w', header=True)

    print(f"로딩 값이 {loadings_output_file}에 저장되었습니다.")

    offset = 0
    while True:
        # 해당 단백질의 테스트 데이터 청크 단위로 선택
        query = f"""
        SELECT * FROM 'test_enc.parquet' 
        WHERE protein_name = '{protein}'
        LIMIT {chunksize} OFFSET {offset}
        """
        protein_test_df = con.execute(query).fetchdf()

        if protein_test_df.empty:
            break

        # 테스트 데이터에 동일한 변환 적용
        X_test = protein_test_df.drop(columns=['protein_name', 'id'])

        X_test_scaled = scaler.transform(X_test)
        X_test_pcs = ipca.transform(X_test_scaled)

        # 테스트 데이터프레임에 PCA 결과 추가
        pca_columns = [f'PC{i}' for i in range(n_components)]
        test_pca_df = pd.DataFrame(data=X_test_pcs, columns=pca_columns)

        # Scores 값을 저장
        scores = test_pca_df.copy()
        scores['id'] = protein_test_df['id']
        
        if os.path.exists(scores_output_file):
            scores.to_csv(scores_output_file, mode='a', header=False, index=False)
        else:
            scores.to_csv(scores_output_file, mode='w', header=True, index=False)

        offset += chunksize
        gc.collect()

    print(f"Scores 값이 {scores_output_file}에 저장되었습니다.")

print(f"모든 단백질에 대한 로딩 값과 Scores 값이 각각 {loadings_output_file}와 {scores_output_file}에 저장되었습니다.")


로딩 값이 loadings_values_BRD4.csv에 저장되었습니다.
Scores 값이 scores_values_BRD4.csv에 저장되었습니다.
로딩 값이 loadings_values_HSA.csv에 저장되었습니다.
Scores 값이 scores_values_HSA.csv에 저장되었습니다.
로딩 값이 loadings_values_sEH.csv에 저장되었습니다.
Scores 값이 scores_values_sEH.csv에 저장되었습니다.
모든 단백질에 대한 로딩 값과 Scores 값이 각각 loadings_values_sEH.csv와 scores_values_sEH.csv에 저장되었습니다.


테스트에 적용

In [1]:
import gc
import duckdb
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from tqdm import tqdm
import joblib
import os

# IncrementalPCA와 StandardScaler 초기화
n_components = 142

# DuckDB 연결
con = duckdb.connect()

proteins = ["BRD4", "HSA", "sEH"]

# 청크 크기 설정
chunksize = 100000

# 결과 파일 초기화
output_file = 'uncorrelated_test_zero_data.csv'
if os.path.exists(output_file):
    os.remove(output_file)

for protein in proteins:

    # correlation 저장 파일 초기화
    correlation_file = F'correlation_values_{protein}.csv'
    if os.path.exists(correlation_file):
        os.remove(correlation_file)
    # 저장된 모델 로드
    ipca_model_path = f'ipca_model_{protein}.pkl'
    scaler_model_path = f'scaler_model_{protein}.pkl'

    ipca = joblib.load(ipca_model_path)
    scaler = joblib.load(scaler_model_path)

    # PCA 결과 청크 파일들이 있는 폴더 경로
    pca_chunks_folder = f'pca_chunks_{protein}'

    # 상관관계가 낮은 주성분 찾기
    correlation = pd.Series(dtype=float)
    chunk_files = [f for f in os.listdir(pca_chunks_folder) if f.endswith('.csv')]
    for chunk_file in tqdm(chunk_files, desc=f'Processing PCA chunks for {protein}'):
        chunk_df = pd.read_csv(os.path.join(pca_chunks_folder, chunk_file))
        chunk_corr = chunk_df.corr()['bind'][:-1]
        correlation = pd.concat([correlation, chunk_corr])

    correlation = correlation.groupby(correlation.index).mean()

    # correlation을 파일에 저장
    correlation_df = pd.DataFrame({
        'protein': [protein] * len(correlation),
        'pc': correlation.index,
        'correlation': correlation.values
    })
    
    if os.path.exists(correlation_file):
        correlation_df.to_csv(correlation_file, mode='a', header=False, index=False)
    else:
        correlation_df.to_csv(correlation_file, mode='w', header=True, index=False)

    # bind와 상관관계가 낮은 주성분 찾기
    threshold = 0.001  # 상관관계 임계값 (절대값 기준)
    uncorrelated_pcs = correlation[correlation.abs() < threshold].sort_values().index.tolist()

    print(f"bind 값과 상관관계가 낮은 주성분 ({protein}):")
    print(uncorrelated_pcs)

    offset = 0
    first_example_found = False
    while True:
        # 해당 단백질의 테스트 데이터 청크 단위로 선택
        query = f"""
        SELECT * FROM 'test_enc.parquet' 
        WHERE protein_name = '{protein}'
        LIMIT {chunksize} OFFSET {offset}
        """
        protein_test_df = con.execute(query).fetchdf()

        if protein_test_df.empty:
            break

        # 테스트 데이터에 동일한 변환 적용
        X_test = protein_test_df.drop(columns=['protein_name', 'id'])

        X_test_scaled = scaler.transform(X_test)
        X_test_pcs = ipca.transform(X_test_scaled)

        # 테스트 데이터프레임에 PCA 결과 추가
        pca_columns = [f'PC{i}' for i in range(n_components)]
        test_pca_df = pd.DataFrame(data=X_test_pcs, columns=pca_columns)

        # 상관관계가 낮은 주성분을 사용하여 데이터 탐지
        uncorrelated_test_data = test_pca_df[uncorrelated_pcs]

        # 각 모델에서 상관관계가 낮은 PC 값을 기반으로 0으로 판정된 행을 찾기
        zero_rows = uncorrelated_test_data[(uncorrelated_test_data.abs() < threshold).all(axis=1)]

        if not zero_rows.empty:
            if not first_example_found:
                first_example_id = protein_test_df.loc[zero_rows.index[0], 'id']
                print(f"첫 번째 0으로 판정된 예시 ({protein}): id = {first_example_id}")
                first_example_found = True

            # 해당 행을 저장
            zero_ids = protein_test_df.loc[zero_rows.index, 'id']
            zero_df = pd.DataFrame({'id': zero_ids, 'binds': 0})
            zero_df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

        offset += chunksize
        gc.collect()

print(f"테스트 데이터에서 모든 모델에서 0으로 판정된 데이터가 {output_file}에 저장되었습니다.")


Processing PCA chunks for BRD4: 100%|██████████| 47/47 [1:11:03<00:00, 90.71s/it]


bind 값과 상관관계가 낮은 주성분 (BRD4):
['PC4', 'PC6', 'PC53', 'PC72', 'PC67', 'PC13', 'PC64', 'PC54', 'PC65', 'PC140', 'PC88', 'PC43', 'PC80', 'PC108', 'PC113', 'PC66', 'PC96', 'PC45', 'PC5', 'PC107', 'PC78', 'PC109', 'PC112', 'PC62', 'PC79', 'PC131', 'PC122', 'PC124', 'PC138', 'PC121', 'PC134', 'PC128', 'PC85', 'PC127', 'PC99', 'PC102', 'PC92', 'PC117', 'PC75', 'PC120', 'PC136', 'PC129', 'PC135', 'PC132', 'PC110', 'PC83', 'PC130', 'PC139', 'PC73', 'PC93', 'PC126', 'PC82', 'PC116', 'PC118', 'PC103', 'PC104', 'PC89', 'PC111', 'PC137', 'PC94', 'PC61', 'PC133', 'PC98', 'PC105', 'PC123', 'PC63', 'PC90', 'PC91', 'PC34', 'PC77', 'PC86', 'PC125', 'PC50', 'PC95', 'PC101', 'PC3', 'PC30', 'PC41', 'PC24', 'PC81', 'PC23', 'PC38', 'PC115', 'PC71', 'PC12', 'PC70', 'PC9', 'PC16', 'PC44', 'PC74']


Processing PCA chunks for HSA: 100%|██████████| 47/47 [1:12:32<00:00, 92.61s/it]


bind 값과 상관관계가 낮은 주성분 (HSA):
['PC75', 'PC59', 'PC14', 'PC73', 'PC74', 'PC105', 'PC30', 'PC22', 'PC92', 'PC91', 'PC78', 'PC117', 'PC99', 'PC102', 'PC101', 'PC86', 'PC120', 'PC90', 'PC97', 'PC93', 'PC94', 'PC89', 'PC124', 'PC113', 'PC111', 'PC110', 'PC121', 'PC126', 'PC80', 'PC139', 'PC107', 'PC137', 'PC132', 'PC82', 'PC103', 'PC138', 'PC123', 'PC79', 'PC135', 'PC85', 'PC125', 'PC127', 'PC136', 'PC98', 'PC133', 'PC128', 'PC7', 'PC108', 'PC134', 'PC104', 'PC129', 'PC83', 'PC131', 'PC130', 'PC81', 'PC122', 'PC112', 'PC109', 'PC116', 'PC115', 'PC88', 'PC49', 'PC8', 'PC95', 'PC118', 'PC52', 'PC21', 'PC45', 'PC87', 'PC48', 'PC55']


Processing PCA chunks for sEH: 100%|██████████| 47/47 [1:12:49<00:00, 92.96s/it]


bind 값과 상관관계가 낮은 주성분 (sEH):
['PC64', 'PC12', 'PC61', 'PC22', 'PC39', 'PC21', 'PC85', 'PC140', 'PC43', 'PC84', 'PC7', 'PC87', 'PC133', 'PC78', 'PC139', 'PC89', 'PC97', 'PC111', 'PC127', 'PC136', 'PC98', 'PC92', 'PC108', 'PC123', 'PC135', 'PC88', 'PC77', 'PC81', 'PC102', 'PC138', 'PC110', 'PC132', 'PC103', 'PC106', 'PC129', 'PC82', 'PC112', 'PC131', 'PC117', 'PC126', 'PC71', 'PC105', 'PC109', 'PC124', 'PC122', 'PC137', 'PC115', 'PC128', 'PC130', 'PC121', 'PC8', 'PC66', 'PC90', 'PC125', 'PC80', 'PC120', 'PC34', 'PC116', 'PC96', 'PC91', 'PC101', 'PC107', 'PC134', 'PC94', 'PC14', 'PC58', 'PC55', 'PC25', 'PC45', 'PC118', 'PC57', 'PC53', 'PC24', 'PC38']
테스트 데이터에서 모든 모델에서 0으로 판정된 데이터가 uncorrelated_test_zero_data.csv에 저장되었습니다.


이제 임계값만 조정해서 적용

In [14]:
import gc
import duckdb
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import joblib
import os
import numpy as np

# DuckDB 연결
con = duckdb.connect()

proteins = ["BRD4", "HSA", "sEH"]

# 청크 크기 설정
chunksize = 100000

# 결과 파일 초기화
output_file = 'logistic_regression_predictions.csv'
if os.path.exists(output_file):
    os.remove(output_file)

for protein in proteins:
    # 저장된 모델 로드
    scaler_model_path = f'scaler_model_{protein}.pkl'
    scaler = joblib.load(scaler_model_path)

    # correlation 파일 불러오기
    correlation_file = f'correlation_values_{protein}.csv'
    correlation_df = pd.read_csv(correlation_file)
    correlation_df = correlation_df.set_index('pc')
    correlation = correlation_df['correlation']

    # 상관관계가 낮은 주성분 찾기
    threshold = 0.00001  # 상관관계 임계값 (절대값 기준)
    uncorrelated_pcs = correlation[correlation.abs() < threshold].sort_values().index.tolist()

    if not uncorrelated_pcs:
        print(f"{protein}: No uncorrelated principal components found with threshold {threshold}.")
        continue

    print(f"bind 값과 상관관계가 낮은 주성분 ({protein}):")
    print(uncorrelated_pcs)

    # Logistic Regression 모델 초기화
    logistic_model = LogisticRegression()
    loadings_file = f'loadings_values_{protein}.csv'
    scores_file = f'scores_values_{protein}.csv'

    loadings_df = pd.read_csv(loadings_file)
    scores_df = pd.read_csv(scores_file)

    # 학습 데이터 로드 및 전처리 (청크 단위)
    offset = 0
    train_query = f"SELECT COUNT(*) FROM 'train_enc_{protein}.parquet'"
    total_train_rows = con.execute(train_query).fetchone()[0]

    with tqdm(total=total_train_rows, desc=f"Loading and processing training data for {protein}") as pbar:
        while offset < total_train_rows:
            train_query = f"""
            SELECT * FROM 'train_enc_{protein}.parquet'
            LIMIT {chunksize} OFFSET {offset}
            """
            train_df = con.execute(train_query).fetchdf()

            X_train_chunk = train_df.drop(columns=['bind'])
            y_train_chunk = train_df['bind']

            # 열 수가 맞지 않을 경우 처리
            if X_train_chunk.shape[1] != loadings_df.shape[0]:
                raise ValueError(f"Mismatch in number of features: {X_train_chunk.shape[1]} in X_train_chunk, {loadings_df.shape[0]} in loadings_df")

            X_train_scaled_chunk = scaler.transform(X_train_chunk)
            if isinstance(X_train_scaled_chunk, list):
                X_train_scaled_chunk = np.array(X_train_scaled_chunk, dtype=float)  # Ensure X_train_scaled_chunk is a numpy array
            else:
                X_train_scaled_chunk = X_train_scaled_chunk.astype(float)

            X_train_pcs_chunk = X_train_scaled_chunk.dot(loadings_df.iloc[:, :-1].values)
            train_pca_df_chunk = pd.DataFrame(data=X_train_pcs_chunk, columns=[f'PC{i}' for i in range(X_train_pcs_chunk.shape[1])])

            if offset == 0:
                X_train_uncorrelated = train_pca_df_chunk[uncorrelated_pcs]
                y_train = y_train_chunk
            else:
                X_train_uncorrelated = pd.concat([X_train_uncorrelated, train_pca_df_chunk[uncorrelated_pcs]], ignore_index=True)
                y_train = pd.concat([y_train, y_train_chunk], ignore_index=True)

            offset += chunksize
            pbar.update(len(train_df))
            gc.collect()

    # Logistic Regression 모델 학습
    logistic_model.fit(X_train_uncorrelated, y_train)
    print(f"{protein} Logistic Regression 모델 학습 완료.")

    # 테스트 데이터 처리 및 예측
    offset = 0
    total_rows = con.execute(f"SELECT COUNT(*) FROM 'test_enc.parquet' WHERE protein_name = '{protein}'").fetchone()[0]
    with tqdm(total=total_rows, desc=f"Processing {protein}") as pbar:
        while True:
            # 해당 단백질의 테스트 데이터 청크 단위로 선택
            query = f"""
            SELECT * FROM 'test_enc.parquet' 
            WHERE protein_name = '{protein}'
            LIMIT {chunksize} OFFSET {offset}
            """
            protein_test_df = con.execute(query).fetchdf()

            if protein_test_df.empty:
                break

            # 테스트 데이터에 동일한 변환 적용
            X_test = protein_test_df.drop(columns=['protein_name', 'id'])
            X_test_scaled = scaler.transform(X_test)
            if isinstance(X_test_scaled, list):
                X_test_scaled = np.array(X_test_scaled, dtype=float)  # Ensure X_test_scaled is a numpy array
            else:
                X_test_scaled = X_test_scaled.astype(float)

            # 열 수가 맞지 않을 경우 처리
            if X_test.shape[1] != loadings_df.shape[0]:
                raise ValueError(f"Mismatch in number of features: {X_test.shape[1]} in X_test, {loadings_df.shape[0]} in loadings_df")

            # 로딩 값을 이용하여 주성분 점수 계산
            X_test_pcs = X_test_scaled.dot(loadings_df.iloc[:, :-1].values)
            test_pca_df = pd.DataFrame(data=X_test_pcs, columns=[f'PC{i}' for i in range(X_test_pcs.shape[1])])

            # 상관관계 낮은 주성분 데이터 선택
            X_test_uncorrelated = test_pca_df[uncorrelated_pcs]

            # 예측 수행
            predictions = logistic_model.predict(X_test_uncorrelated)

            # 예측 결과 저장
            result_df = pd.DataFrame({
                'id': protein_test_df['id'],
                'protein_name': protein,
                'predicted_bind': predictions
            })
            result_df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

            offset += chunksize
            pbar.update(len(protein_test_df))
            gc.collect()

print(f"테스트 데이터에 대한 예측 결과가 {output_file}에 저장되었습니다.")


bind 값과 상관관계가 낮은 주성분 (BRD4):
['PC92', 'PC117', 'PC75', 'PC120', 'PC136', 'PC129']


Loading and processing training data for BRD4:   0%|          | 0/98415610 [00:19<?, ?it/s]


TypeError: can't multiply sequence by non-int of type 'float'

Train에 GNNs 적용(그래프 네트워크 모델) 취소
다른 방향으로 전처리 진행


In [None]:
import pandas as pd

input_parquet_path = './train.parquet'
protein = 'BRD4'

# 데이터 쿼리 실행
df = con.execute(f"""
(SELECT * FROM parquet_scan('{input_parquet_path}') WHERE binds = 1 ORDER BY RANDOM() LIMIT 10)
UNION All
(SELECT * FROM parquet_scan('{input_parquet_path}') WHERE binds = 0 ORDER BY RANDOM() LIMIT 10)
"""
).fetchdf()

# 데이터프레임 셔플
df = df.sample(frac=1).reset_index(drop=True)

df


학습&테스트 새 기준 randomforest & catboost

In [42]:
import duckdb
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import joblib
from catboost import CatBoostClassifier
from tqdm import tqdm

def smiles_to_features(smiles_list):
    features = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            features.extend([Descriptors.MolWt(mol), Descriptors.NumRotatableBonds(mol), Descriptors.TPSA(mol)])
        else:
            features.extend([0, 0, 0])  # If invalid SMILES, append zeros
    return features

def get_model(model_type):
    if model_type == 'random_forest':
        return RandomForestClassifier(n_estimators=100, random_state=42)
    elif model_type == 'catboost':
        return CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=10, random_seed=42, verbose=, task_type='GPU')
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

def process_protein(input_parquet_path, protein, model_path, model_type):
    # 데이터 쿼리 실행
    df = con.execute(f"""
    (SELECT * FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}' AND binds = 1 ORDER BY RANDOM() LIMIT 40000)
    UNION ALL
    (SELECT * FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}' AND binds = 0 ORDER BY RANDOM() LIMIT 40000)
    """).fetchdf()

    # 데이터프레임 셔플
    df = df.sample(frac=1).reset_index(drop=True)

    # SMILES 및 레이블 데이터를 준비합니다.
    smiles_data = df[['molecule_smiles', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles']].values  # 여러 SMILES 문자열 리스트
    labels = df['binds']  # 합성 가능성 레이블 (0 또는 1)

    # 피처 추출
    features = [smiles_to_features(smiles) for smiles in tqdm(smiles_data, desc=f"Extracting features for {protein}")]

    # 데이터 분할
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # 모델 선택 및 학습
    model = get_model(model_type)
    model.fit(X_train, y_train)

    # 모델 저장
    joblib.dump(model, model_path)
    print(f"Model for {protein} with {model_type} saved to {model_path}")

    # 평가
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Results for {protein} with {model_type}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print()

def test_proteins(test_parquet_path, proteins, output_csv_path, model_type):
    # 전체 예측 결과를 저장할 리스트
    all_results = []

    # 모델을 불러오고 예측을 수행
    for protein in tqdm(proteins, desc="Testing proteins"):
        model_path = f'{model_type}_model_{protein}.pkl'

        # 데이터 쿼리 실행
        df = con.execute(f"""
        SELECT * FROM parquet_scan('{test_parquet_path}') WHERE protein_name = '{protein}'
        """).fetchdf()

        # SMILES 데이터를 준비합니다.
        smiles_data = df[['molecule_smiles', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles']].values  # 여러 SMILES 문자열 리스트
        ids = df['id']  # id 컬럼

        # 피처 추출
        features = [smiles_to_features(smiles) for smiles in tqdm(smiles_data, desc=f"Extracting features for {protein} (test)")]

        # 모델 불러오기
        model = joblib.load(model_path)

        # 예측 확률
        y_pred_proba = model.predict_proba(features)[:, 1]

        # 결과를 데이터프레임으로 구성
        results_df = pd.DataFrame({'id': ids, 'binds': y_pred_proba})
        all_results.append(results_df)

    # 모든 결과를 하나의 데이터프레임으로 합치기
    final_results_df = pd.concat(all_results, ignore_index=True)

    # CSV 파일로 저장
    final_results_df.to_csv(output_csv_path, index=False)
    print(f"All test predictions saved to {output_csv_path}")

# 데이터베이스 연결 열기
con = duckdb.connect()

input_parquet_path = './train.parquet'
test_parquet_path = './test.parquet'
proteins = ['BRD4', 'HSA', 'sEH']
output_csv_path = 'test_predictions_combined.csv'
model_type = 'catboost'  # 'random_forest' 또는 'catboost' 중 하나

# 각 단백질에 대해 모델을 학습하고 저장
for protein in tqdm(proteins, desc="Processing proteins"):
    model_path = f'{model_type}_model_{protein}.pkl'
    process_protein(input_parquet_path, protein, model_path, model_type)

# 테스트 파일에 대해 예측하고 결과를 합쳐서 저장
test_proteins(test_parquet_path, proteins, output_csv_path, model_type)


All test predictions saved to test_predictions_combined.csv


뎁스별 cat boost 검토

In [51]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from tqdm import tqdm

def get_model_with_depth(depth, task_type='GPU'):
    return CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=depth, random_seed=42, verbose=0, task_type=task_type)
def process_protein_with_depth(input_parquet_path, protein, model_path, depth):
    # 데이터 쿼리 실행
    df = con.execute(f"""
    (SELECT * FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}' AND binds = 1 ORDER BY RANDOM() LIMIT 40000)
    UNION ALL
    (SELECT * FROM parquet_scan('{input_parquet_path}') WHERE protein_name = '{protein}' AND binds = 0 ORDER BY RANDOM() LIMIT 160000)
    """).fetchdf()
    print("데이터 쿼리 완료")
    # 데이터프레임 셔플
    df = df.sample(frac=1).reset_index(drop=True)

    # SMILES 및 레이블 데이터를 준비합니다.
    smiles_data = df[['molecule_smiles', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles']].values
    labels = df['binds']
    print("smile 데이터 준비 완료")
    # 피처 추출
    features = [smiles_to_features(smiles) for smiles in tqdm(smiles_data, desc=f"Extracting features for {protein}")]
    
    # 데이터 분할
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
    print("피처 추출 및 데이터 분할 완료")
    # 모델 선택 및 학습
    model = get_model_with_depth(depth)
    model.fit(X_train, y_train)

    # 모델 저장
    joblib.dump(model, model_path)
    print(f"Model for {protein} with depth {depth} saved to {model_path}")

    # 평가
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Results for {protein} with depth {depth}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

# 테스트할 depth 값들
depth_values = [10]

for depth in depth_values:
    print(f"Testing CatBoost with depth: {depth}")
    for protein in tqdm(proteins, desc="Processing proteins"):
        model_path = f'catboost_model_{protein}_depth{depth}.pkl'
        process_protein_with_depth(input_parquet_path, protein, model_path, depth)

# 테스트 파일에 대해 예측하고 결과를 합쳐서 저장
test_proteins(test_parquet_path, proteins, output_csv_path, 'catboost')


Testing CatBoost with depth: 10


RuntimeError: Query interrupted

In [3]:
import duckdb
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import gc
from tqdm import tqdm
import random
con = duckdb.connect()
train_path = './train.parquet'
#train_path = './train_enc_sEH.parquet'
#train_path = './train_enc_HSA.parquet'

# 데이터 쿼리 실행
df = con.execute(f"""
(SELECT * FROM parquet_scan('{train_path}') WHERE binds = 1 LIMIT 10000)
""").fetchdf()
df

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,466,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Cc1cc2cc(CN)ccc2[nH]1,C#CCOc1ccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(N[C...,HSA,1
1,467,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Cc1cc2cc(CN)ccc2[nH]1,C#CCOc1ccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(N[C...,sEH,1
2,683,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Cl.NCC12CC3CC(CC(C3)C1)C2,C#CCOc1ccc(CNc2nc(NCC34CC5CC(CC(C5)C3)C4)nc(N[...,sEH,1
3,1321,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Nc1n[nH]c2ncccc12,C#CCOc1ccc(CNc2nc(Nc3n[nH]c4ncccc34)nc(N[C@@H]...,HSA,1
4,2141,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1cccc(CN)c1.Cl,Cc1cc2cc(CN)ccc2[nH]1,C#CCOc1cccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(N[...,sEH,1
...,...,...,...,...,...,...,...
9995,5065194,C#CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,N#Cc1cc(N)ccc1F,NCc1cc(F)cc(F)c1,C#CC[C@H](Nc1nc(NCc2cc(F)cc(F)c2)nc(Nc2ccc(F)c...,BRD4,1
9996,5066638,C#CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,N#Cc1ccc(N)c([N+](=O)[O-])c1,Cc1cc2cc(CN)ccc2[nH]1,C#CC[C@H](Nc1nc(NCc2ccc3[nH]c(C)cc3c2)nc(Nc2cc...,HSA,1
9997,5066935,C#CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,N#Cc1ccc(N)c([N+](=O)[O-])c1,Cl.Cl.NCc1cncc(F)c1,C#CC[C@H](Nc1nc(NCc2cncc(F)c2)nc(Nc2ccc(C#N)cc...,HSA,1
9998,5066973,C#CC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,N#Cc1ccc(N)c([N+](=O)[O-])c1,Cl.Cn1cc(N)ccc1=O,C#CC[C@H](Nc1nc(Nc2ccc(=O)n(C)c2)nc(Nc2ccc(C#N...,BRD4,1


최종 Test 진행

자꾸램이터져서 test 코드 수정

In [4]:
import duckdb
import pandas as pd
import numpy as np
import joblib
import cupy as cp
import gc
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# GPU 설정 확인 및 메모리 사용량 조절
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        print("TensorFlow GPU memory growth enabled")
    except RuntimeError as e:
        print(e)
else:
    print("TensorFlow GPU not available")

# 데이터베이스 연결
con = duckdb.connect(database=':memory:')

# 테스트 데이터 경로 설정
test_parquet_path = 'test_processed_data.parquet'

# 각 protein에 대한 최적의 조합을 설정
best_combinations = {
    'BRD4': [('elu', 'relu')],
    'HSA': [('relu', 'elu')],
    'sEH': [('elu', 'relu')]
}

# 사용할 모델 종류 지정
model_types = ['lstm']  # 여기서 원하는 모델 종류를 리스트로 선택하세요
# model_types = ['xgboost']
# model_types = ['randomforest']

# 결과 저장을 위한 리스트
results = []

batch_size = 20000  # 적절한 배치 크기로 설정

# 메모리 정리 함수
def clear_memory():
    gc.collect()
    cp._default_memory_pool.free_all_blocks()
    print("Memory cleared.")

for protein in best_combinations.keys():
    # 총 행 수 계산
    total_rows = con.execute(f"SELECT COUNT(*) FROM read_parquet('{test_parquet_path}') WHERE protein_name = '{protein}'").fetchone()[0]

    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch_df = con.execute(f"""
            SELECT * FROM read_parquet('{test_parquet_path}')
            WHERE protein_name = '{protein}'
            LIMIT {batch_size} OFFSET {start}
        """).df()

        # 테스트 데이터 전처리
        X_test = np.concatenate([
            np.array(batch_df['molecule_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock1_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock2_ecfp'].tolist(), dtype=np.float32),
            np.array(batch_df['buildingblock3_ecfp'].tolist(), dtype=np.float32)
        ], axis=1)

        for model_type in model_types:
            for activation_1, activation_2 in best_combinations[protein]:
                if model_type == 'xgboost':
                    model = XGBClassifier()
                    model_filename = f"{protein}_xgb_model_iteration_1.pkl"
                elif model_type == 'randomforest':
                    model = RandomForestClassifier()
                    model_filename = f"{protein}_rf_model_iteration_1.pkl"
                elif model_type == 'lstm':
                    input_dim = 1024 * 4  # ECFP 길이에 맞게 조정
                    hidden_dim = 128
                    output_dim = 1

                    model = Sequential([
                        Input(shape=(None, input_dim)),
                        LSTM(hidden_dim * 8, return_sequences=True, activation=activation_1, dropout=0.05),
                        LSTM(hidden_dim * 4, return_sequences=True, activation=activation_2, dropout=0.05),
                        LSTM(hidden_dim * 1, return_sequences=False, activation=activation_1, dropout=0.05),
                        Dense(output_dim, activation='sigmoid')
                    ])
                    model_filename = f"{protein}_lstm_model_{activation_1}_{activation_2}_{activation_1}_iteration_1.weights.h5"

                # 모델 로드
                if model_type == 'lstm':
                    model.load_weights(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein} with {activation_1} and {activation_2}")
                else:
                    model = joblib.load(model_filename)
                    print(f"Loaded {model_type.upper()} model for {protein}")

                # 예측 수행
                if model_type == 'lstm':
                    X_test_lstm = np.expand_dims(X_test, axis=1)  # LSTM이 기대하는 3D 입력으로 변환
                    y_pred = model.predict(X_test_lstm)
                elif model_type == 'randomforest':
                    y_pred = model.predict_proba(X_test)[:, 1]
                elif model_type == 'xgboost':
                    X_test_gpu = cp.array(X_test)
                    y_pred = model.predict_proba(X_test_gpu)[:, 1]
                    y_pred = cp.asnumpy(y_pred)

                # 결과 저장
                test_results = pd.DataFrame({
                    'id': batch_df['id'],
                    'binds': y_pred.flatten()
                })
                results.append(test_results)

                # 메모리 정리
                clear_memory()

# 모든 결과를 하나의 DataFrame으로 결합
final_results_df = pd.concat(results, ignore_index=True)

# 결과 저장
final_results_df.to_csv('test_results.csv', index=False)
print("Test results saved to 'test_results.csv'")

con.close()


TensorFlow GPU memory growth enabled
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Memory cleared.
Loaded LSTM model for BRD4 with elu and relu
Mem

GNNs 테스트 모델

추출 병합 ecfp적용에서
추출 ecfp 정용 병합으로 방향을 전환(속도 개선)

bind = 0을 랜덤 추출

In [None]:
import duckdb
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import gc
import os

def query_and_save_chunks(train_path, output_prefix, total_records=1589906, chunk_size=100000):
    # DuckDB 연결
    con = duckdb.connect()
    
    remaining_records = total_records
    offset = 0
    chunk_id = 13
    
    while remaining_records > 0:
        fetch_size = min(chunk_size, remaining_records)
        
        # 데이터 쿼리
        df = con.execute(f"""
        SELECT * FROM parquet_scan('{train_path}') WHERE binds = 0 ORDER BY RANDOM() LIMIT {fetch_size} OFFSET {offset}
        """).fetchdf()
        
        if df.empty:
            break
        
        # 청크 파일로 저장
        chunk_path = f"{output_prefix}_chunk_{chunk_id}.parquet"
        table = pa.Table.from_pandas(df)
        pq.write_table(table, chunk_path)
        print(f"Data saved to {chunk_path}, fetched: {fetch_size}, remaining: {remaining_records}")
        
        # 메모리 해제
        del df, table
        gc.collect()
        
        offset += fetch_size
        remaining_records -= fetch_size
        chunk_id += 1

# 파일 경로 및 설정
train_path = './train.parquet'
output_prefix = './0_queried_data'
final_output_path = './final_queried_data.parquet'
total_records = 1589906
chunk_size = 100000

# 데이터 쿼리 및 청크 저장 실행
query_and_save_chunks(train_path, output_prefix, total_records, chunk_size)

# 청크 파일 병합 실행
# total_chunks = (total_records + chunk_size - 1) // chunk_size
# merge_chunks(output_prefix, final_output_path, total_chunks)

# print("Data querying and merging completed.")


추출된 파일들을 병합

In [7]:
import duckdb
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import gc
import os

def merge_chunks(output_prefix, final_output_path, total_chunks):
    schema = None
    unique_data = pd.DataFrame()
    
    for chunk_id in range(total_chunks):
        chunk_path = f"{output_prefix}_chunk_{chunk_id}.parquet"
        table = pq.read_table(chunk_path)
        df = table.to_pandas()
        
        # 중복 제거 및 데이터 합치기
        unique_data = pd.concat([unique_data, df]).drop_duplicates().reset_index(drop=True)
        
        # 메모리 해제 및 청크 파일 삭제
        del table, df
        os.remove(chunk_path)
        gc.collect()
        print(f"Chunk {chunk_id} processed and merged.")
    
    # 최종 파일로 저장
    final_table = pa.Table.from_pandas(unique_data)
    pq.write_table(final_table, final_output_path)
    print(f"Final data saved to {final_output_path}")
    
    # 메모리 해제
    del unique_data, final_table
    gc.collect()

# 파일 경로 및 설정
train_path = './train.parquet'
output_prefix = './0_queried_data'
final_output_path = './final_queried_data3.parquet'
total_records = 1600000#1589906
chunk_size = 100000

# 청크 파일 병합 실행
total_chunks = (total_records + chunk_size - 1) // chunk_size
merge_chunks(output_prefix, final_output_path, total_chunks)

print("Data querying and merging completed.")


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001AAA383B490>>
Traceback (most recent call last):
  File "c:\Users\gksru\anaconda3\envs\belka39\lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Chunk 0 processed and merged.
Chunk 1 processed and merged.
Chunk 2 processed and merged.
Chunk 3 processed and merged.
Chunk 4 processed and merged.
Chunk 5 processed and merged.
Chunk 6 processed and merged.
Chunk 7 processed and merged.
Chunk 8 processed and merged.
Chunk 9 processed and merged.
Chunk 10 processed and merged.
Chunk 11 processed and merged.
Chunk 12 processed and merged.
Chunk 13 processed and merged.
Chunk 14 processed and merged.
Chunk 15 processed and merged.
Final data saved to ./final_queried_data3.parquet
Data querying and merging completed.


추출/병합된 파일을 ecfp로 변환

In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import os
import gc
from tqdm import tqdm

# ECFP 생성 함수
def generate_ecfp(smiles, radius=2, bits=1024):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return [0] * bits
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

def process_batch(df):
    # 각 SMILES 열에 대해 ECFP 생성
    df['molecule_ecfp'] = df['molecule_smiles'].apply(generate_ecfp)
    df['buildingblock1_ecfp'] = df['buildingblock1_smiles'].apply(generate_ecfp)
    df['buildingblock2_ecfp'] = df['buildingblock2_smiles'].apply(generate_ecfp)
    df['buildingblock3_ecfp'] = df['buildingblock3_smiles'].apply(generate_ecfp)
    
    # 필요한 열만 포함된 DataFrame 반환
    return df[['id', 'protein_name', 'molecule_ecfp', 'buildingblock1_ecfp', 'buildingblock2_ecfp', 'buildingblock3_ecfp', 'binds']]

def preprocess_and_save_ecfp(input_path, output_path, batch_size=32768):
    reader = pq.ParquetFile(input_path)
    
    # 적절한 스키마로 Parquet writer 초기화
    schema = pa.schema([
        ('id', pa.int32()),
        ('molecule_ecfp', pa.list_(pa.int32())),
        ('buildingblock1_ecfp', pa.list_(pa.int32())),
        ('buildingblock2_ecfp', pa.list_(pa.int32())),
        ('buildingblock3_ecfp', pa.list_(pa.int32())),
        ('protein_name', pa.string()),
        ('binds', pa.int32())  # test할 때는 제외
    ])
    
    with pq.ParquetWriter(output_path, schema) as writer:
        total_batches = reader.metadata.num_row_groups
        
        with tqdm(total=total_batches, desc="Processing", unit="batch", leave=True) as pbar:
            for batch in reader.iter_batches(batch_size=batch_size):
                df_batch = batch.to_pandas()
                
                processed_batch = process_batch(df_batch)
                
                # 처리된 DataFrame을 Arrow Table로 변환하여 파일에 작성
                table = pa.Table.from_pandas(processed_batch, schema=schema)
                writer.write_table(table)
                
                # 진행 상황 업데이트
                pbar.update(1)

                # 주기적으로 가비지 컬렉션 호출
                gc.collect()

# 파일 경로 설정
#input_parquet_path = './merged_shuffled_data.parquet'
input_parquet_path = './final_queried_data3.parquet'
output_parquet_path = './ecfp_queried_data.parquet'
# 테스트 파일 경로
# input_parquet_path = './test.parquet'
# output_parquet_path = './test_processed_data.parquet'

# 데이터 전처리 및 저장 실행
preprocess_and_save_ecfp(input_parquet_path, output_parquet_path)

print("ECFP generation and saving completed.")


Processing: 49batch [45:51, 56.15s/batch]                   


ECFP generation and saving completed.


: 

In [None]:
원본 파일과 병합 후 순서를 다 섞기 -> 램이 부족한지 작동하지 않음

In [1]:
# import pyarrow.parquet as pq
# import pyarrow as pa
# import pandas as pd
# import gc

# def merge_and_shuffle_parquets(input_path1, input_path2, output_path):
#     # 첫 번째 Parquet 파일 읽기
#     table1 = pq.read_table(input_path1)
#     df1 = table1.to_pandas()
    
#     # 두 번째 Parquet 파일 읽기
#     table2 = pq.read_table(input_path2)
#     df2 = table2.to_pandas()
    
#     # 데이터 프레임 병합
#     combined_df = pd.concat([df1, df2], ignore_index=True)
    
#     # 데이터 프레임 셔플
#     shuffled_df = combined_df.sample(frac=1).reset_index(drop=True)
    
#     # 결과를 Parquet 파일로 저장
#     shuffled_table = pa.Table.from_pandas(shuffled_df)
#     pq.write_table(shuffled_table, output_path)
    
#     print(f"Data from {input_path1} and {input_path2} merged and shuffled, saved to {output_path}")
    
#     # 메모리 해제
#     del df1, df2, combined_df, shuffled_df, table1, table2, shuffled_table
#     gc.collect()

# # 파일 경로 설정
# input_path1 = 'processed_merged_queried_data.parquet'
# input_path2 = './ecfp_queried_data.parquet'#'./queried_data.parquet'
# output_path = 'processed_merged_queried_data.parquet'

# # 데이터 병합 및 셔플 실행
# merge_and_shuffle_parquets(input_path1, input_path2, output_path)

# print("Data merging and shuffling completed.")


In [2]:
import heapq
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np
import gc  # Garbage Collection 모듈 임포트

def external_sort_and_merge_with_shuffle(input_path1, input_path2, output_path, batch_size=200000):
    # 파일 읽기
    reader1 = pq.ParquetFile(input_path1)
    reader2 = pq.ParquetFile(input_path2)
    
    # 최소 힙 초기화
    min_heap = []
    schema = reader1.schema.to_arrow_schema()
    
    # 파일에서 배치 읽기 및 힙에 삽입
    def process_batches(reader):
        for batch in reader.iter_batches(batch_size=batch_size):
            dict_data = batch.to_pydict()
            # 각 열 리스트를 행으로 조합하여 힙에 삽입
            for row in zip(*dict_data.values()):
                heapq.heappush(min_heap, row)
            gc.collect()  # 배치 처리 후 메모리 정리

    process_batches(reader1)
    process_batches(reader2)
    
    # 모든 데이터를 리스트로 변환
    all_data = []
    while min_heap:
        all_data.append(heapq.heappop(min_heap))

    # 데이터 셔플
    np.random.shuffle(all_data)
    
    # 셔플된 데이터를 새 파일에 저장
    with pq.ParquetWriter(output_path, schema) as writer:
        for data in all_data:
            # 행 데이터를 Arrow Table로 변환
            table = pa.Table.from_pydict(dict(zip(schema.names, [[value] for value in data])), schema=schema)
            writer.write_table(table)
            gc.collect()  # 데이터 쓰기 후 메모리 정리

# 파일 경로 설정
input_path1 = 'processed_merged_queried_data.parquet'
input_path2 = './ecfp_queried_data.parquet'#'./queried_data.parquet'
output_path = 'processed_merged_queried_data1.parquet'

# 외부 정렬 실행
external_sort_and_merge_with_shuffle(input_path1, input_path2, output_path)


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000026776EFB490>>
Traceback (most recent call last):
  File "c:\Users\gksru\anaconda3\envs\belka39\lib\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
KeyboardInterrupt: 


잘 섞였는지 검토

In [2]:
import duckdb
con = duckdb.connect()

df = con.query(f"""
SELECT * FROM parquet_scan('./processed_merged_queried_data.parquet') LIMIT 1000       
""").df()
df


모든 컬럼의 데이터가 동일한 중복 항을 제거

In [None]:
import duckdb
con = duckdb.connect()

# 동일한 모든 컬럼 데이터를 가진 행을 제거하는 쿼리
query = """
WITH UniqueRows AS (
    SELECT *,
    ROW_NUMBER() OVER (PARTITION BY * ORDER BY *) as rn
    FROM parquet_scan('./processed_merged_queried_data.parquet')
)
SELECT * FROM UniqueRows
WHERE rn = 1
"""

# 제거된 데이터를 DataFrame으로 로드
df = con.execute(query).df()

# 결과 DataFrame을 파켓 파일로 저장
output_path = './processed_merged_queried_data.parquet'
df.to_parquet(output_path, index=False)

print("Cleaned data has been saved to:", output_path)



몇대 몇인지비율 검토

In [None]:
import duckdb
con = duckdb.connect()

# 전체 데이터 중에서 binds=1 및 binds=0의 갯수를 계산
query = """
SELECT 
    SUM(CASE WHEN binds = 1 THEN 1 ELSE 0 END) AS binds_1_count,
    SUM(CASE WHEN binds = 0 THEN 1 ELSE 0 END) AS binds_0_count,
    COUNT(*) AS total_count
FROM parquet_scan('./processed_merged_queried_data.parquet')
"""

df = con.execute(query).df()

# 전체 행의 수를 사용하여 비율 계산
df['ratio_binds_1'] = df['binds_1_count'] / df['total_count']
df['ratio_binds_0'] = df['binds_0_count'] / df['total_count']

df


Score 점수를 기준으로 재구성

In [5]:
pip install natsort
score_df = score_df[['filename', 'score']]

Collecting natsort
  Downloading natsort-8.4.0-py3-none-any.whl.metadata (21 kB)
Downloading natsort-8.4.0-py3-none-any.whl (38 kB)
Installing collected packages: natsort
Successfully installed natsort-8.4.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from tqdm import tqdm  # tqdm 라이브러리 추가
from natsort import natsorted  # natsort 라이브러리 추가

# 55개의 제출 파일이 있는 디렉토리
directory = './submissions'

# score 데이터를 읽어오기
score_df = pd.read_csv('score.csv')  # score.csv 파일 경로 수정 필요
score_dict = dict(zip(score_df['filename'], score_df['score']))  # 파일명과 score를 딕셔너리로 변환

# 랜덤 포레스트 모델 초기화
model = RandomForestRegressor(n_estimators=100, random_state=42)

# 데이터를 배치로 처리하여 모델 학습
batch_size = 250000  # 한 번에 처리할 데이터의 크기
submission_files = natsorted(os.listdir(directory))  # 자연스럽게 파일명을 정렬

print(submission_files)
print("Starting model training...")

for filename in tqdm(submission_files, desc="Processing submission files"):
    if filename.endswith(".csv"):  # 파일 형식에 맞게 수정 필요
        file_path = os.path.join(directory, filename)
        score = score_dict[filename]  # 현재 파일의 score 값 가져오기
        
        # 파일을 배치로 나누어 처리
        for chunk in pd.read_csv(file_path, chunksize=batch_size):
            X_temp = chunk[['binds']]
            y_temp = np.full(X_temp.shape[0], score)  # score 값을 반복하여 배열 생성
            
            # 모델 학습
            model.fit(X_temp, y_temp)

print("Model training completed.")

# 최적의 binds 값 예측
print("Predicting optimal binds value...")
X_new = pd.DataFrame({'binds': np.linspace(0, 1, 1000)})  # 0에서 1 사이의 1000개의 값을 가진 데이터프레임 생성
y_new_pred = model.predict(X_new)  # 모델을 사용하여 각 binds 값에 대한 예측 수행

# 예측 결과 시각화
plt.plot(X_new['binds'], y_new_pred, color='red')
plt.xlabel('Binds')
plt.ylabel('Predicted Score')
plt.title('Predicted Score vs Binds')
plt.show()

# 최적의 binds 값
optimal_binds = X_new.iloc[np.argmax(y_new_pred)]['binds']
print(f'Optimal binds value: {optimal_binds}')


['submission (0).csv', 'submission (1).csv', 'submission (2).csv', 'submission (3).csv', 'submission (4).csv', 'submission (5).csv', 'submission (6).csv', 'submission (7).csv', 'submission (8).csv', 'submission (9).csv', 'submission (10).csv', 'submission (11).csv', 'submission (12).csv', 'submission (13).csv', 'submission (14).csv', 'submission (15).csv', 'submission (16).csv', 'submission (17).csv', 'submission (18).csv', 'submission (19).csv', 'submission (20).csv', 'submission (21).csv', 'submission (22).csv', 'submission (23).csv', 'submission (24).csv', 'submission (25).csv', 'submission (26).csv', 'submission (27).csv', 'submission (28).csv', 'submission (29).csv', 'submission (30).csv', 'submission (31).csv', 'submission (32).csv', 'submission (33).csv', 'submission (34).csv', 'submission (35).csv', 'submission (36).csv', 'submission (37).csv', 'submission (38).csv', 'submission (39).csv', 'submission (40).csv', 'submission (41).csv', 'submission (42).csv', 'submission (43).csv

Processing submission files:   2%|▏         | 1/55 [01:08<1:02:02, 68.93s/it]


KeyboardInterrupt: 

In [10]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from tqdm import tqdm  # tqdm 라이브러리 추가
from natsort import natsorted  # natsort 라이브러리 추가

# 55개의 제출 파일이 있는 디렉토리
directory = './submissions'

# score 데이터를 읽어오기
score_df = pd.read_csv('score.csv')  # score.csv 파일 경로 수정 필요
score_dict = dict(zip(score_df['filename'], score_df['score']))  # 파일명과 score를 딕셔너리로 변환

# 랜덤 포레스트 모델 초기화
model = RandomForestRegressor(n_estimators=100, random_state=42)

# 데이터를 처리하여 모델 학습
submission_files = natsorted(os.listdir(directory))  # 자연스럽게 파일명을 정렬

print(submission_files)
print("Starting model training...")

for filename in tqdm(submission_files, desc="Processing submission files"):
    if filename.endswith(".csv"):  # 파일 형식에 맞게 수정 필요
        file_path = os.path.join(directory, filename)
        score = score_dict[filename]  # 현재 파일의 score 값 가져오기
        
        # 파일 전체를 읽어서 처리
        df = pd.read_csv(file_path)
        X_temp = df[['binds']]
        y_temp = np.full(X_temp.shape[0], score)  # score 값을 반복하여 배열 생성
        
        # 모델 학습
        model.fit(X_temp, y_temp)

print("Model training completed.")

# 최적의 binds 값 예측
print("Predicting optimal binds value...")
X_new = pd.DataFrame({'binds': np.linspace(0, 1, 1000)})  # 0에서 1 사이의 1000개의 값을 가진 데이터프레임 생성
y_new_pred = model.predict(X_new)  # 모델을 사용하여 각 binds 값에 대한 예측 수행

# 예측 결과 시각화
plt.plot(X_new['binds'], y_new_pred, color='red')
plt.xlabel('Binds')
plt.ylabel('Predicted Score')
plt.title('Predicted Score vs Binds')
plt.show()

# 최적의 binds 값
optimal_binds = X_new.iloc[np.argmax(y_new_pred)]['binds']
print(f'Optimal binds value: {optimal_binds}')


['submission (0).csv', 'submission (1).csv', 'submission (2).csv', 'submission (3).csv', 'submission (4).csv', 'submission (5).csv', 'submission (6).csv', 'submission (7).csv', 'submission (8).csv', 'submission (9).csv', 'submission (10).csv', 'submission (11).csv', 'submission (12).csv', 'submission (13).csv', 'submission (14).csv', 'submission (15).csv', 'submission (16).csv', 'submission (17).csv', 'submission (18).csv', 'submission (19).csv', 'submission (20).csv', 'submission (21).csv', 'submission (22).csv', 'submission (23).csv', 'submission (24).csv', 'submission (25).csv', 'submission (26).csv', 'submission (27).csv', 'submission (28).csv', 'submission (29).csv', 'submission (30).csv', 'submission (31).csv', 'submission (32).csv', 'submission (33).csv', 'submission (34).csv', 'submission (35).csv', 'submission (36).csv', 'submission (37).csv', 'submission (38).csv', 'submission (39).csv', 'submission (40).csv', 'submission (41).csv', 'submission (42).csv', 'submission (43).csv

Processing submission files:   2%|▏         | 1/55 [01:46<1:35:53, 106.54s/it]


KeyboardInterrupt: 

submission 합치기 -> 최적 검토

In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# combined_submissions.csv 파일을 읽어오기
combined_df = pd.read_csv('combined_submissions.csv')

# score 데이터를 읽어오기
score_df = pd.read_csv('score.csv')  # score.csv 파일 경로 수정 필요
score_dict = dict(zip(score_df['filename'], score_df['score']))  # 파일명과 score를 딕셔너리로 변환

# 컬럼명 생성 (binds_0, binds_1, ..., binds_54)
binds_columns = [col for col in combined_df.columns if col.startswith('binds_')]

# 각 row의 최적의 binds 값을 예측하기 위한 데이터프레임 생성
optimal_binds = []

print("Starting row-wise prediction...")

for idx, row in combined_df.iterrows():
    row_data = []
    for col in binds_columns:
        score = score_dict[col.replace('binds_', 'submission (') + ').csv']
        row_data.append((row[col], score))
    
    # 각 row의 binds 값과 score를 기반으로 최적의 binds 값을 예측
    optimal_binds_value = sum(bind_value * score for bind_value, score in row_data) / sum(score for _, score in row_data)
    optimal_binds.append(optimal_binds_value)

# 예측된 최적의 binds 값을 combined_df에 추가
combined_df['optimal_binds'] = optimal_binds

print("Row-wise prediction completed.")
print(combined_df[['id', 'optimal_binds']].head())  # 예측 결과의 상위 5개 행 출력

save_df = combined_df[['id', 'optimal_binds']]
# 예측된 결과를 저장 (선택 사항)
combined_df.to_csv('predicted_optimal_binds.csv', index=False)


Starting row-wise prediction...
Row-wise prediction completed.
          id  optimal_binds
0  295246830       0.109665
1  295246831       0.121072
2  295246832       0.122045
3  295246833       0.135941
4  295246834       0.164062


In [18]:
save_df.head()

Unnamed: 0,id,optimal_binds
0,295246830,0.109665
1,295246831,0.121072
2,295246832,0.122045
3,295246833,0.135941
4,295246834,0.164062


In [16]:
save_df = combined_df[['id', 'optimal_binds']]
# 예측된 결과를 저장 (선택 사항)
save_df.to_csv('predicted_optimal_binds3.csv', index=False)


베이지안 방법론

In [None]:
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
from tqdm import tqdm

# combined_submissions.csv 파일을 읽어오기
combined_df = pd.read_csv('combined_submissions.csv')

# score 데이터를 읽어오기
score_df = pd.read_csv('score.csv')  # score.csv 파일 경로 수정 필요
score_dict = dict(zip(score_df['filename'], score_df['score']))  # 파일명과 score를 딕셔너리로 변환

# 컬럼명 생성 (binds_0, binds_1, ..., binds_54)
binds_columns = [col for col in combined_df.columns if col.startswith('binds_')]

# 각 row의 최적의 binds 값을 예측하기 위한 데이터프레임 생성
optimal_binds = []

print("Starting row-wise prediction with Bayesian Optimization...")

for idx, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    def objective_function(**kwargs):
        """
        kwargs는 binds_0, binds_1, ..., binds_54에 해당하는 값으로 이루어져 있습니다.
        이를 사용하여 가중 평균을 계산하고 그 값을 반환합니다.
        """
        return np.sum([row[col] * score_dict[col.replace('binds_', 'submission (') + ').csv'] for col in kwargs.keys()])
    
    # 베이지안 최적화를 위한 파라미터 범위 설정
    pbounds = {col: (0, 1) for col in binds_columns}
    
    # 베이지안 최적화 수행
    optimizer = BayesianOptimization(
        f=objective_function,
        pbounds=pbounds,
        random_state=42,
    )
    
    # 초기 포인트와 최대 탐색 포인트 설정
    optimizer.maximize(
        init_points=5,
        n_iter=10,
    )
    
    # 최적의 binds 값 추출
    optimal_binds_value = {k: v for k, v in optimizer.max['params'].items()}
    
    # 최적의 binds 값을 평균하여 최종 값을 도출
    optimal_binds.append(np.mean(list(optimal_binds_value.values())))

# 예측된 최적의 binds 값을 combined_df에 추가
combined_df['optimal_binds'] = optimal_binds

print("Row-wise prediction completed.")
print(combined_df[['id', 'optimal_binds']].head())  # 예측 결과의 상위 5개 행 출력

# 예측된 결과를 저장 (선택 사항)
combined_df.to_csv('predicted_optimal_binds2.csv', index=False)
