In [None]:
import re
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import pickle as pkl
from tqdm import tqdm
from typing import List, Tuple

In [None]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pkl.load(handle)


# One pickled feature selection per fingerprint type, all stored next to the
# COMBINED-5F-VAR-0-1 model. Each object is later passed as the set of kept
# positions when the matching fingerprint column is split on commas.
# NOTE(review): pickle.load runs arbitrary code; only load files you trust.
filt_feats_ATOMPAIR = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_ATOMPAIR.pkl")
filt_feats_AVALON = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_AVALON.pkl")
filt_feats_RDK = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_RDK.pkl")
filt_feats_ECFP4 = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_ECFP4.pkl")
filt_feats_ECFP6 = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_ECFP6.pkl")
filt_feats_FCFP4 = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_FCFP4.pkl")
filt_feats_FCFP6 = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_FCFP6.pkl")
filt_feats_MACCS = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_MACCS.pkl")
filt_feats_TOPTOR = _read_pickle("../model/COMBINED-5F-VAR-0-1/filt_feats_TOPTOR.pkl")

In [None]:
# Read the Step 1 test set from parquet. Later cells use its fingerprint
# columns (ECFP4, ECFP6, ...) and its "RandomID" column.
test_data_path = "../data/Data/Step1_TestData_Target2035.parquet"
test_data = pd.read_parquet(test_data_path)
test_data

In [None]:
def get_df_from_feature_list_comp_indices(data, filt_feats_indices) -> pd.DataFrame:
    """Keep selected positions of comma-separated strings and tabulate them.

    Parameters
    ----------
    data
        Sized iterable of strings (for example a DataFrame column). Each
        string is split on ``,``.
    filt_feats_indices
        Iterable of integer positions to keep from every split string, in
        output-column order.

    Returns
    -------
    pd.DataFrame
        One row per input string and one column per kept position, with
        default integer row and column labels. The cells are the substrings
        produced by the split; no numeric conversion happens here. The frame
        has one column per requested position even when ``data`` is empty.

    Raises
    ------
    IndexError
        If a string has fewer comma-separated fields than a requested position.
    """
    # Materialise once so a one-shot iterable can be reused for every row and
    # so the number of output columns is known up front.
    indices = list(filt_feats_indices)

    processed_rows = []
    for row_str in tqdm(data, total=len(data)):
        parts = row_str.split(',')
        processed_rows.append([parts[i] for i in indices])

    # Explicit column labels keep the (0, n_features) shape for empty input,
    # so assigning feature names to `.columns` afterwards does not fail.
    return pd.DataFrame(processed_rows, columns=range(len(indices)))

In [None]:
# Apply each fingerprint's saved feature selection to the matching column of
# the test set. Pairs are (column in test_data, positions kept for it); the
# calls run in the order listed and the results are unpacked in that order.
(
    test_ECFP4,
    test_ECFP6,
    test_FCFP4,
    test_FCFP6,
    test_ATOMPAIR,
    test_AVALON,
    test_RDK,
    test_MACCS,
    test_TOPTOR,
) = (
    get_df_from_feature_list_comp_indices(test_data[fp_column], kept_positions)
    for fp_column, kept_positions in (
        ("ECFP4", filt_feats_ECFP4),
        ("ECFP6", filt_feats_ECFP6),
        ("FCFP4", filt_feats_FCFP4),
        ("FCFP6", filt_feats_FCFP6),
        ("ATOMPAIR", filt_feats_ATOMPAIR),
        ("AVALON", filt_feats_AVALON),
        ("RDK", filt_feats_RDK),
        ("MACCS", filt_feats_MACCS),
        ("TOPTOR", filt_feats_TOPTOR),
    )
)

In [None]:
# Write every per-fingerprint frame to its own CSV (row index omitted).
# This runs before the columns are relabelled, so the CSV headers are the
# frames' current column labels.
for feature_frame, csv_path in (
    (test_ECFP4, "../data/Data/STep1Test/test_ECFP4.csv"),
    (test_ECFP6, "../data/Data/STep1Test/test_ECFP6.csv"),
    (test_FCFP4, "../data/Data/STep1Test/test_FCFP4.csv"),
    (test_FCFP6, "../data/Data/STep1Test/test_FCFP6.csv"),
    (test_ATOMPAIR, "../data/Data/STep1Test/test_ATOMPAIR.csv"),
    (test_AVALON, "../data/Data/STep1Test/test_AVALON.csv"),
    (test_RDK, "../data/Data/STep1Test/test_RDK.csv"),
    (test_MACCS, "../data/Data/STep1Test/test_MACCS.csv"),
    (test_TOPTOR, "../data/Data/STep1Test/test_TOPTOR.csv"),
):
    feature_frame.to_csv(csv_path, index=False)

In [None]:
# Relabel each frame's columns as "<FINGERPRINT>_<selected feature>" so the
# labels stay unique once the frames are placed side by side.
for feature_frame, prefixed_labels in (
    (test_ATOMPAIR, [f"ATOMPAIR_{col}" for col in filt_feats_ATOMPAIR]),
    (test_AVALON, [f"AVALON_{col}" for col in filt_feats_AVALON]),
    (test_RDK, [f"RDK_{col}" for col in filt_feats_RDK]),
    (test_ECFP4, [f"ECFP4_{col}" for col in filt_feats_ECFP4]),
    (test_ECFP6, [f"ECFP6_{col}" for col in filt_feats_ECFP6]),
    (test_FCFP4, [f"FCFP4_{col}" for col in filt_feats_FCFP4]),
    (test_FCFP6, [f"FCFP6_{col}" for col in filt_feats_FCFP6]),
    (test_MACCS, [f"MACCS_{col}" for col in filt_feats_MACCS]),
    (test_TOPTOR, [f"TOPTOR_{col}" for col in filt_feats_TOPTOR]),
):
    feature_frame.columns = prefixed_labels

In [None]:
# Place the per-fingerprint frames side by side, following the order the
# original author listed for the train_* frames:
# ATOMPAIR, AVALON, RDK, ECFP4, ECFP6, FCFP4, FCFP6, MACCS, TOPTOR.
# NOTE(review): presumably this is the column order the scaler and the models
# were fitted on -- confirm against the training notebook.
feature_blocks = [
    test_ATOMPAIR,
    test_AVALON,
    test_RDK,
    test_ECFP4,
    test_ECFP6,
    test_FCFP4,
    test_FCFP6,
    test_MACCS,
    test_TOPTOR,
]
test_combined = pd.concat(feature_blocks, axis="columns")

In [None]:
# Display the combined feature frame as a visual check before scaling.
test_combined

In [None]:
# Load the scaler pickled next to the model and apply it to the combined
# test features.
# NOTE(review): pickle.load runs arbitrary code; only load files you trust.
with open("../model/COMBINED-5F-VAR-0-1/norm_scaler.pkl", "rb") as scaler_file:
    norm_scaler = pkl.load(scaler_file)

test_combined_scaled = norm_scaler.transform(test_combined)

# Re-wrap the transformed values so they carry test_combined's column labels.
test_combined_scaled_df = pd.DataFrame(
    data=test_combined_scaled,
    columns=test_combined.columns,
)

In [None]:
# Score the scaled test set with the pickled model of every fold.
model_folds = "../model/COMBINED-5F-VAR-0-1/folds/"
folds = ["fold_1", "fold_2", "fold_3", "fold_4", "fold_5"]


def _score_fold(fold):
    """Load one fold's pickled model and return its predictions as a frame.

    The frame has the columns "RandomID" (taken from ``test_data``),
    "prediction" (output of ``predict``) and "probability" (column 1 of
    ``predict_proba``).
    """
    model_path = f"{model_folds}{fold}/xgb_COMBINED_2000.pkl"
    with open(model_path, "rb") as model_file:
        fold_model = pkl.load(model_file)

    labels = fold_model.predict(test_combined_scaled_df)
    # Original author's note: model.classes_ -> array([0, 1]), so column 1 is
    # the probability of class 1 (Hit).
    hit_probability = fold_model.predict_proba(test_combined_scaled_df)[:, 1]

    return pd.DataFrame({
        "RandomID": test_data["RandomID"],
        "prediction": labels,
        "probability": hit_probability
    })


results = []
for fold in tqdm(folds, total=len(folds)):
    # `fold_results` stays bound to the most recent fold after the loop.
    fold_results = _score_fold(fold)
    results.append(fold_results)

In [None]:
# Pickle the list of per-fold result frames next to the notebook so the
# predictions can be reloaded without re-running the models.
with open("./test-results.pkl", "wb") as results_file:
    pkl.dump(results, results_file)

In [None]:
# After the prediction loop, `fold_results` still holds the frame built in its
# last iteration (fold_5); display it as a spot check.
fold_results

# Submission 1

Submitted the predictions from fold 5.

In [None]:
# Ensemble the folds: stack the per-fold frames, then take the mean of the
# predicted label and of the probability for every RandomID.
mean_per_column = {
    "prediction": "mean",
    "probability": "mean"
}
per_id_groups = pd.concat(results).groupby("RandomID")
final_predictions = per_id_groups.agg(mean_per_column).reset_index()

In [None]:
# Turn the averaged per-fold labels back into an integer 0/1 label by majority
# vote: 1 when at least half of the folds predicted 1.
# A plain `.astype(int)` truncates toward zero, so a mean of 0.8 (four of five
# folds voting 1) became 0 and only unanimous votes survived as 1.
# Re-running this cell is safe: values that are already 0/1 are unchanged.
final_predictions['prediction'] = (final_predictions['prediction'] >= 0.5).astype(int)
final_predictions

In [None]:
# Sel_200: 1 for rows whose RandomID is among the 200 rows with the highest
# averaged probability, 0 for every other row.
top_200_ids = final_predictions.nlargest(n=200, columns='probability')['RandomID']
is_top_200 = final_predictions['RandomID'].isin(top_200_ids)
final_predictions['Sel_200'] = 0
final_predictions.loc[is_top_200, 'Sel_200'] = 1
final_predictions

In [None]:
# Sel_500: 1 for rows whose RandomID is among the 500 rows with the highest
# averaged probability, 0 for every other row.
top_500_ids = final_predictions.nlargest(n=500, columns='probability')['RandomID']
is_top_500 = final_predictions['RandomID'].isin(top_500_ids)
final_predictions['Sel_500'] = 0
final_predictions.loc[is_top_500, 'Sel_500'] = 1
final_predictions

In [None]:
# The averaged "probability" column becomes "Score"; the frame, still
# including the "prediction" column, is then written to CSV without the index.
final_predictions = final_predictions.rename(columns={"probability": "Score"})
final_predictions.to_csv("../data/Data/STep1Test/final_predictions.csv", index=False)

In [None]:
# Build the submission view: drop the "prediction" label and arrange the
# remaining columns as RandomID, Sel_200, Sel_500, Score.
final_predictions_ = final_predictions.drop(columns=["prediction"])
submission_columns = ["RandomID", "Sel_200", "Sel_500", "Score"]
final_predictions_ordered = final_predictions_.loc[:, submission_columns]
final_predictions_ordered

In [None]:
# Natural sort key function to sort by RandomID
def natural_sort_key(s):
    """Build a sort key that orders embedded numbers by value, not by text.

    ``s`` is split into alternating non-digit and digit runs. Digit runs
    become ``int`` and all other runs are lower-cased, so for example
    ``"id9"`` sorts before ``"id10"``.

    Parameters
    ----------
    s : str
        The text to build a key for.

    Returns
    -------
    list
        Lower-cased text pieces and integers, in their original order.
    """
    # `\d` matches Unicode decimal digits, which is exactly what
    # `str.isdecimal` tests. `str.isdigit` also accepts characters such as "²"
    # that `\d` never splits out and `int()` cannot parse.
    return [int(text) if text.isdecimal() else text.lower() for text in re.split(r'(\d+)', s)]

def _natural_keys(id_column):
    """Map every RandomID in the column to its natural-sort key."""
    return id_column.map(natural_sort_key)


# Order the submission rows by RandomID so that numeric parts compare by value.
final_predictions_ordered_ = final_predictions_ordered.sort_values(
    "RandomID",
    key=_natural_keys,
)
final_predictions_ordered_

In [None]:
# Quick QC: list the rows whose averaged Score is above 0.5.
score_above_half = final_predictions_ordered_["Score"].gt(0.5)
final_predictions_ordered_.loc[score_above_half]

In [None]:
# Write the naturally sorted submission table (RandomID, Sel_200, Sel_500,
# Score) to the team's CSV file, without the row index.
submission_path = "../data/Data/STep1Test/TeamKutumLab-T035.csv"
final_predictions_ordered_.to_csv(submission_path, index=False)