In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Load the raw ligand table and keep only the rows whose "buchwald-type"
# value is positive. The filter leaves df with a non-contiguous row index.
df = pd.read_csv("data/ligand-qsar/raw/alkylamine-ligand-modeling-unprocessed.tsv", sep="\t")
df = df[df["buchwald-type"] > 0]

# Define bins and labels. pd.cut closes intervals on the right by default,
# so yields <= 15 receive label 0 and yields > 15 receive label 1.
bins = [-np.inf, 15, np.inf]
labels = [0, 1]
transformer = preprocessing.FunctionTransformer(pd.cut, kw_args={"bins": bins, "labels": labels, "retbins": False})

# Standardize features: every column except the ligand name, the yield
# target, and the column used for the row filter above.
scaler = StandardScaler()
X = df.drop(columns=["ligand_1_name", "product_1_yield", "buchwald-type"])
# Re-attach the original row index to the scaled values. Without it the new
# frame would get a fresh 0..n-1 index and no longer line up label-for-label
# with df / y / y_bin, which still carry the filtered index.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Transform target: continuous yield -> binary class label (same index as df).
y = df["product_1_yield"]
y_bin = transformer.fit_transform(y)

In [2]:
import numpy as np


def calcDrop(res: pd.DataFrame) -> list[str]:
    """Calculate which columns to drop based on correlation matrix.

    Parameters
    ----------
    res : pd.DataFrame
        One row per correlated pair. Only three columns are read: ``"v1"``
        and ``"v2"`` (the two variables of the pair) and ``"drop"`` (the
        member of the pair provisionally marked for removal).

    Returns
    -------
    list[str]
        The variables to drop, without duplicates, ordered by their first
        appearance in ``res`` (rows scanned top to bottom). The order is
        therefore reproducible across interpreter runs instead of depending
        on set iteration order.
    """
    v1, v2, flagged = res["v1"], res["v2"], res["drop"]

    # Every variable that takes part in a pair, and every variable that was
    # provisionally marked for dropping in at least one pair.
    involved = set(v1) | set(v2)
    candidates = set(flagged)

    # Keep any variable that was never marked for dropping.
    keep = involved - candidates

    # Any variable sharing a row with a kept variable is dropped for certain.
    touches_keep = v1.isin(keep) | v2.isin(keep)
    decided = (set(v1[touches_keep]) | set(v2[touches_keep])) - keep

    # Candidates whose fate was not settled by the step above.
    undecided = candidates - decided

    # Look at pairs that involve an undecided candidate but contain no
    # already-decided drop; the flagged member of each such pair is dropped too.
    in_undecided = v1.isin(undecided) | v2.isin(undecided)
    in_decided = v1.isin(decided) | v2.isin(decided)
    extra = set(flagged[in_undecided & ~in_decided])

    selected = decided | extra

    # dict.fromkeys de-duplicates while preserving first-seen order, which
    # gives a deterministic ordering without requiring names to be sortable.
    first_seen = dict.fromkeys(res[["v1", "v2", "drop"]].to_numpy().ravel().tolist())
    return [name for name in first_seen if name in selected]


def corrX_new(df: pd.DataFrame, cut: float = 0.9) -> list[str]:
    """Calculate the correlation matrix and return the columns to drop.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table; pairwise correlations are computed with ``df.corr()``.
    cut : float, default 0.9
        A pair of columns is considered correlated when the absolute value
        of their correlation is strictly greater than this threshold.

    Returns
    -------
    list[str]
        Column names selected for removal by ``calcDrop``.
    """
    corr_mtx = df.corr().abs()
    # Mean absolute correlation of each column against all columns; used to
    # decide which member of a correlated pair is provisionally dropped.
    avg_corr = corr_mtx.mean(axis=1)
    names = corr_mtx.columns
    corr_values = corr_mtx.to_numpy()

    # Strict upper triangle so each unordered pair is visited once and the
    # diagonal (self-correlation) is skipped.
    upper = np.triu(np.ones(corr_values.shape, dtype=bool), k=1)
    with np.errstate(invalid="ignore"):
        # NaN correlations compare as False and are never selected.
        above_cut = corr_values > cut
    # argwhere walks the matrix row by row, i.e. the same pair order as a
    # nested "for row / for col > row" loop.
    hits = np.argwhere(upper & above_cut)

    res = []
    for row, col in hits:
        row_avg = avg_corr.iloc[row]
        col_avg = avg_corr.iloc[col]
        # Provisionally drop the member with the higher average correlation;
        # on a tie the second member (col) is the one marked.
        drop = names[row] if row_avg > col_avg else names[col]
        res.append([names[row], names[col], row_avg, col_avg, corr_values[row, col], drop])

    res = pd.DataFrame(res, columns=(["v1", "v2", "v1.target", "v2.target", "corr", "drop"]))

    return calcDrop(res)

In [5]:
run = True

if run:
    # Identify descriptors whose absolute pairwise correlation exceeds 0.90
    # and remove them from the feature matrix.
    drop_cols = corrX_new(X, cut=0.9)
    X = X.drop(columns=drop_cols)

    # Report the removed columns (alphabetically) followed by how many there are.
    print(sorted(drop_cols), len(drop_cols), sep="\n")

    # Write the filtered table with the same columns removed.
    df.drop(columns=drop_cols).to_csv(
        "data/ligand-qsar/alkylamine-ligand-modeling-full.tsv",
        sep="\t",
        index=False,
    )


['P_int_phosphine', 'buried_volume_3.5A', 'distance_Pd_N', 'distance_carb_OH_O', 'fukui_f_minus_carb_o', 'fukui_f_minus_ipso_carbon', 'fukui_f_plus_amine_nitrogen', 'fukui_f_plus_aryl_carbon', 'fukui_f_plus_avg_amine_proton', 'fukui_f_zero_metal', 'max_buried_volume_5.0A', 'max_buried_volume_ipso_3.5A', 'max_distance_carb_OH_O', 'max_fukui_f_minus_avg_amine_proton', 'max_fukui_f_minus_carb_o', 'max_fukui_f_minus_metal', 'max_fukui_f_plus_ipso_carbon', 'max_fukui_f_plus_metal', 'max_fukui_f_zero_avg_amine_proton', 'max_partial_charge_amine_carbon', 'max_partial_charge_amine_nitrogen', 'max_partial_charge_carbon', 'max_quadrant_buried_volume_ligand_max', 'max_quadrant_total_volume_ligand_range_max', 'min_buried_volume_5.0A', 'min_distance_carb_OH_O', 'min_fukui_f_minus_amine_nitrogen', 'min_fukui_f_minus_carb_o', 'min_fukui_f_plus_carb_o', 'min_fukui_f_zero_carb_oh', 'min_fukui_f_zero_ipso_carbon', 'min_global_nucleophilicity', 'min_partial_charge_carboxylic_oxygen', 'min_partial_charge_

In [7]:
import arfs.feature_selection.allrelevant as arfsgroot
import numpy as np
from lightgbm import LGBMClassifier

seed = 1
selected_features = []

# Class-weighted gradient-boosting classifier used as the base estimator.
model = LGBMClassifier(
    class_weight="balanced",
    random_state=seed,
    verbose=-1,
)

# Feature selection driven by SHAP importances of the estimator above.
feat_selector = arfsgroot.BoostAGroota(importance="shap", estimator=model)
feat_selector.fit(X, y_bin)

# Collect the selected feature names of this run.
selected_features += [feat_selector.selected_features_]

# feat_selector.plot_importance(n_feat_per_inch=3)
# plt.show()
print(selected_features)

BoostaGRoota round:   0%|          | 0/500 [00:00<?, ?it/s]

[array(['buried_volume_2.0A', 'P_int_ipso_carbon',
       'min_distance_avg_amine_N_H', 'min_partial_charge_phosphine',
       'max_buried_volume_3.5A', 'max_P_int_ligand',
       'max_sasa_ligand_area', 'max_sterimol_ligand_L',
       'max_partial_charge_phosphine', 'max_fukui_f_zero_ipso_carbon'],
      dtype=object)]
