## Feature selection

The following code removes highly correlated features from the dataset (pairs whose absolute correlation exceeds 0.95). ARFS (All Relevant Feature Selection) is then used to select the most important of the remaining features.

In [5]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Load the raw descriptor table and keep only the rows whose "buchwald-type"
# value is positive. The index is reset after filtering so that every object
# derived from `df` below (X, y, y_bin) carries the same 0..n-1 row labels.
df = pd.read_csv(
    "data/ligand-qsar/raw/alkylamine-ligand-modeling-unprocessed.tsv", sep="\t"
)
df = df[df["buchwald-type"] > 0].reset_index(drop=True)
# df = df[~df["ligand_1_name"].isin(["L-149", "L-150"])]

# Define bins and labels: yields <= 15 become class 0, yields > 15 become
# class 1 (pd.cut builds right-closed intervals by default).
bins = [-np.inf, 15, np.inf]
labels = [0, 1]
transformer = preprocessing.FunctionTransformer(
    pd.cut, kw_args={"bins": bins, "labels": labels, "retbins": False}
)

# Standardize features. The identifier, target and filter columns are removed
# first. `fit_transform` returns a bare array, so both the column names and the
# row index are re-attached explicitly; this keeps X aligned with y even if
# rows are filtered again above (e.g. by enabling the commented-out line).
scaler = StandardScaler()
X = df.drop(columns=["ligand_1_name", "product_1_yield", "buchwald-type"])
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Transform target into the binary class labels defined above.
y = df["product_1_yield"]
y_bin = transformer.fit_transform(y)

In [6]:
# Display the standardized feature matrix X.
X

Unnamed: 0,buried_volume_2.0A,buried_volume_3.5A,buried_volume_5.0A,buried_volume_ipso_2.5A,buried_volume_ipso_3.5A,buried_volume_ipso_4.5A,quadrant_buried_volume_ligand_max,quadrant_buried_volume_ligand_min,quadrant_buried_volume_ligand_range_max,quadrant_total_volume_ligand_max,...,max_global_nucleophilicity,max_tolman_electronic_parameter,max_bond_order_Pd_P,max_bond_order_Pd_N,max_bond_order_Pd_C,max_bond_order_Pd_O,max_mlep_Pd_P,max_mlep_Pd_N,max_mlep_Pd_C,max_mlep_Pd_O
0,0.285371,-1.202029,-1.320527,-0.118346,0.226730,0.009689,0.153794,-0.927521,0.939573,-0.122957,...,0.744060,0.183099,1.281883,0.430271,-0.754441,0.476676,1.993526,0.124783,0.748102,0.329907
1,-0.462750,-1.518449,-1.815994,-0.110179,-0.346794,-0.352019,-0.440762,-0.729263,0.629022,0.234523,...,-0.824508,-0.430706,0.741380,-0.159093,-0.615404,-1.673683,0.857208,0.457397,1.171927,-0.983345
2,-0.083801,-1.375210,-1.452982,0.061452,0.345113,0.260604,-1.177833,-1.357967,1.101556,0.239214,...,-0.951809,0.004474,1.145068,0.138395,-0.661750,0.746786,1.404833,0.436284,1.127388,0.220255
3,-0.843059,-0.893106,-1.097534,-0.181918,0.348643,0.235625,-0.372195,-0.808004,0.719629,0.281560,...,-1.140901,-0.284717,0.602876,2.245137,1.840903,-1.764889,0.876337,1.189125,-0.626603,1.210251
4,0.108172,0.166137,-0.120034,0.967950,0.914312,0.738532,-0.627027,0.018525,-0.140647,-0.487605,...,0.980052,-1.000821,-0.094711,0.271236,-0.476368,0.339867,-0.435410,0.149570,-0.754980,-0.336858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,1.817964,3.334617,2.681263,-0.848889,0.173400,0.362956,0.065273,2.197739,-2.142363,-0.409439,...,-0.141098,1.223999,-2.322598,-2.016992,-0.835545,-0.421353,-2.361452,-2.061276,-2.247384,-2.098540
92,1.558584,1.191234,0.983453,0.105934,0.781755,0.904786,0.706877,-0.887017,1.007892,1.135368,...,-0.237877,0.494703,0.648481,0.245042,-2.260667,0.080280,1.170648,0.743078,-0.383674,1.647440
93,-1.911934,0.421504,1.921386,-1.503225,-0.755635,-0.358202,0.587956,0.796486,-0.666189,-0.086968,...,-1.049333,2.510137,-0.221392,0.129040,0.218813,1.329102,-2.725775,-0.011974,-0.203626,-0.253103
94,-0.986052,0.957799,2.192589,-1.902370,-0.856476,-0.388654,0.631456,-0.515098,0.628454,1.405311,...,0.478289,1.356164,0.189053,-0.037478,0.740199,1.360673,-0.922431,0.892183,0.060698,-1.241293


In [7]:
# Show only the columns whose names do not begin with "min" or "max".
X.loc[:, [not name.startswith(("min", "max")) for name in X.columns]]

Unnamed: 0,buried_volume_2.0A,buried_volume_3.5A,buried_volume_5.0A,buried_volume_ipso_2.5A,buried_volume_ipso_3.5A,buried_volume_ipso_4.5A,quadrant_buried_volume_ligand_max,quadrant_buried_volume_ligand_min,quadrant_buried_volume_ligand_range_max,quadrant_total_volume_ligand_max,...,tolman_electronic_parameter,bond_order_Pd_P,bond_order_Pd_N,bond_order_Pd_C,bond_order_Pd_O,mlep_Pd_P,mlep_Pd_N,mlep_Pd_C,mlep_Pd_O,num_conformers
0,0.285371,-1.202029,-1.320527,-0.118346,0.226730,0.009689,0.153794,-0.927521,0.939573,-0.122957,...,0.183099,1.328640,-0.257475,-0.477423,0.063447,1.993526,0.124783,0.748102,0.329907,-0.034300
1,-0.462750,-1.518449,-1.815994,-0.110179,-0.346794,-0.352019,-0.440762,-0.729263,0.629022,0.234523,...,-0.430706,0.973793,0.183361,-0.165260,-1.179305,0.857208,0.457397,1.171927,-0.983345,-0.942658
2,-0.083801,-1.375210,-1.452982,0.061452,0.345113,0.260604,-1.177833,-1.357967,1.101556,0.239214,...,0.004474,1.165232,-0.014353,-0.532725,0.404438,1.404833,0.436284,1.127388,0.220255,-0.034300
3,-0.843059,-0.893106,-1.097534,-0.181918,0.348643,0.235625,-0.372195,-0.808004,0.719629,0.281560,...,-0.284717,0.770685,2.468098,1.512956,-1.567490,0.876337,1.189125,-0.626603,1.210251,-0.602024
4,0.108172,0.166137,-0.120034,0.967950,0.914312,0.738532,-0.627027,0.018525,-0.140647,-0.487605,...,-1.000821,-0.294553,-0.021190,-0.204087,0.034156,-0.435410,0.149570,-0.754980,-0.336858,1.441782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,1.817964,3.334617,2.681263,-0.848889,0.173400,0.362956,0.065273,2.197739,-2.142363,-0.409439,...,1.223999,-2.034414,-1.997433,-0.387892,0.476349,-2.361452,-2.061276,-2.247384,-2.098540,-0.942658
92,1.558584,1.191234,0.983453,0.105934,0.781755,0.904786,0.706877,-0.887017,1.007892,1.135368,...,0.494703,0.882585,0.657733,-1.829138,1.139539,1.170648,0.743078,-0.383674,1.647440,-0.942658
93,-1.911934,0.421504,1.921386,-1.503225,-0.755635,-0.358202,0.587956,0.796486,-0.666189,-0.086968,...,2.510137,-1.610449,-0.250327,0.175539,2.608042,-2.725775,-0.011974,-0.203626,-0.253103,-0.602024
94,-0.986052,0.957799,2.192589,-1.902370,-0.856476,-0.388654,0.631456,-0.515098,0.628454,1.405311,...,1.356164,-1.048085,0.154650,0.335985,-0.233945,-0.922431,0.892183,0.060698,-1.241293,2.804319


In [2]:
import numpy as np


def calcDrop(res: pd.DataFrame) -> list[str]:
    """Decide which variables to drop from a table of highly correlated pairs.

    Parameters
    ----------
    res : pd.DataFrame
        One row per correlated pair. The columns read here are ``"v1"`` and
        ``"v2"`` (the two variables of the pair) and ``"drop"`` (the member of
        the pair nominated for removal).

    Returns
    -------
    list[str]
        The variables to drop, sorted and without duplicates. Sorting makes
        the result reproducible across kernel restarts; the selection is
        computed with sets, whose iteration order is not stable for strings.
    """
    v1, v2, nominated = res["v1"], res["v2"], res["drop"]

    # Every variable that takes part in at least one correlated pair.
    correlated = set(v1) | set(v2)

    # Variables nominated for removal in at least one pair.
    candidates = set(nominated)

    # Variables that are never nominated are kept unconditionally.
    keep = correlated - candidates

    # Any partner of a kept variable is dropped.
    touches_keep = v1.isin(list(keep)) | v2.isin(list(keep))
    drop = (set(v1[touches_keep]) | set(v2[touches_keep])) - keep

    # Candidates that were not settled by the step above.
    candidates -= drop

    # Among pairs that involve a remaining candidate, skip those already
    # resolved (one member is being dropped) and drop the nominated member of
    # every other pair.
    involves_candidate = v1.isin(list(candidates)) | v2.isin(list(candidates))
    already_resolved = v1.isin(list(drop)) | v2.isin(list(drop))
    drop |= set(nominated[involves_candidate & ~already_resolved])

    return sorted(drop)


def corrX_new(df: pd.DataFrame, cut: float = 0.9) -> list[str]:
    """Return the names of columns to drop because they are highly correlated.

    The absolute correlation matrix of ``df`` is computed with
    ``DataFrame.corr``. For every pair of distinct columns whose absolute
    correlation is strictly greater than ``cut``, the column with the larger
    mean absolute correlation (row mean of the matrix) is nominated for
    removal; on a tie the second column of the pair is nominated. The table of
    pairs is then passed to ``calcDrop`` for the final decision.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table whose columns are compared.
    cut : float, default 0.9
        Absolute-correlation threshold above which a pair counts as
        correlated.

    Returns
    -------
    list[str]
        Column names selected for removal, as returned by ``calcDrop``.
    """
    corr_mtx = df.corr().abs()
    names = corr_mtx.columns
    corr_values = corr_mtx.to_numpy()
    avg_corr = corr_mtx.mean(axis=1).to_numpy()

    # Strict upper triangle only, so each unordered pair is visited once and
    # the diagonal is skipped. `np.nonzero` yields indices in row-major order,
    # i.e. the same order as a nested row/column loop. NaN correlations compare
    # False and are therefore ignored.
    rows, cols = np.nonzero(np.triu(corr_values > cut, k=1))

    res = []
    for row, col in zip(rows, cols):
        # Nominate the member of the pair that is, on average, more strongly
        # correlated with all columns.
        drop = names[row] if avg_corr[row] > avg_corr[col] else names[col]
        res.append(
            [
                names[row],
                names[col],
                avg_corr[row],
                avg_corr[col],
                corr_values[row, col],
                drop,
            ]
        )

    res = pd.DataFrame(res, columns=(["v1", "v2", "v1.target", "v2.target", "corr", "drop"]))

    return calcDrop(res)

In [3]:
# Toggle for the correlation filter; when False this cell leaves X and df as is.
run = True

if run:
    # Names of the columns flagged for removal at a 0.95 correlation threshold.
    drop_cols = corrX_new(X, cut=0.95)

    X = X.drop(drop_cols, axis="columns")

    # Report which columns were removed and how many.
    print(sorted(drop_cols), len(drop_cols), sep="\n")

    # Remove the same columns from the unscaled frame and write it out.
    df = df.drop(drop_cols, axis="columns")
    df.to_csv(
        "data/ligand-qsar/alkylamine-ligand-modeling.tsv",
        sep="\t",
        index=False,
    )


['P_int_ligand', 'buried_volume_ipso_3.5A', 'buried_volume_ipso_4.5A', 'cone_angle', 'distance_Pd_P', 'global_electrophilicity', 'max_P_int_ipso_carbon', 'max_bond_order_Pd_P', 'max_buried_volume_ipso_2.5A', 'max_buried_volume_ipso_4.5A', 'max_cone_angle', 'max_fukui_f_plus_aryl_carbon', 'max_global_nucleophilicity', 'max_mlep_Pd_C', 'max_mlep_Pd_N', 'max_mlep_Pd_O', 'max_mlep_Pd_P', 'max_pyramidalization_P', 'max_quadrant_buried_volume_ligand_min', 'max_sasa_ligand_volume', 'max_solid_angle', 'max_tolman_electronic_parameter', 'min_P_int_ligand', 'min_P_int_phosphine', 'min_bond_order_Pd_P', 'min_buried_volume_ipso_4.5A', 'min_cone_angle', 'min_distance_Pd_N', 'min_global_electrophilicity', 'min_mlep_Pd_C', 'min_mlep_Pd_O', 'min_mlep_Pd_P', 'min_pyramidalization_P', 'min_quadrant_buried_volume_ligand_min', 'min_quadrant_total_volume_ligand_min', 'min_sasa_ligand_area', 'min_sasa_ligand_volume', 'min_solid_angle', 'mlep_Pd_N', 'pyramidalization_alpha', 'quadrant_buried_volume_ligand_mi

In [4]:
import arfs.feature_selection.allrelevant as arfsgroot
import numpy as np
from lightgbm import LGBMClassifier

# Collects the selected-feature array produced by the selector below.
selected_features = []

# Class-balanced LightGBM classifier with a fixed seed, used as the estimator
# inside the BoostAGroota selector; feature importance is taken from SHAP.
seed = 1
model = LGBMClassifier(class_weight="balanced", verbose=-1, random_state=seed)
feat_selector = arfsgroot.BoostAGroota(importance="shap", estimator=model)

# Fit on the feature matrix against the binarized yield target.
feat_selector.fit(X, y_bin)
selected_features.extend([feat_selector.selected_features_])

# Optional diagnostic plot:
# feat_selector.plot_importance(n_feat_per_inch=3)
# plt.show()
print(selected_features)

BoostaGRoota round:   0%|          | 0/500 [00:00<?, ?it/s]

[array(['P_int_ipso_carbon', 'sterimol_ligand_L', 'homo_lumo',
       'partial_charge_phosphine', 'fukui_f_plus_aryl_carbon',
       'min_fukui_f_minus_avg_amine_proton',
       'min_tolman_electronic_parameter', 'max_buried_volume_3.5A',
       'max_buried_volume_5.0A', 'max_buried_volume_ipso_3.5A',
       'max_quadrant_total_volume_ligand_max', 'max_P_int_ligand',
       'max_sasa_ligand_area', 'max_homo_lumo',
       'max_partial_charge_amine_nitrogen',
       'max_fukui_f_plus_avg_amine_proton'], dtype=object)]
