## Feature selection

The following code removes highly correlated features from the dataset. In addition, ARFS (all-relevant feature selection) is used to select the most important features.

In [14]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Load the raw (unprocessed) ligand modeling table (tab-separated).
df = pd.read_csv("../../data/ligand-qsar/raw/alkylamine-ligand-modeling-unprocessed.tsv", sep="\t")
# Optional row filters, currently disabled:
# df = df[df["buchwald-type"] > 0]
# df = df[~df["ligand_1_name"].isin(["L-149", "L-150"])]

# Define bins and labels for binarizing the target. pd.cut uses right-closed
# intervals by default, so values <= 15 get label 0 and values > 15 get label 1.
bins = [-np.inf, 15, np.inf]
labels = [0, 1]
transformer = preprocessing.FunctionTransformer(pd.cut, kw_args={"bins": bins, "labels": labels, "retbins": False})

# Standardize features: every column except the ligand name, the target
# ("product_1_yield") and "buchwald-type".
scaler = StandardScaler()
X = df.drop(columns=["ligand_1_name", "product_1_yield", "buchwald-type"])
# Carry the original row index over to the scaled frame. Without it the result
# gets a fresh RangeIndex and would no longer line up with y / y_bin by label
# whenever rows of df have been filtered out above.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Transform target into the two classes defined by `bins` / `labels`.
y = df["product_1_yield"]
y_bin = transformer.fit_transform(y)

In [15]:
import numpy as np


def calcDrop(res: pd.DataFrame) -> list[str]:
    """Decide which variables to drop from a table of highly correlated pairs.

    Args:
        res: One row per correlated pair. Must contain the columns ``"v1"`` and
            ``"v2"`` (the two variables of the pair) and ``"drop"`` (the member
            of the pair nominated for removal). Other columns are ignored.

    Returns:
        The variables to drop, without duplicates, ordered by their first
        appearance in ``res["drop"]`` so the result is reproducible between
        runs. An empty ``res`` yields an empty list.
    """
    # Every variable that takes part in at least one correlated pair.
    pair_members = set(res["v1"].tolist()) | set(res["v2"].tolist())

    # Every variable nominated for removal in at least one pair.
    candidates = set(res["drop"].tolist())

    # Variables that were never nominated are kept unconditionally.
    keep = pair_members - candidates

    # Any partner of a kept variable has to go.
    touches_keep = res["v1"].isin(keep) | res["v2"].isin(keep)
    partners = set(res.loc[touches_keep, "v1"].tolist()) | set(res.loc[touches_keep, "v2"].tolist())
    decided = partners - keep

    # Candidates that are still undecided after the step above.
    undecided = candidates - decided

    # For pairs that involve an undecided candidate and contain no variable
    # that is already being dropped, follow the pair's own nomination. Pairs
    # that already lose a member need no further removal.
    touches_undecided = res["v1"].isin(undecided) | res["v2"].isin(undecided)
    touches_decided = res["v1"].isin(decided) | res["v2"].isin(decided)
    extra = set(res.loc[touches_undecided & ~touches_decided, "drop"].tolist())

    to_drop = decided | extra

    # Every dropped variable occurs in the "drop" column, so walking that
    # column once gives a stable, hash-independent ordering.
    return [name for name in dict.fromkeys(res["drop"].tolist()) if name in to_drop]


def corrX_new(df: pd.DataFrame, cut: float = 0.9) -> list[str]:
    """Find highly correlated columns and return the ones to drop.

    The absolute correlation matrix of ``df`` is computed with
    ``DataFrame.corr``. For every pair of distinct columns whose absolute
    correlation is strictly greater than ``cut``, the column with the larger
    mean absolute correlation to all columns is nominated for removal (on a
    tie the second column of the pair is nominated). The table of pairs is
    then passed to ``calcDrop`` to make the final decision.

    Args:
        df: Feature table; correlations are computed between its columns.
        cut: Absolute-correlation threshold above which a pair counts as
            highly correlated.

    Returns:
        The column names to drop, as returned by ``calcDrop``.
    """
    corr_mtx = df.corr().abs()
    names = corr_mtx.columns
    corr_values = corr_mtx.to_numpy()
    avg_corr = corr_mtx.mean(axis=1).to_numpy()

    # Restrict to the strict upper triangle: each unordered pair is visited
    # exactly once and the diagonal (self-correlation) is skipped. NaN
    # correlations compare False and are therefore never selected.
    above_cut = np.triu(corr_values > cut, k=1)

    res = []

    # np.nonzero walks the mask in row-major order, i.e. the same order as a
    # nested "for row / for col > row" loop.
    for row, col in zip(*np.nonzero(above_cut)):
        # Nominate the member of the pair that is, on average, more strongly
        # correlated with the rest of the columns.
        drop = names[row] if avg_corr[row] > avg_corr[col] else names[col]

        res.append(
            [
                names[row],
                names[col],
                avg_corr[row],
                avg_corr[col],
                corr_values[row, col],
                drop,
            ]
        )

    res = pd.DataFrame(res, columns=(["v1", "v2", "v1.target", "v2.target", "corr", "drop"]))

    return calcDrop(res)

In [16]:
# Toggle for this cell: set to False to skip the correlation filter and the
# file export below.
run = True

if run:
    # Columns to remove, determined on the standardized feature matrix X with
    # an absolute-correlation threshold of 0.95.
    drop_cols = corrX_new(X, cut=0.95)

    # Remove them from the standardized feature matrix.
    X = X.drop(columns=drop_cols)

    # Report which columns were dropped and how many.
    print(sorted(drop_cols))
    print(len(drop_cols))

    # Remove the same columns from the loaded table (df itself is not
    # standardized) and save the reduced table as a tab-separated file
    # without the row index.
    df = df.drop(columns=drop_cols)
    df.to_csv("../../data/ligand-qsar/alkylamine-ligand-modeling.tsv", sep="\t", index=False)


['P_int_ipso_carbon', 'P_int_ligand', 'P_int_phosphine', 'buried_volume_ipso_2.5A', 'buried_volume_ipso_3.5A', 'buried_volume_ipso_4.5A', 'cone_angle', 'distance_Pd_P', 'global_electrophilicity', 'max_bond_order_Pd_P', 'max_buried_volume_ipso_4.5A', 'max_cone_angle', 'max_fukui_f_plus_aryl_carbon', 'max_global_nucleophilicity', 'max_mlep_Pd_C', 'max_mlep_Pd_N', 'max_mlep_Pd_O', 'max_mlep_Pd_P', 'max_pyramidalization_P', 'max_quadrant_buried_volume_ligand_min', 'max_quadrant_total_volume_ligand_range_max', 'max_sasa_ligand_volume', 'max_solid_angle', 'max_tolman_electronic_parameter', 'min_P_int_ligand', 'min_bond_order_Pd_P', 'min_buried_volume_3.5A', 'min_buried_volume_5.0A', 'min_buried_volume_ipso_4.5A', 'min_cone_angle', 'min_distance_Pd_N', 'min_global_electrophilicity', 'min_mlep_Pd_C', 'min_mlep_Pd_O', 'min_pyramidalization_P', 'min_quadrant_buried_volume_ligand_min', 'min_quadrant_total_volume_ligand_min', 'min_sasa_ligand_area', 'min_sasa_ligand_volume', 'min_solid_angle', 'ml