## Logistic Regression Classifier

This notebook demonstrates how we screened all possible combinations of the descriptors to develop a logistic regression model for the classification of ligand activity. 

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv("data/ligand-qsar/alkylamine-ligand-modeling.tsv", sep="\t")
df = df[df["buchwald-type"] > 0]

# Define bins and labels
activity_cutoff = 15
bins = [-np.inf, activity_cutoff, np.inf]
labels = ["Low", "High"]
transformer = preprocessing.FunctionTransformer(
    pd.cut, kw_args={"bins": bins, "labels": labels, "retbins": False}
)

# Standardize features
scaler = StandardScaler()
X = df.drop(columns=["ligand_1_name", "product_1_yield"])
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Transform target
y = df["product_1_yield"]
y_bin = transformer.fit_transform(y)

In [3]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold


def evaluate_model(clf, X, y_bin):
    """Evaluate a model using cross-validation."""
    clf.fit(X, y_bin)
    print(X.columns)
    preds = clf.predict(X)
    print(f"Accuracy: {metrics.accuracy_score(y_bin, preds):0.2f}")
    print(f"F1: {metrics.f1_score(y_bin, preds, pos_label='High'):0.2f}")
    print(f"MCC: {metrics.matthews_corrcoef(y_bin, preds):0.2f}")
    print("Confusion matrix:\n", metrics.confusion_matrix(y_bin, preds))
    print(f"Incorrect ligands: {df.loc[y_bin != preds, 'ligand_1_name'].tolist()}")
    print()

    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    kfold_accuracies = []
    kfold_high_yield_f1s = []
    kfold_mccs = []
    for train_index, test_index in kfold.split(X, y_bin):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_bin[train_index], y_bin[test_index]

        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

        accuracy = metrics.accuracy_score(y_test, preds)
        high_yield_f1 = metrics.f1_score(y_test, preds, pos_label="High")
        mcc = metrics.matthews_corrcoef(y_test, preds)

        kfold_accuracies.append(accuracy)
        kfold_high_yield_f1s.append(high_yield_f1)
        kfold_mccs.append(mcc)

    print(
        f"K-Fold Accuracy: {np.mean(kfold_accuracies):0.2f} +/- {np.std(kfold_accuracies):0.2f}"
    )
    print(
        f"K-Fold F1: {np.mean(kfold_high_yield_f1s):0.2f} +/- {np.std(kfold_high_yield_f1s):0.2f}"
    )
    print(f"K-Fold MCC: {np.mean(kfold_mccs):0.2f} +/- {np.std(kfold_mccs):0.2f}")
    print()

In [2]:
from itertools import combinations

from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

run = True


def evaluate_combination(features: tuple):
    """Evaluate a combination of features using multiprocessing."""
    clf = LogisticRegression(random_state=42, class_weight="balanced", C=6)
    clf.fit(X[list(features)], y_bin)
    f1 = round(
        metrics.f1_score(y_bin, clf.predict(X[list(features)]), pos_label="High"), 2
    )
    if f1 < 0.7:
        return (features, f1, 0)
    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    kfold_f1s = []
    for train_index, test_index in kfold.split(X, y_bin):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_bin[train_index], y_bin[test_index]
        clf.fit(X_train[list(features)], y_train)
        preds = clf.predict(X_test[list(features)])
        f1_k = metrics.f1_score(y_test, preds, pos_label="High")
        kfold_f1s.append(f1_k)
    kfold_f1 = round(np.mean(kfold_f1s), 2)
    return (features, f1, kfold_f1)


Evaluate all features as a single parameter logistic regression

In [None]:
combinations_list = list(combinations(X.columns, 1))
print(f"Number of combinations: {len(combinations_list)}")

if run:
    results = Parallel(n_jobs=-1)(
        delayed(evaluate_combination)(features) for features in tqdm(combinations_list)
    )

    results = sorted(results, key=lambda x: x[1], reverse=True)
    for features, f1, kfold_f1 in results[:10]:
        print(f"Features: {features}, F1: {f1}, K-Fold F1: {kfold_f1}")

In [20]:
X_feat = X[list(results[0][0])]

clf = LogisticRegression(random_state=42, class_weight="balanced", C=10)
evaluate_model(clf, X_feat, y_bin)

print("Feature coefficients:")
coefficients = sorted(
    zip(X_feat.columns, clf.coef_[0]), key=lambda x: abs(x[1]), reverse=True
)
for feature, coefficient in coefficients:
    print(f"  {feature}: {coefficient:0.2f}")

print("Intercept:", clf.intercept_[0])

Index(['max_sasa_ligand_area'], dtype='object')
Accuracy: 0.93
F1: 0.70
MCC: 0.68
Confusion matrix:
 [[ 8  1]
 [ 6 81]]
Incorrect ligands: ['L-067', 'L-106', 'L-107', 'L-108', 'L-109', 'L-128', 'L-147']

K-Fold Accuracy: 0.93 +/- 0.06
K-Fold F1: 0.74 +/- 0.18
K-Fold MCC: 0.72 +/- 0.21

Feature coefficients:
  max_sasa_ligand_area: -3.93
Intercept: 3.8428755503895218


Evaluate all features as a two parameter logistic regression

In [22]:
from itertools import combinations

from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

run = True

combinations_list = list(combinations(X.columns, 2))
print(f"Number of combinations: {len(combinations_list)}")

if run:
    results = Parallel(n_jobs=-1)(
        delayed(evaluate_combination)(features) for features in tqdm(combinations_list)
    )

    results = sorted(results, key=lambda x: x[1], reverse=True)
    for features, f1, kfold_f1 in results[:10]:
        print(f"Features: {features}, F1: {f1}, K-Fold F1: {kfold_f1}")

Number of combinations: 11628


100%|██████████| 11628/11628 [00:10<00:00, 1155.77it/s]


Features: ('sterimol_ligand_L', 'max_sasa_ligand_area'), F1: 0.86, K-Fold F1: 0.82
Features: ('quadrant_buried_volume_ligand_range_max', 'max_sasa_ligand_area'), F1: 0.82, K-Fold F1: 0.75
Features: ('homo_lumo', 'max_sasa_ligand_area'), F1: 0.82, K-Fold F1: 0.84
Features: ('partial_charge_carbonyl_carbon', 'max_sasa_ligand_area'), F1: 0.82, K-Fold F1: 0.84
Features: ('fukui_f_minus_aryl_carbon', 'max_sasa_ligand_area'), F1: 0.82, K-Fold F1: 0.81
Features: ('min_quadrant_buried_volume_ligand_range_max', 'max_sasa_ligand_area'), F1: 0.82, K-Fold F1: 0.83
Features: ('min_fukui_f_minus_avg_amine_proton', 'max_sasa_ligand_area'), F1: 0.82, K-Fold F1: 0.86
Features: ('max_P_int_ligand', 'max_sasa_ligand_area'), F1: 0.82, K-Fold F1: 0.83
Features: ('max_sasa_ligand_area', 'max_sterimol_ligand_L'), F1: 0.82, K-Fold F1: 0.87
Features: ('max_sasa_ligand_area', 'max_distance_carb_OH_O'), F1: 0.82, K-Fold F1: 0.72


In [23]:
X_feat = X[list(results[0][0])]

clf = LogisticRegression(random_state=42, class_weight="balanced", C=10)
evaluate_model(clf, X_feat, y_bin)

print("Feature coefficients:")
coefficients = sorted(
    zip(X_feat.columns, clf.coef_[0]), key=lambda x: abs(x[1]), reverse=True
)
for feature, coefficient in coefficients:
    print(f"  {feature}: {coefficient:0.2f}")

print("Intercept:", clf.intercept_[0])

Index(['sterimol_ligand_L', 'max_sasa_ligand_area'], dtype='object')
Accuracy: 0.97
F1: 0.86
MCC: 0.85
Confusion matrix:
 [[ 9  0]
 [ 3 84]]
Incorrect ligands: ['L-067', 'L-106', 'L-107']

K-Fold Accuracy: 0.96 +/- 0.03
K-Fold F1: 0.82 +/- 0.12
K-Fold MCC: 0.82 +/- 0.12

Feature coefficients:
  max_sasa_ligand_area: -3.74
  sterimol_ligand_L: -0.63
Intercept: 4.397759060313071


Evaluate all features as a three parameter logistic regression

In [4]:
from itertools import combinations

from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

run = True

combinations_list = list(combinations(X.columns, 3))
print(f"Number of combinations: {len(combinations_list)}")

if run:
    results = Parallel(n_jobs=-1)(
        delayed(evaluate_combination)(features) for features in tqdm(combinations_list)
    )

    results = sorted(results, key=lambda x: x[1], reverse=True)
    for features, f1, kfold_f1 in results[:10]:
        print(f"Features: {features}, F1: {f1}, K-Fold F1: {kfold_f1}")

Number of combinations: 585276


100%|██████████| 585276/585276 [09:09<00:00, 1064.54it/s]


Features: ('min_distance_Pd_P', 'max_sasa_ligand_area', 'max_sterimol_ligand_B5'), F1: 0.95, K-Fold F1: 0.83
Features: ('buried_volume_3.5A', 'quadrant_buried_volume_ligand_range_max', 'max_sasa_ligand_area'), F1: 0.9, K-Fold F1: 0.87
Features: ('quadrant_buried_volume_ligand_range_max', 'max_buried_volume_3.5A', 'max_sasa_ligand_area'), F1: 0.9, K-Fold F1: 0.87
Features: ('sterimol_ligand_L', 'max_P_int_ligand', 'max_sasa_ligand_area'), F1: 0.9, K-Fold F1: 0.83
Features: ('buried_sterimol_ligand_L', 'min_fukui_f_minus_avg_amine_proton', 'max_sasa_ligand_area'), F1: 0.9, K-Fold F1: 0.78
Features: ('buried_sterimol_ligand_B5', 'bond_order_Pd_P', 'max_sasa_ligand_area'), F1: 0.9, K-Fold F1: 0.89
Features: ('distance_Pd_N', 'max_buried_volume_5.0A', 'max_buried_volume_ipso_3.5A'), F1: 0.9, K-Fold F1: 0.59
Features: ('distance_avg_amine_N_H', 'min_fukui_f_minus_avg_amine_proton', 'max_sasa_ligand_area'), F1: 0.9, K-Fold F1: 0.89
Features: ('homo_lumo', 'min_fukui_f_minus_avg_amine_proton',

In [5]:
X_feat = X[list(results[0][0])]

clf = LogisticRegression(random_state=42, class_weight="balanced", C=10)
evaluate_model(clf, X_feat, y_bin)

print("Feature coefficients:")
coefficients = sorted(
    zip(X_feat.columns, clf.coef_[0]), key=lambda x: abs(x[1]), reverse=True
)
for feature, coefficient in coefficients:
    print(f"  {feature}: {coefficient:0.2f}")

print("Intercept:", clf.intercept_[0])

Index(['min_distance_Pd_P', 'max_sasa_ligand_area', 'max_sterimol_ligand_B5'], dtype='object')
Accuracy: 0.99
F1: 0.95
MCC: 0.94
Confusion matrix:
 [[ 9  0]
 [ 1 86]]
Incorrect ligands: ['L-107']

K-Fold Accuracy: 0.97 +/- 0.03
K-Fold F1: 0.83 +/- 0.17
K-Fold MCC: 0.84 +/- 0.16

Feature coefficients:
  max_sasa_ligand_area: -5.00
  min_distance_Pd_P: 1.56
  max_sterimol_ligand_B5: 0.91
Intercept: 5.0962473425312576
