In [None]:
#import external libraries
import joblib
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import MACCSkeys
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate

In [None]:
#in-house functions
def rdkit_numpy_convert(fp):
    """Convert an iterable of RDKit fingerprints into a numpy array.

    Parameters
    ----------
    fp : iterable
        Fingerprint objects accepted by ``DataStructs.ConvertToNumpyArray``.

    Returns
    -------
    numpy.ndarray
        The per-fingerprint arrays stacked with ``np.asarray``, in input
        order. An empty input yields an empty array.
    """
    def _as_array(bit_vect):
        # One-element float buffer handed to RDKit; ConvertToNumpyArray is
        # expected to resize it to the fingerprint length -- confirm against
        # the RDKit docs if the output shape matters.
        buffer = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, buffer)
        return buffer

    return np.asarray([_as_array(bit_vect) for bit_vect in fp])

In [None]:
#import data
# Location of the external (unseen) dataset CSV.
fname = 'D:\\modls\\data_unseen_dataset_natura.csv'
# Load it into a DataFrame; the cells below read its 'SMILES' and
# 'Ames Global' columns.
externalset = pd.read_csv(filepath_or_buffer=fname)

In [None]:
#import model
# NOTE: joblib.load unpickles arbitrary Python objects -- only load model
# files that come from a trusted source.
with open(f"D:\\modls\\model_binary_lgbm_maccs_fp.joblib", 'rb') as f:
    opt_model = joblib.load(f)

#calculate fp
mols = [Chem.MolFromSmiles(smile) for smile in externalset['SMILES']]

# MolFromSmiles returns None for SMILES it cannot parse; passing None on to
# GenMACCSKeys fails with an opaque error, so report the offending rows here.
bad_rows = [idx for idx, mol in zip(externalset.index, mols) if mol is None]
if bad_rows:
    raise ValueError(
        f"RDKit could not parse the SMILES in {len(bad_rows)} row(s) of externalset "
        f"(first row indices: {bad_rows[:10]}); fix or drop them before predicting."
    )

fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]

#define x_ext and y_ext
x_ext = rdkit_numpy_convert(fps)
# Binary target: 1 when 'Ames Global' is 'positive' (case-insensitive), else 0.
y_ext = [1 if activity.lower() == 'positive' else 0 for activity in externalset['Ames Global']]

#predict
# Output of the loaded model's .predict() for every external compound.
probs_external = opt_model.predict(x_ext)

#add the predicted value to original dataset
# Kept as a one-column DataFrame (column label 0) because it is reused below.
predict_y = pd.DataFrame(np.asarray(probs_external))
externalset['original_y'] = y_ext
# Assign by position rather than by index alignment so the predictions stay
# attached to the right rows even if externalset has a non-default index.
externalset['predict_y'] = predict_y[0].to_numpy()
#calculate confusion matrix
def confusion_matrix_scorer(clf, X, y):
    """Score a fitted binary classifier by its confusion-matrix counts.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict``.
    X : array-like
        Features to predict on.
    y : array-like
        True labels, encoded as 0 (negative) and 1 (positive).

    Returns
    -------
    dict
        Counts under the keys ``'tn'``, ``'fp'``, ``'fn'`` and ``'tp'``.
    """
    y_pred = clf.predict(X)
    # Pin the label order so the matrix is always 2x2; without `labels`, a
    # fold containing a single class gives a 1x1 matrix and the indexing
    # below would raise IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}

# NOTE(review): cross_validate clones and re-fits `opt_model` on every fold,
# and the feature matrix passed here is `predict_y` (the labels predicted
# above), not the fingerprints in `x_ext`. The counts below therefore do not
# describe the loaded model's predictions on the external set -- confirm
# whether confusion_matrix(y_ext, probs_external) was intended instead.
confus_matrix = cross_validate(opt_model, predict_y, y_ext, cv=5, scoring=confusion_matrix_scorer)

# One single-column frame per confusion-matrix cell, one row per CV fold.
# Each column is named after its cell; previously all four were renamed to
# 'fold_1', which made the saved CSV columns indistinguishable.
tn, fp, fn, tp = (
    pd.DataFrame({cell: np.asarray(confus_matrix[f'test_{cell}'])})
    for cell in ('tn', 'fp', 'fn', 'tp')
)

confusion = [tn, fp, fn, tp]
confusion_final = pd.concat(confusion, axis=1)
# Label the rows by fold (fold_1 ... fold_k) so the CSV is self-describing.
confusion_final.index = [f'fold_{i}' for i in range(1, len(confusion_final) + 1)]

#save
externalset.to_csv(f'D:\\modls\\unseen_data_with_ypred.csv')
confusion_final.to_csv(f'D:\\modls\\unseen_data_confusionmatrix_with_ypred.csv')
