# Importing Libraries

In [None]:
# Data handling and Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Performance evaluation
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import \
                accuracy_score, \
                classification_report, \
                cohen_kappa_score, \
                matthews_corrcoef, \
                confusion_matrix, \
                roc_auc_score

from sklearn import metrics
# Model Saving
import pickle,os

#  Machine Learning Algorithm libraries

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# Performance Evaluation Function

In [None]:
def evaluate_performance(y_true,y_pred,pred_prob='NA'):
    """Compute binary-classification metrics for one set of predictions.

    Per-class precision/recall/F1 are derived directly from the confusion
    matrix instead of being parsed out of the text produced by
    ``classification_report`` (which rounds to two decimals and breaks on
    labels containing whitespace). Undefined ratios (zero denominator) are
    reported as 0.0.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels. Exactly two distinct labels are expected across
        ``y_true`` and ``y_pred``; otherwise unpacking the confusion matrix
        raises ``ValueError``.
    y_pred : sequence
        Predicted labels.
    pred_prob : sequence of float or the string 'NA'
        Scores passed to ``roc_auc_score``. The sentinel string 'NA'
        (default) skips the ROC AUC computation.

    Returns
    -------
    dict
        Keys: precision_0, precision_1, recall_0, recall_1, f1_0, f1_1,
        accuracy, balanced_accuracy, cohen_kappa, matthews_corrcoef_score,
        roc_auc_score, tn, fp, fn, tp. The ``_0`` / ``_1`` suffixes refer to
        the first / second label in sorted order.
    """
    def _ratio(numerator, denominator):
        # 0.0 when the ratio is undefined (empty denominator).
        return float(numerator) / float(denominator) if denominator else 0.0

    def _f1(precision, recall):
        return _ratio(2 * precision * recall, precision + recall)

    # Rows are true labels, columns are predicted labels, both in sorted order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    precision_0 = _ratio(tn, tn + fn)
    recall_0 = _ratio(tn, tn + fp)
    f1_0 = _f1(precision_0, recall_0)

    precision_1 = _ratio(tp, tp + fp)
    recall_1 = _ratio(tp, tp + fn)
    f1_1 = _f1(precision_1, recall_1)

    balanced_accuracy = (recall_0 + recall_1)/2
    acc_score = accuracy_score(y_true,y_pred)
    cohen_kappa = cohen_kappa_score(y_true,y_pred)
    matthews_corrcoef_score = matthews_corrcoef(y_true,y_pred)

    # Test for the sentinel with isinstance first: comparing an ndarray with
    # a string is elementwise and cannot be used as a plain boolean.
    if isinstance(pred_prob, str) and pred_prob == 'NA':
        roc_auc = pred_prob
    else:
        roc_auc = roc_auc_score(y_true,pred_prob)

    return dict(
        precision_0 = precision_0,
        precision_1 = precision_1,
        recall_0 = recall_0,
        recall_1 = recall_1,
        f1_0 = f1_0,
        f1_1 = f1_1,
        accuracy = acc_score,
        balanced_accuracy = balanced_accuracy,
        cohen_kappa = cohen_kappa,
        matthews_corrcoef_score = matthews_corrcoef_score,
        roc_auc_score = roc_auc,

        tn = tn,
        fp = fp,
        fn = fn,
        tp = tp
    )

# GHOST Function

In [None]:
def optimize_threshold_train_subset(cls, fps_train, labels_train, thresholds, 
                                    ThOpt_metrics = 'Kappa', N_subsets = 100, 
                                    subsets_size = 0.2, with_replacement = False, random_seed = None):

    """Optimize the decision threshold based on subsets of the training set.
    The threshold that maximizes the Cohen's kappa coefficient or a ROC-based criterion 
    on the training subsets is chosen as optimal.
    
    Parameters
    ----------
    cls : obj
        Trained machine learning classifier built using scikit-learn
    fps_train: list 
        Molecular descriptors for the training set
    labels_train: list of int
        True labels for the training set
    thresholds: list of floats
        List of decision thresholds to screen for classification
    ThOpt_metrics: str
        Optimization metric. Choose between "Kappa" and "ROC"
    N_subsets: int
        Number of training subsets to use in the optimization
    subsets_size: float or int
        Size of the subsets. if float, represents the proportion of the dataset to include in the subsets. 
        If integer, it represents the actual number of instances to include in the subsets. 
    with_replacement: bool
        The subsets are drawn randomly. True to draw the subsets with replacement
    random_seed: int    
        random number to seed the drawing of the subsets
    
    Returns
    ----------
    thresh: float
        Optimal decision threshold for classification

    Raises
    ------
    ValueError
        If ``ThOpt_metrics`` is neither "Kappa" nor "ROC".
    TypeError
        If ``with_replacement`` is True and ``subsets_size`` is neither a
        float nor an int.

    Notes
    -----
    Seeds NumPy's global random generator via ``np.random.seed(random_seed)``.
    """
    # Fail early with a clear message instead of an UnboundLocalError at the end.
    if ThOpt_metrics not in ('Kappa', 'ROC'):
        raise ValueError(
            f"ThOpt_metrics must be 'Kappa' or 'ROC', got {ThOpt_metrics!r}"
        )

    # seeding: one derived seed per subset
    np.random.seed(random_seed)
    random_seeds = np.random.randint(N_subsets*10, size=N_subsets)  
    
    # calculate prediction probability for the training set
    probs_train = cls.predict_proba(fps_train)[:,1]
    labels_train_thresh = {'labels': labels_train, 'probs': probs_train}
    # recalculate the predictions for the training set using different thresholds and
    # store the predictions in a dataframe (one column per threshold)
    for thresh in thresholds:
        labels_train_thresh[str(thresh)] = [1 if x >= thresh else 0 for x in probs_train]
    df_preds = pd.DataFrame(labels_train_thresh)
    thresh_names = [str(thresh) for thresh in thresholds]

    # The subset size for sampling with replacement does not depend on the iteration.
    if with_replacement:
        if isinstance(subsets_size, (float, np.floating)):
            Nsamples = int(df_preds.shape[0]*subsets_size)
        elif isinstance(subsets_size, (int, np.integer)):
            Nsamples = subsets_size
        else:
            raise TypeError(
                f"subsets_size must be a float or an int, got {type(subsets_size).__name__}"
            )

    def _draw_subset(subset_seed):
        # Draw one stratified subset of df_preds; returns (rows, true labels of those rows).
        if with_replacement:
            df_subset = resample(df_preds, n_samples = Nsamples, stratify=list(df_preds.labels), random_state = subset_seed)
            labels_subset = list(df_subset['labels'])
        else:
            df_tmp, df_subset, labels_tmp, labels_subset = train_test_split(df_preds, labels_train, test_size = subsets_size, stratify = labels_train, random_state = subset_seed)
        return df_subset, labels_subset

    # Optimize the decision threshold based on the Cohen's Kappa coefficient
    if ThOpt_metrics == 'Kappa':
        # pick N_subsets training subsets and compute the kappa of every threshold on each of them
        kappa_accum = []
        for subset_seed in random_seeds:
            df_subset, labels_subset = _draw_subset(subset_seed)
            kappa_accum.append([
                metrics.cohen_kappa_score(labels_subset, list(df_subset[col1]))
                for col1 in thresh_names
            ])
        # determine the threshold that provides the best results on the training subsets
        y_values_median, y_values_std = helper_calc_median_std(kappa_accum)
        opt_thresh = thresholds[np.argmax(y_values_median)]
    # Optimize the decision threshold based on the ROC-curve, as described here https://doi.org/10.1007/s11548-013-0913-8
    else:
        # Sorted so that the confusion-matrix layout (tn, fp, fn, tp) does not
        # depend on set iteration order.
        label_order = sorted(set(labels_train))
        sensitivity_accum = []
        specificity_accum = []
        # Calculate sensitivity and specificity for a range of thresholds and N_subsets
        for subset_seed in random_seeds:
            df_subset, labels_subset = _draw_subset(subset_seed)
            probs_subset = list(df_subset['probs'])
            sensitivity = []
            specificity = []
            for thresh in thresholds:
                scores = [1 if x >= thresh else 0 for x in probs_subset]
                tn, fp, fn, tp = metrics.confusion_matrix(labels_subset, scores, labels=label_order).ravel()
                sensitivity.append(tp/(tp+fn))
                specificity.append(tn/(tn+fp))
            sensitivity_accum.append(sensitivity)
            specificity_accum.append(specificity)
        # determine the threshold that provides the best results on the training subsets
        median_sensitivity, std_sensitivity = helper_calc_median_std(sensitivity_accum)
        median_specificity, std_specificity = helper_calc_median_std(specificity_accum)
        # harmonic mean of the median sensitivity and median specificity
        roc_dist_01corner = (2*median_sensitivity*median_specificity)/(median_sensitivity+median_specificity)
        opt_thresh = thresholds[np.argmax(roc_dist_01corner)]
    return opt_thresh


def helper_calc_median_std(specificity):
    """Return the column-wise median and standard deviation of a 2-D table.

    ``specificity`` is converted with ``np.array`` (one row per subset, one
    column per threshold in the callers of this helper); both statistics are
    taken along axis 0.
    """
    table = np.array(specificity)
    return np.median(table, axis=0), np.std(table, axis=0)

# M-Tune Class

In [None]:
class mtune:
    """Threshold-tuned wrapper around a binary probabilistic classifier.

    Two modes are supported through ``method``:

    * ``'direct'``   - fit ``base_model`` once and use the mean predicted
      minority-class probability on the training data as decision threshold.
    * ``'ensemble'`` - split the majority class into ``n_ensemble`` chunks,
      pair each chunk with the full minority class, fit an independent copy
      of ``base_model`` on every pair (each with its own mean-probability
      threshold) and combine the members by majority vote.

    ``fit`` expects ``X`` as a pandas DataFrame and ``y`` as a pandas Series.
    """

    def __init__(self, base_model, method='ensemble', n_ensemble=11,random_state = None):
        self.base_model = base_model
        self.method = method
        self.n_ensemble = n_ensemble
        # Decision threshold of the 'direct' method (set by fit).
        self.threshold = None
        # (label, count) tuples, set by fit.
        self.majority_class = None
        self.minority_class = None
        self.merged_data = None
        # Seed used when shuffling each ensemble training subset.
        self.random_state = random_state
        # threshold -> fitted model, one entry per ensemble member.
        self.thres_to_model_mapping = dict()

    @staticmethod
    def _proba_column(model, label):
        """Return the ``predict_proba`` column that belongs to ``label``.

        Uses ``model.classes_`` when the model exposes it; otherwise falls
        back to using the label itself as column index (labels 0/1).
        """
        classes = getattr(model, 'classes_', None)
        if classes is not None:
            matches = np.flatnonzero(np.asarray(classes) == label)
            if matches.size:
                return int(matches[0])
        return label

    def fit(self, X, y,verbose='off'):
        """Fit the wrapped model(s) and derive the decision threshold(s).

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : pandas.Series
            Training labels (binary).
        verbose : str
            'on' prints the selected method (and the threshold for 'direct').
        """
        import copy  # local import: only needed to copy ensemble members

        val_counts = y.value_counts()

        self.majority_class = ( val_counts.idxmax(), val_counts.max() )
        self.minority_class = ( val_counts.idxmin(), val_counts.min() )
        # With perfectly balanced classes idxmax and idxmin return the same
        # label; pick the last label instead so the two roles stay distinct.
        if len(val_counts) > 1 and self.minority_class[0] == self.majority_class[0]:
            self.minority_class = ( val_counts.index[-1], val_counts.iloc[-1] )

        # Drop state left over from a previous fit.
        self.threshold = None
        self.thres_to_model_mapping = dict()

        if self.method == 'direct':

            self.base_model.fit(X,y)
            pred_prob = np.asarray(self.base_model.predict_proba(X))
            column = self._proba_column(self.base_model, self.minority_class[0])

            mean_value = np.mean(pred_prob[:, column])
            self.threshold = mean_value
            if verbose == 'on':
                print('method =',self.method)
                print('threshold :',mean_value)

        elif self.method == 'ensemble':
            if verbose == 'on':
                print('method =',self.method)

            labels = y.to_numpy()
            majority_positions = np.flatnonzero(labels == self.majority_class[0])
            minority_positions = np.flatnonzero(labels == self.minority_class[0])

            # Split row positions rather than the DataFrame itself.
            for chunk in np.array_split(majority_positions, self.n_ensemble):
                if chunk.size == 0:
                    # More ensemble members requested than majority rows:
                    # a minority-only training set would be single-class.
                    continue

                # ----- Pairing majority class subset with minority class ----- #
                positions = np.concatenate([chunk, minority_positions])
                shuffled = pd.Series(positions).sample(
                    frac=1, random_state=self.random_state
                ).to_numpy()
                X_train = X.iloc[shuffled]
                y_train = y.iloc[shuffled]

                # Each member must be its own object; storing base_model
                # itself would make every entry point at the last fit.
                model = copy.deepcopy(self.base_model)
                model.fit(X_train, y_train)

                # Mean minority-class probability on the member's own training data.
                pred_prob = np.asarray(model.predict_proba(X_train))
                column = self._proba_column(model, self.minority_class[0])
                mean_value = np.mean(pred_prob[:, column])
                # NOTE: members with exactly equal thresholds overwrite each other.
                self.thres_to_model_mapping[mean_value] = model

        else :
            print('Choose a valid method')

    def predict(self,X):
        """Predict class labels for ``X``.

        Returns a list for the 'direct' method and a pandas Series for the
        'ensemble' method. The minority label is chosen by the ensemble only
        when strictly more members vote for it than for the majority label.
        For an unknown ``method`` a message is printed and None is returned.
        """
        if self.method == 'direct':
            column = self._proba_column(self.base_model, self.minority_class[0])
            pred_prob_minority = np.asarray(self.base_model.predict_proba(X))[:, column].tolist()
            y_pred = [ self.minority_class[0] if p >= self.threshold else self.majority_class[0] for p in pred_prob_minority ]
            return y_pred

        elif self.method == 'ensemble':
            minority_votes = 0
            for threshold, model in self.thres_to_model_mapping.items():
                pred_prob = np.asarray(model.predict_proba(X))
                column = self._proba_column(model, self.minority_class[0])
                minority_votes = minority_votes + (pred_prob[:, column] >= threshold).astype(int)

            # Every member votes for exactly one of the two labels.
            majority_votes = len(self.thres_to_model_mapping) - minority_votes
            final = np.where(
                minority_votes > majority_votes,
                self.minority_class[0],
                self.majority_class[0],
            )
            return pd.Series(final)

        else : 
            print('Please pass either "ensemble" or "direct" in "method" parameter ')

    def predict_proba(self,X):
        """Return class probabilities for ``X`` as a 2-column array.

        'direct' returns ``base_model.predict_proba(X)`` unchanged.
        'ensemble' averages the members' probabilities; the two columns are
        ordered by class label (smaller label first). For an unknown
        ``method`` a message is printed and None is returned.
        """
        if self.method == 'direct':
            pred_probs = self.base_model.predict_proba(X)
            return pred_probs
        elif self.method == 'ensemble':
            minority_probs = []
            majority_probs = []
            for model in self.thres_to_model_mapping.values():
                # One predict_proba call per member is enough for both columns.
                pred_prob = np.asarray(model.predict_proba(X))
                minority_probs.append(pred_prob[:, self._proba_column(model, self.minority_class[0])])
                majority_probs.append(pred_prob[:, self._proba_column(model, self.majority_class[0])])

            # --------- FINAL PREDICTION PROBABILITY CALCULATION --------- #
            final_minority = np.mean(minority_probs, axis=0)
            final_majority = np.mean(majority_probs, axis=0)

            if self.minority_class[0] < self.majority_class[0]:
                return np.column_stack([final_minority, final_majority])
            return np.column_stack([final_majority, final_minority])
        else :
            print('Please pass either "ensemble" or "direct" in "method" parameter ')

# Initialising Classifiers

In [None]:
# Shared random seed: used for both classifiers' random_state and for the
# shuffling / train-test split in the training cell.
seed = 42

## XGBoost

In [None]:
# Keyword arguments for XGBClassifier; reused in the training cell to build a
# fresh model per fingerprint file.
# NOTE(review): 'use_label_encoder' may be deprecated or ignored depending on
# the installed xgboost version - confirm against the version in use.
xgb_params = dict(
    booster='gbtree',
    learning_rate=0.3,
    n_estimators=100,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    objective='binary:logistic',
    eval_metric=None,
    random_state=seed,
    use_label_encoder=True,
)
xgb_model = XGBClassifier(**xgb_params)
#xgb_model.get_params()

## CatBoost

In [None]:
# Keyword arguments for CatBoostClassifier; reused in the training cell to
# build a fresh model per fingerprint file.
cat_params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'depth': 6,
    'l2_leaf_reg': 3,
    'random_state': seed,
    'loss_function': 'Logloss',
    'custom_metric': None,
    'eval_metric': None,
    'logging_level': 'Silent',
    'random_strength': 1,
    'bagging_temperature': 1,
    'od_type': 'Iter',
    'od_wait': 100,
    'allow_writing_files': True,
}

cat_model = CatBoostClassifier(**cat_params)
#cat_model.get_params()

# Useful Functions

### Standard

In [None]:
def perform_standard(classifier,classifier_name,X_train,y_train,performance_df,pkl_dir,pkl_prefix,
                     X_test=None,y_test=None):
    """Fit ``classifier`` and evaluate it with the standard 0.5 threshold.

    The classifier is fitted on the training data, evaluated on the training
    and the test data, and pickled to
    ``<pkl_dir>/<pkl_prefix>_<classifier_name>_standard.pkl``.

    Parameters
    ----------
    classifier : estimator with ``fit`` / ``predict_proba``
    classifier_name : str
        Used in the ``method`` field and in the pickle file name.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Training data.
    performance_df : pandas.DataFrame
        Modified in place: one new column per evaluated dataset.
    pkl_dir, pkl_prefix : str
        Output directory and file-name prefix for the pickled model.
    X_test, y_test : optional
        Test data. When omitted, the module-level ``X_test`` / ``y_test``
        variables are used (legacy behaviour).

    Returns
    -------
    tuple
        ``(classifier, performance_df)`` - the fitted classifier and the
        updated performance table.
    """
    # Backward compatibility: earlier versions read the test set from globals.
    try:
        X_test = globals()['X_test'] if X_test is None else X_test
        y_test = globals()['y_test'] if y_test is None else y_test
    except KeyError as missing:
        raise NameError(f'{missing.args[0]} is not defined; pass X_test and y_test explicitly') from None

    method_name = f'{classifier_name}_standard'
    threshold = 0.5

    classifier.fit(X_train,y_train)

    for dataset, features, labels in (('training', X_train, y_train), ('testing', X_test, y_test)):
        # Probability of the positive class (second predict_proba column).
        pred_prob = classifier.predict_proba(features)[:,1].tolist()
        y_pred = [1 if x>=threshold else 0 for x in pred_prob]

        performance = dict(dataset = dataset, method = method_name)

        # Number of samples per class, keyed by class label.
        class_counts = labels.value_counts()
        performance.update({ idx : class_counts[idx] for idx in class_counts.index })

        performance.update(dict(threshold = threshold))
        performance.update(evaluate_performance(
            y_true = labels.tolist(),
            y_pred = y_pred,
            pred_prob = pred_prob
        ))
        # Append as a new column, labelled with the current column count.
        performance_df[performance_df.shape[1]] = performance

    pkl_path = os.path.join(pkl_dir,f'{pkl_prefix}_{classifier_name}_standard.pkl')
    # Serialize the object and save it to a file
    with open(pkl_path, 'wb') as file:
        pickle.dump(classifier, file)

    return classifier, performance_df

### GHOST

In [None]:
def perform_ghost(classifier,classifier_name,X_train,y_train,performance_df,pkl_dir,pkl_prefix,
                  X_test=None,y_test=None):
    """Evaluate an already fitted ``classifier`` with a GHOST-optimized threshold.

    The threshold is selected by ``optimize_threshold_train_subset`` (Kappa
    metric, 100 stratified subsets of 20% of the training set, thresholds
    0.05 ... 0.50) and then applied to the training and the test data. The
    classifier is not refitted here. It is pickled to
    ``<pkl_dir>/<pkl_prefix>_<classifier_name>_ghost_<threshold>.pkl``.

    Parameters
    ----------
    classifier : fitted estimator with ``predict_proba``
    classifier_name : str
        Used in the ``method`` field and in the pickle file name.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Training data.
    performance_df : pandas.DataFrame
        Modified in place: one new column per evaluated dataset.
    pkl_dir, pkl_prefix : str
        Output directory and file-name prefix for the pickled model.
    X_test, y_test : optional
        Test data. When omitted, the module-level ``X_test`` / ``y_test``
        variables are used (legacy behaviour).

    Returns
    -------
    pandas.DataFrame
        The updated ``performance_df``.
    """
    # Backward compatibility: earlier versions read the test set from globals.
    try:
        X_test = globals()['X_test'] if X_test is None else X_test
        y_test = globals()['y_test'] if y_test is None else y_test
    except KeyError as missing:
        raise NameError(f'{missing.args[0]} is not defined; pass X_test and y_test explicitly') from None

    # ------------------------------------------ GHOST ------------------------------------------------------#
    # ---parameters for threshold optimization - we use default values for most parameters
    thresholds = np.round(np.arange(0.05,0.55,0.05),2)
    random_seed = 42

    #these are default:
    ThOpt_metrics = 'Kappa'
    N_subsets = 100
    subsets_size = 0.2
    with_replacement = False

    # Can be used for every machine learning model
    thresh_sub = optimize_threshold_train_subset(cls=classifier, fps_train=X_train, labels_train=y_train, thresholds=thresholds,
                                                 ThOpt_metrics = ThOpt_metrics,
                                                 N_subsets = N_subsets, subsets_size = subsets_size,
                                                 with_replacement = with_replacement, random_seed = random_seed)

    method_name = f'{classifier_name}_ghost'

    for dataset, features, labels in (('training', X_train, y_train), ('testing', X_test, y_test)):
        # Probability of the positive class (second predict_proba column).
        pred_prob = classifier.predict_proba(features)[:,1].tolist()
        y_pred = [1 if x>= thresh_sub else 0 for x in pred_prob]

        performance = dict(dataset = dataset, method = method_name)

        # Number of samples per class, keyed by class label.
        class_counts = labels.value_counts()
        performance.update({ idx : class_counts[idx] for idx in class_counts.index })

        performance.update(dict(threshold = thresh_sub))
        performance.update(evaluate_performance(
            y_true = labels.tolist(),
            y_pred = y_pred,
            pred_prob = pred_prob
        ))
        # Append as a new column, labelled with the current column count.
        performance_df[performance_df.shape[1]] = performance

    pkl_path = os.path.join(pkl_dir,f'{pkl_prefix}_{classifier_name}_ghost_{thresh_sub}.pkl')
    # Serialize the object and save it to a file
    with open(pkl_path, 'wb') as file:
        pickle.dump(classifier, file)

    return performance_df




### Direct Mtune

In [None]:
def perform_direct_mtune(classifier,classifier_name,X_train,y_train,performance_df,pkl_dir,pkl_prefix,
                         X_test=None,y_test=None):
    """Fit and evaluate the 'direct' M-Tune wrapper around ``classifier``.

    An ``mtune`` model with ``method='direct'`` is fitted on the training
    data, evaluated on the training and the test data, and pickled to
    ``<pkl_dir>/<pkl_prefix>_<classifier_name>_direct_mtune_<threshold>.pkl``.

    Parameters
    ----------
    classifier : estimator with ``fit`` / ``predict_proba``
    classifier_name : str
        Used in the ``method`` field and in the pickle file name.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Training data.
    performance_df : pandas.DataFrame
        Modified in place: one new column per evaluated dataset.
    pkl_dir, pkl_prefix : str
        Output directory and file-name prefix for the pickled model.
    X_test, y_test : optional
        Test data. When omitted, the module-level ``X_test`` / ``y_test``
        variables are used (legacy behaviour).

    Returns
    -------
    pandas.DataFrame
        The updated ``performance_df``.
    """
    # Backward compatibility: earlier versions read the test set from globals.
    try:
        X_test = globals()['X_test'] if X_test is None else X_test
        y_test = globals()['y_test'] if y_test is None else y_test
    except KeyError as missing:
        raise NameError(f'{missing.args[0]} is not defined; pass X_test and y_test explicitly') from None

    direct_mtune_model = mtune( classifier, method='direct',random_state = 42)
    direct_mtune_model.fit(X_train,y_train)

    opt_thres = direct_mtune_model.threshold
    method_name = f'{classifier_name}_direct_mtune'

    for dataset, features, labels in (('training', X_train, y_train), ('testing', X_test, y_test)):
        y_pred = direct_mtune_model.predict(features)
        # Second predict_proba column is used as the score for ROC AUC.
        pred_prob = direct_mtune_model.predict_proba(features)[:,1].tolist()

        performance = dict(dataset = dataset, method = method_name)

        # Number of samples per class, keyed by class label.
        class_counts = labels.value_counts()
        performance.update({ idx : class_counts[idx] for idx in class_counts.index })

        performance.update(dict(threshold = opt_thres))
        performance.update(evaluate_performance(
            y_true = labels.tolist(),
            y_pred = list(y_pred),
            pred_prob = pred_prob
        ))
        # Append as a new column, labelled with the current column count.
        performance_df[performance_df.shape[1]] = performance

    pkl_path = os.path.join(pkl_dir,f'{pkl_prefix}_{classifier_name}_direct_mtune_{opt_thres}.pkl')
    # Serialize the object and save it to a file
    with open(pkl_path, 'wb') as file:
        pickle.dump(direct_mtune_model, file)

    return performance_df

### Ensemble M-Tune

In [None]:
def perform_ensemble_mtune(classifier,classifier_name,X_train,y_train,performance_df,pkl_dir,pkl_prefix,
                           X_test=None,y_test=None):
    """Fit and evaluate the 'ensemble' M-Tune wrapper around ``classifier``.

    An ``mtune`` model with ``method='ensemble'`` is fitted on the training
    data, evaluated on the training and the test data, and pickled to
    ``<pkl_dir>/<pkl_prefix>_<classifier_name>_ensemble_mtune.pkl``. The
    ``threshold`` field of the performance record is the literal string
    'mean' because each ensemble member carries its own threshold.

    Parameters
    ----------
    classifier : estimator with ``fit`` / ``predict_proba``
    classifier_name : str
        Used in the ``method`` field and in the pickle file name.
    X_train, y_train : pandas.DataFrame, pandas.Series
        Training data.
    performance_df : pandas.DataFrame
        Modified in place: one new column per evaluated dataset.
    pkl_dir, pkl_prefix : str
        Output directory and file-name prefix for the pickled model.
    X_test, y_test : optional
        Test data. When omitted, the module-level ``X_test`` / ``y_test``
        variables are used (legacy behaviour).

    Returns
    -------
    pandas.DataFrame
        The updated ``performance_df``.
    """
    # Backward compatibility: earlier versions read the test set from globals.
    try:
        X_test = globals()['X_test'] if X_test is None else X_test
        y_test = globals()['y_test'] if y_test is None else y_test
    except KeyError as missing:
        raise NameError(f'{missing.args[0]} is not defined; pass X_test and y_test explicitly') from None

    ensemble_mtune_model = mtune( classifier, method='ensemble',random_state = 42)
    ensemble_mtune_model.fit(X_train,y_train)

    method_name = f'{classifier_name}_ensemble_mtune'

    for dataset, features, labels in (('training', X_train, y_train), ('testing', X_test, y_test)):
        y_pred = ensemble_mtune_model.predict(features).tolist()
        # Second predict_proba column is used as the score for ROC AUC.
        pred_prob = ensemble_mtune_model.predict_proba(features)[:,1].tolist()

        performance = dict(dataset = dataset, method = method_name)

        # Number of samples per class, keyed by class label.
        class_counts = labels.value_counts()
        performance.update({ idx : class_counts[idx] for idx in class_counts.index })

        performance.update(dict(threshold = 'mean'))
        performance.update(evaluate_performance(
            y_true = labels.tolist(),
            y_pred = y_pred,
            pred_prob = pred_prob
        ))
        # Append as a new column, labelled with the current column count.
        performance_df[performance_df.shape[1]] = performance

    pkl_path = os.path.join(pkl_dir,f'{pkl_prefix}_{classifier_name}_ensemble_mtune.pkl')
    # Serialize the object and save it to a file
    with open(pkl_path, 'wb') as file:
        pickle.dump(ensemble_mtune_model, file)

    return performance_df

## Function to run all Methods

In [None]:
def implement_all_methods(classifier, classifier_name, X_train, y_train, performance_df, pkl_dir, pkl_prefix):
    """Run the standard, GHOST, direct M-Tune and ensemble M-Tune evaluations in order.

    ``perform_standard`` runs first because it fits the classifier; the
    fitted classifier it returns is handed to the remaining three steps.
    Returns the performance table produced by the last step.
    """
    data_args = (classifier_name, X_train, y_train)
    output_args = (pkl_dir, pkl_prefix)

    fitted, results = perform_standard(classifier, *data_args, performance_df, *output_args)
    for step in (perform_ghost, perform_direct_mtune, perform_ensemble_mtune):
        results = step(fitted, *data_args, results, *output_args)

    return results

# Setting Directory

In [None]:
# Current working directory; the output folders below are created relative to it.
cwd = os.getcwd()

In [None]:
# Output locations: <cwd>/PERFORMANCE for the results,
# <cwd>/PERFORMANCE/pkl_models for the pickled models.
performance_dir = os.path.join(cwd, 'PERFORMANCE')
pkl_dir = os.path.join(performance_dir, 'pkl_models')

# Create both folders; already existing folders are left untouched.
os.makedirs(performance_dir, exist_ok=True)
os.makedirs(pkl_dir, exist_ok=True)

The fingerprint files for each cell line have to be organized in the following directory layout:  

cell_line_dir  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;BJeLR  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; AtomPairs2D.csv  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; SubstructureCount    
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;...  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...     
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;TGF-B  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; AtomPairs2D.csv  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; SubstructureCount  

In [None]:
# Root folder holding one sub-folder per cell line (placeholder path - set before running).
cell_line_dir = 'path/to/cell_line_dir'

# Keep only the entries of the root folder that are directories.
cell_lines = [
    entry
    for entry in os.listdir(cell_line_dir)
    if os.path.isdir(os.path.join(cell_line_dir, entry))
]
cell_lines

# Training

In [None]:
# Train and evaluate both classifiers on every fingerprint file of every cell line.
# Each per-file result table is collected in a list and concatenated once at the end.
performance_frames = []

for cell_line in cell_lines:
    print(cell_line)
    cell_line_fp_dir = os.path.join(cell_line_dir,cell_line)
    # sorted(): os.listdir returns entries in arbitrary order.
    for fp in sorted(os.listdir(cell_line_fp_dir)):
        fp_path = os.path.join(cell_line_fp_dir,fp)
        if not os.path.isfile(fp_path):
            # Sub-directories cannot be read as CSV; skip them.
            continue

        print('\t',fp)
        # Fingerprint name = file name up to the first '.' and the first '_'.
        processed_fp_name = fp.split('.')[0]
        processed_fp_name = processed_fp_name.split('_')[0]

        fp_df = pd.read_csv(fp_path)
        # The label column is handled under the name 'Name' from here on.
        fp_df = fp_df.rename(columns = {'Activity':'Name'} )

        fp_df = fp_df.sample(frac=1, random_state = seed )
        train_df,test_df = train_test_split(fp_df, test_size = 0.2, stratify = fp_df['Name'], random_state = seed )

        X_train = train_df.drop('Name', axis=1)
        y_train = train_df['Name']

        # X_test / y_test are read as module-level variables by the perform_* functions.
        X_test = test_df.drop('Name', axis=1)
        y_test = test_df['Name']

        # Fresh, unfitted models for every fingerprint file.
        xgb_model = XGBClassifier(**xgb_params)
        cat_model = CatBoostClassifier(**cat_params)

        # Filled in place by implement_all_methods (one column per evaluation).
        temp = pd.DataFrame()

        pkl_prefix = f'{cell_line}_{processed_fp_name}'
        implement_all_methods(xgb_model,'xgboost',X_train,y_train,temp, pkl_dir,pkl_prefix)
        implement_all_methods(cat_model,'catboost',X_train,y_train,temp, pkl_dir,pkl_prefix)

        temp.loc['cell_line'] = cell_line
        temp.loc['finger_print'] = processed_fp_name

        performance_frames.append(temp)

if performance_frames:
    performance_df = pd.concat(performance_frames, axis = 1 )
else:
    performance_df = pd.DataFrame()

# One row per evaluation, one column per metric.
performance_df = performance_df.transpose()

In [None]:
# Display the collected performance table (one row per evaluation after the transpose above).
performance_df

In [None]:
# Write the performance table to an Excel workbook inside the PERFORMANCE folder.
op_path = os.path.join(performance_dir,'CELL_LINE_PERFORMANCES_MTUNE.xlsx')
performance_df.to_excel(op_path)