# Notebook for HPV Triplicate Study

####################################################### <br>
    <font color='maroon'>
    Updated on Nov. 21, 2023 <br>
    <p><b>Author</b>:<br>
    Sambit K. Mishra <br> 
    Cancer Genomics Research Laboratory <br>
    Frederick National Laboratory <br>
    Division of Cancer Epidemiology and Genetics <br>
    National Cancer Institute </p>
    <font color='black'> <br>
####################################################### <br>

This notebook includes the steps involving the training and testing of <b> extreme gradient boosting (XGBoost) </b> machine learning models for predicting true low/intermediate-VAF <b> intrahost single nucleotide variants (iSNVs) </b> in HPV. The splitting of the datasets into training and testing is performed at the sample-level instead of the iSNV level. The data files used for model training and testing in this notebook were generated using an in-house variant calling pipeline and a set of in-house scripts that processed the VCF files and parsed the variants, their replicate frequency and all the 31 features into a csv file. The notebook is intended to serve as a reference for readers to understand how XGBoost model training and testing was performed in our study and, more importantly, for reproducibility of the study results.

We have included the training and testing steps for the 3 scenarios described in our study: <p>
> <p> <b> 1. Machine learning with VAF filters (FM models) </b>
> <p> <b> 2. Machine learning with VCFgenie without any low-VAF filters (VM models) </b> <p>
> <p> <b> 3. Machine learning with VCFgenie with low-VAF filters (FVM models) </b> <p>

## Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import math
import os
import matplotlib.pyplot as plt
import glob
import random
from scipy import stats
from pathlib import Path
import warnings
from itertools import product
import joblib
warnings.filterwarnings("ignore") # Will suppress any unnecessary warnings

# ML libraries
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef
from sklearn.metrics import mean_squared_error, cohen_kappa_score, make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score, ConfusionMatrixDisplay

## Define some custom functions

In [2]:
 # Plot settings: switch seaborn to its 'ticks' style
sns.set_style('ticks')

def remove_low_coverage_samples(df_snv_w_cov, min_cov_cutoff, coverage_column):
    """
    Keep only the iSNV rows whose coverage value reaches a minimum cutoff.

    The input dataframe is left untouched; the filtered rows are returned as
    a new dataframe with a fresh 0-based index.
        df_snv_w_cov: Dataframe holding all SNVs together with their features
        min_cov_cutoff: Smallest coverage value a row may have to be retained
                        (rows with coverage >= this value are kept)
        coverage_column: Name of the column carrying the coverage value
                         (FDP or FAO for Ion Torrent data)
    """
    passes_cutoff = df_snv_w_cov[coverage_column] >= min_cov_cutoff
    return df_snv_w_cov.loc[passes_cutoff].reset_index(drop=True)
    
def create_balanced_datasets(X_train, y_train, true_label=1, false_label=0):
    """
    Create balanced datasets for training. The test data will remain imbalanced.

    When the two classes are unequally represented, the over-represented class
    is repeatedly undersampled (without replacement) down to the size of the
    under-represented class. Each draw is combined with all rows of the
    under-represented class and shuffled to form one balanced training set.
        X_train : Dataframe with the training features (one row per variant).
                  It is not modified; a copy with a fresh 0-based index is used.
        y_train : Sequence of numeric labels aligned row-by-row with X_train
        true_label : The numeric label for the true variants
        false_label: The numeric label for the false variants

    Returns a tuple (train_samples, train_index_list):
        train_samples : Dictionary mapping each sampling seed (or the key '1'
                        when the input is already balanced) to a dictionary
                        with the keys 'X_train' and 'y_train'.
        train_index_list : Positional row indices (with respect to X_train) of
                        all rows used across the balanced datasets, duplicates
                        included. Empty when the input is already balanced.

    Raises ValueError when exactly one of the two classes has no rows, because
    a balanced dataset cannot be built in that case.
    """
    train_samples = {}  # Dictionary in which we will store the training samples
    train_index_list = []  # Keeps track of the indices of the sampled rows

    # Work on a re-indexed copy so that the caller's dataframe is not altered
    # and positional indices equal index labels.
    X_train = X_train.reset_index(drop=True)
    labels = list(y_train)

    num_true = labels.count(true_label)  # Number of true labels
    num_false = labels.count(false_label)  # Number of false labels

    if num_true == num_false:
        # Both classes are balanced: return the data as a single training set
        train_samples['1'] = {'X_train': X_train, 'y_train': y_train}
        return train_samples, train_index_list

    # Identify the over-represented class
    if num_true > num_false:
        over_rep_class, over_rep_count = true_label, num_true
        under_rep_class, under_rep_count = false_label, num_false
    else:
        over_rep_class, over_rep_count = false_label, num_false
        under_rep_class, under_rep_count = true_label, num_true

    if under_rep_count == 0:
        raise ValueError(
            f"Cannot create balanced training data: class {under_rep_class} "
            "has no rows in the training data."
        )

    # Progress information about the imbalance that is being corrected
    print ("\tTrain-test split resulted in imbalanced training data", flush=True)
    print (f"\tOver-represented class = {over_rep_class}, count = {over_rep_count}", flush=True)
    print (f"\tUnder-represented class = {under_rep_class}, count = {under_rep_count}", flush=True)
    print ("\tWill proceed by creating balanced training data.", flush=True)

    alpha = 5  # A sampling weight constant that determines the number of times
               # the over-represented class will be sampled.
    num_sample_iter = int(round(over_rep_count / under_rep_count) * alpha)
    sample_size = under_rep_count
    # One seed per sampling iteration, to make the results reproducible
    seed_list = range(1, num_sample_iter + 1)

    # Split the feature rows by class
    labels_arr = np.asarray(labels)
    X_train_over_rep = X_train.iloc[np.flatnonzero(labels_arr == over_rep_class)]
    X_train_under_rep = X_train.iloc[np.flatnonzero(labels_arr == under_rep_class)]

    # Perform sampling
    for seed_i in seed_list:
        X_train_sample_i_over_rep = X_train_over_rep.sample(
            n=sample_size, replace=False, random_state=seed_i)

        # Sampled over-represented rows first, then all under-represented rows.
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        X_train_sample_i = pd.concat([X_train_sample_i_over_rep, X_train_under_rep])
        train_index_list.extend(X_train_sample_i.index.tolist())

        # After the reset, row k of the frame corresponds to label k below
        X_train_sample_i = X_train_sample_i.reset_index(drop=True)
        y_train_sample_i = [over_rep_class] * sample_size + [under_rep_class] * sample_size

        # Shuffle the rows. Otherwise the top N rows will be over-rep class and
        # the bottom N rows will be under-rep class. The labels are reordered
        # with the shuffled index so that they stay aligned with the rows.
        X_train_sample_i_shuffled = X_train_sample_i.sample(frac=1, random_state=seed_i)
        y_train_sample_i_shuffled = [
            y_train_sample_i[ind_i] for ind_i in X_train_sample_i_shuffled.index
        ]
        train_samples[seed_i] = {
            'X_train': X_train_sample_i_shuffled,
            'y_train': y_train_sample_i_shuffled,
        }
    return train_samples, train_index_list
 
def get_ensemble_prediction(y_scores_all_models):
    """
    Derive one consensus label per testing point from the scores of all models.

    For every row, the median score across the model columns is rounded to
    2 decimals; a rounded median below 0.5 gives label 0, otherwise label 1.
    """
    per_row_median = y_scores_all_models.median(axis=1)
    return [0 if round(score_i, 2) < 0.5 else 1 for score_i in per_row_median]


def get_ensemble_prediction_scores(y_scores_all_models):
    """
    Compute, for every test data point (row), the median of the scores given
    by all the models (columns), rounded to 2 decimals, and return them as
    a list.
    """
    per_row_median = y_scores_all_models.median(axis=1)
    return [round(score_i, 2) for score_i in per_row_median]


## Performance without VCFgenie and with a lower and upper bound - <b> <font color='green'> FM models </font> </b>

### Define input and output files

In [10]:
## Make sure the results directory exists before anything is written to it
outdir = '../results/'
Path(outdir).mkdir(exist_ok=True, parents=True)

# CSV file with the SNVs and their features (input)
snv_sbs_file = '../data/SNV_data_wo_vcf_genie.csv'

# CSV file that will receive the performance metrics (output)
perf_out_file_xgb = f"{outdir}performance_FM.csv"

### Define parameters and conditions

In [11]:
# Lower VAF limits to evaluate
af_lower = [0.01, 0.02, 0.05, 0.1]

# A single, strict upper VAF limit of 50%: any SNV with VAF > 50% is a major
# variant and is excluded, so only SNVs with VAF < 50% are considered.
af_upper = [0.5]

# Names of the feature sets to evaluate
feature_cat = ['Moderate', 'Strict', 'Exhaustive']

# Replicate frequencies are expressed by their numerical values:
#   1    => replicate frequency 3/3
#   0.67 => replicate frequency 2/3
#   0.33 => replicate frequency 1/3
true_var = [[1], [1, 0.67]]
# Combinations with 0.67 in both true_var and false_var are to be skipped
false_var = [[0.33], [0.33, 0.67]]

### Define feature categories as a dictionary

In [12]:
# Feature columns belonging to each feature category.
# The order of the names within each list is kept as the column order.
feat_cat_dict = {
    # 20 features
    'Moderate': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB', 'FXX', 'GQ', 'MLLD',
        'QUAL', 'REFB', 'REVB', 'SAF', 'SAR', 'SRF', 'SRR', 'SSSB',
        'STB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    # 14 features
    'Strict': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB', 'FXX', 'MLLD', 'QUAL',
        'REFB', 'REVB', 'SSSB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    # 31 features
    'Exhaustive': [
        'AO', 'DP', 'FAO', 'FDP', 'FRO', 'FSAF', 'FSAR', 'FSRF',
        'FSRR', 'FWDB', 'FXX', 'GQ', 'HRUN', 'LEN', 'MLLD', 'QD',
        'QUAL', 'RBI', 'REFB', 'REVB', 'RO', 'SAF', 'SAR', 'SRF',
        'SRR', 'SSSB', 'STB', 'STBP', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
}


### Generate combinations of conditions

In [13]:
# Every combination of (lower VAF limit, upper VAF limit, feature category,
# true-variant definition, false-variant definition)
param_combinations = [
    *product(af_lower, af_upper, feature_cat, true_var, false_var)
]
print(param_combinations)
print(f"Total combinations = {len(param_combinations)}")

[(0.01, 0.5, 'Moderate', [1], [0.33]), (0.01, 0.5, 'Moderate', [1], [0.33, 0.67]), (0.01, 0.5, 'Moderate', [1, 0.67], [0.33]), (0.01, 0.5, 'Moderate', [1, 0.67], [0.33, 0.67]), (0.01, 0.5, 'Strict', [1], [0.33]), (0.01, 0.5, 'Strict', [1], [0.33, 0.67]), (0.01, 0.5, 'Strict', [1, 0.67], [0.33]), (0.01, 0.5, 'Strict', [1, 0.67], [0.33, 0.67]), (0.01, 0.5, 'Exhaustive', [1], [0.33]), (0.01, 0.5, 'Exhaustive', [1], [0.33, 0.67]), (0.01, 0.5, 'Exhaustive', [1, 0.67], [0.33]), (0.01, 0.5, 'Exhaustive', [1, 0.67], [0.33, 0.67]), (0.02, 0.5, 'Moderate', [1], [0.33]), (0.02, 0.5, 'Moderate', [1], [0.33, 0.67]), (0.02, 0.5, 'Moderate', [1, 0.67], [0.33]), (0.02, 0.5, 'Moderate', [1, 0.67], [0.33, 0.67]), (0.02, 0.5, 'Strict', [1], [0.33]), (0.02, 0.5, 'Strict', [1], [0.33, 0.67]), (0.02, 0.5, 'Strict', [1, 0.67], [0.33]), (0.02, 0.5, 'Strict', [1, 0.67], [0.33, 0.67]), (0.02, 0.5, 'Exhaustive', [1], [0.33]), (0.02, 0.5, 'Exhaustive', [1], [0.33, 0.67]), (0.02, 0.5, 'Exhaustive', [1, 0.67], [0.3

<b><i> Note that for some combinations in the above, the replicate frequency 0.67 (i.e., SNV occurring in 2/3 replicates) is used both for true variants and false variants. An example of such a combination is (0.1, 0.5, 'Exhaustive', [1, 0.67], [0.33, 0.67]). We will exclude such combinations in our analysis, which will result in 36 combinations (4x1x3x3) in total. </i></b>

### Run Extreme Gradient Boosting iteratively on these conditions

In [7]:
df_snv_sbs = pd.read_csv(snv_sbs_file)

# Create a column to include the sample id (excluding the well id)
df_snv_sbs['sample_id'] = df_snv_sbs['sample'].apply(lambda x: x.split('_')[0])

# The performance metrics of every seed and parameter combination end up here
df_perf_out_xgb = pd.DataFrame()
perf_frames_all_seeds = []

# Remove variants with FAO = 0
fao_cutoff = 1
df_snv_sbs_fao_filtered = remove_low_coverage_samples(df_snv_sbs, fao_cutoff, 'FAO')

# To eliminate the performance bias due to sampling, we will use a set of seeds to perform
# random sampling. Using just a single seed might bias the testing data sampled
# and a very good performance on just one iteration of testing dataset is highly indicative of
# performance bias.
# In each iteration, 25 samples will be randomly selected for training and 6 for testing.

# Changing to perform 50 iterations
seed_list = [10,20,3247,24,4501,25,79,299,1001,287,497,300,999,6,1,217,21,29,47,187,701,702,703,704,801,802,804,810,101,102,103,104,105,106,121,
             122,123,124,125,127,141,142,143,144,145,191,192,193,194,195]

# Columns of the performance table (one row per seed and parameter combination)
cols = ['Seed', 'True_Var_Def', 'False_Var_Def', 'VAF_Lower_Limit', 'VAF_Upper_Limit',  'Feature_Category',
        'Num_True_Var_Testing', 'Num_False_Var_Testing', 'TP', 'FP', 'TN', 'FN', 'AUC', 'MCC',
        'F1_score', 'Accuracy', 'MSE']
target_cols = 'OVERLAP_CATEGORY' # Response column


def _select_and_label_snvs(df_snvs, af_lower_i, af_upper_i, true_var_i, false_var_i):
    """
    Restrict df_snvs to one parameter combination and add the binary response.

    Keeps the SNVs with af_lower_i < AF < af_upper_i. When both variant
    definitions hold a single replicate frequency, only rows whose
    PERCENT_OVERLAP equals one of these two frequencies are kept. The column
    'OVERLAP_CATEGORY' is 1 for rows whose PERCENT_OVERLAP is listed in
    true_var_i (true variant) and 0 for all other rows (false variant).
    Returns a new dataframe with a fresh 0-based index.
    """
    df_comb = df_snvs.loc[(df_snvs['AF'] > af_lower_i) & (df_snvs['AF'] < af_upper_i)]
    if len(true_var_i) == 1 and len(false_var_i) == 1:
        df_comb = df_comb.loc[df_comb['PERCENT_OVERLAP'].isin([true_var_i[0], false_var_i[0]])]
    df_comb = df_comb.reset_index(drop=True)
    df_comb['OVERLAP_CATEGORY'] = df_comb['PERCENT_OVERLAP'].isin(true_var_i).astype(int).tolist()
    return df_comb


# Get all the sample IDs (identical for every seed)
all_sample_ids = df_snv_sbs_fao_filtered['sample_id'].drop_duplicates().tolist()

for seed_i in seed_list:
    # Set the random sampling seed
    random.seed(seed_i)

    # We will pick 6 testing samples and make sure snvs from these samples
    # are not included in the training set.
    testing_set_samples = random.sample(all_sample_ids, k=6)
    print (f"testing samples = {testing_set_samples}")
    is_testing_row = df_snv_sbs_fao_filtered['sample_id'].isin(testing_set_samples)
    df_snvs_testing = df_snv_sbs_fao_filtered.loc[is_testing_row].reset_index(drop=True)
    df_snvs_training = df_snv_sbs_fao_filtered.loc[~is_testing_row].reset_index(drop=True)

    print (f"Running predictions for all parameter/feature combinations with seed {seed_i} ...", end="", flush=True)
    perf_frames_seed_i = []
    for comb_i in param_combinations:
        af_lower_i, af_upper_i, feat_cat_i, true_var_i, false_var_i = comb_i

        # Comment out this print statement to reduce the amount of output text in the notebook
        print (f"Evaluating with parameters: af_lower_i={af_lower_i}, af_upper_i={af_upper_i}, feat_cat_i={feat_cat_i}, true_var_i={true_var_i}, false_var_i={false_var_i}")

        # If the 2/3 variant is included in both the true variant and false variant
        # then exclude that combination
        if 0.67 in true_var_i and 0.67 in false_var_i:
            continue
        feature_cols = feat_cat_dict[feat_cat_i]

        # Prepare training data for this parameters combination
        df_snvs_training_comb_i = _select_and_label_snvs(
            df_snvs_training, af_lower_i, af_upper_i, true_var_i, false_var_i)

        # Define X and y for training
        X = df_snvs_training_comb_i[feature_cols]
        y = df_snvs_training_comb_i[target_cols].tolist()

        # Create balanced datasets
        train_sample, train_index_list = create_balanced_datasets(X,y)

        # Prepare the testing dataset for the current parameters combination
        df_snvs_testing_comb_i = _select_and_label_snvs(
            df_snvs_testing, af_lower_i, af_upper_i, true_var_i, false_var_i)

        # Define X and y for testing
        X_test = df_snvs_testing_comb_i[feature_cols]
        y_test = df_snvs_testing_comb_i[target_cols].tolist()

        # AUC (and the other metrics) can only be calculated if y_test has
        # both true and false iSNV labels; roc_auc_score returns error otherwise.
        has_both_classes = len(set(y_test)) == 2

        # Train XGB model
        num_jobs = 6 # NOTE: currently not passed to XGBClassifier
        n_trees = 100
        mdepth = 10
        xgb_model = XGBClassifier(use_label_encoder=False,
                              booster='gbtree', # boosting algorithm to use, default gbtree, others: gblinear, dart
                              n_estimators=n_trees, # number of trees, default = 100
                              eta=0.2, # this is learning rate, default = 0.3
                              max_depth=mdepth, # maximum depth of the tree, default = 6
                              gamma = 1, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                              reg_lambda = 1, # regularization parameter, default = 1
                              eval_metric = 'logloss')
        fpr_tpr = [] # Store all the fpr, tpr values for each model
        auc_scores = []
        df_y_pred_all_models = pd.DataFrame()
        df_y_scores_all_models = pd.DataFrame()
        df_all_cv_scores = pd.DataFrame()
        df_feature_importances_all_models = pd.DataFrame()
        for model_i in train_sample:
            X_train_model_i = train_sample[model_i]['X_train'].apply(pd.to_numeric)
            y_train_model_i = train_sample[model_i]['y_train']
            xgb_model.fit(X_train_model_i,y_train_model_i)

            # Predict using model_i on the test data
            y_pred = xgb_model.predict(X_test)
            df_y_pred_all_models[f"Model_{model_i}"] = y_pred
            y_test_pred_scores = xgb_model.predict_proba(X_test)[:, 1]
            df_y_scores_all_models[f"Model_{model_i}"] = y_test_pred_scores
            false_positive, true_positive, _ = roc_curve(y_test, y_test_pred_scores)
            fpr_tpr.append([false_positive,true_positive])
            # NOTE(review): the AUC here is computed from the predicted labels,
            # not from the predicted scores; kept as is so that the reported
            # numbers stay reproducible.
            if has_both_classes:
                auc_test = roc_auc_score(y_test, y_pred)
            else:
                auc_test = math.nan
            auc_scores.append(auc_test)

        # Get the consensus predictions from all models. This is done for every
        # combination (also when y_test has a single class), so that the
        # confusion counts below never rely on the predictions of a previous
        # combination.
        y_pred_consensus = get_ensemble_prediction(df_y_scores_all_models)

        # Indices of the false positives, true positives, true negatives and
        # false negatives of the consensus prediction
        y_test_arr = np.array(y_test)
        y_pred_consensus_arr = np.array(y_pred_consensus)
        false_positive_ind = np.where((y_test_arr == 0) & (y_pred_consensus_arr == 1))[0].tolist()
        true_positive_ind = np.where((y_test_arr == 1) & (y_pred_consensus_arr == 1))[0].tolist()
        true_negative_ind = np.where((y_test_arr == 0) & (y_pred_consensus_arr == 0))[0].tolist()
        false_negative_ind = np.where((y_test_arr == 1) & (y_pred_consensus_arr == 0))[0].tolist()

        fp = len(false_positive_ind)
        tp = len(true_positive_ind)
        tn = len(true_negative_ind)
        fn = len(false_negative_ind)

        if has_both_classes:
            # Calculate metrics on the consensus prediction.
            # NOTE(review): as for the individual models, the AUC is computed
            # from the consensus labels rather than from the median scores.
            auc_test = roc_auc_score(y_test, y_pred_consensus)

            # Calculate the F1 score
            F1_score_test = round(f1_score(y_test, y_pred_consensus), 3)

            # Calculate the mean-squared error
            mse_test = round(mean_squared_error(y_test, y_pred_consensus), 3)

            # Calculate the accuracy
            accuracy_test = round(accuracy_score(y_test, y_pred_consensus), 3)

            # Calculate the MCC
            mcc_test = round(matthews_corrcoef(y_test, y_pred_consensus), 3)

            # Print the performance metrics on test data
            print ("AUC =", round(auc_test, 3), ", F1 score =", F1_score_test, ", Mean-squared error =", mse_test, ", Accuracy =", accuracy_test, ", Matthews correlation coefficient =", mcc_test)

            # Get the confusion matrix
            cm = confusion_matrix(y_test, y_pred_consensus)
        else:
            # The metrics are undefined when only one class is present
            auc_test = math.nan
            mcc_test = math.nan
            F1_score_test = math.nan
            accuracy_test = math.nan
            mse_test = math.nan

        # One row of the performance table. The true/false variant definitions
        # are stored as the lists themselves.
        df_tmp = pd.DataFrame(data=[[seed_i, true_var_i, false_var_i,af_lower_i,af_upper_i,feat_cat_i,
                                     tp+fn, tn+fp, tp, fp, tn, fn, auc_test, mcc_test, F1_score_test,
                                     accuracy_test, mse_test]], columns=cols)
        perf_frames_seed_i.append(df_tmp)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    df_perf_seed_i = pd.concat(perf_frames_seed_i) if perf_frames_seed_i else pd.DataFrame()
    if perf_frames_seed_i:
        perf_frames_all_seeds.append(df_perf_seed_i)
    print ("Done!")

if perf_frames_all_seeds:
    df_perf_out_xgb = pd.concat(perf_frames_all_seeds)
df_perf_out_xgb.to_csv(perf_out_file_xgb, index=False)
df_perf_out_xgb

testing samples = ['sample24', 'sample5', 'sample10', 'sample15', 'sample1', 'sample31']
Running predictions for all parameter/feature combinations with seed 10 ...Evaluating with parameters: af_lower_i=0.01, af_upper_i=0.5, feat_cat_i=Moderate, true_var_i=[1], false_var_i=[0.33]
	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 2930
	Under-represented class = 1, count = 377
	Will proceed by creating balanced training data.
AUC = 0.785 , F1 score = 0.441 , Mean-squared error = 0.174 , Accuracy = 0.826 , Matthews correlation coefficient = 0.402
Evaluating with parameters: af_lower_i=0.01, af_upper_i=0.5, feat_cat_i=Moderate, true_var_i=[1], false_var_i=[0.33, 0.67]
	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 3412
	Under-represented class = 1, count = 377
	Will proceed by creating balanced training data.
AUC = 0.763 , F1 score = 0.389 , Mean-squared error = 0.19 , Accuracy = 0.81 , Matthews correlat

Unnamed: 0,Seed,True_Var_Def,False_Var_Def,VAF_Lower_Limit,VAF_Upper_Limit,Feature_Category,Num_True_Var_Testing,Num_False_Var_Testing,TP,FP,TN,FN,AUC,MCC,F1_score,Accuracy,MSE
0,10,[1],[0.33],0.01,0.5,Moderate,68,662,50,109,553,18,0.785321,0.402,0.441,0.826,0.174
0,10,[1],"[0.33, 0.67]",0.01,0.5,Moderate,68,726,48,131,595,20,0.762721,0.352,0.389,0.810,0.190
0,10,"[1, 0.67]",[0.33],0.01,0.5,Moderate,132,662,81,165,497,51,0.682196,0.293,0.429,0.728,0.272
0,10,[1],[0.33],0.01,0.5,Strict,68,662,53,105,557,15,0.810401,0.438,0.469,0.836,0.164
0,10,[1],"[0.33, 0.67]",0.01,0.5,Strict,68,726,51,135,591,17,0.782025,0.373,0.402,0.809,0.191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,195,[1],"[0.33, 0.67]",0.10,0.5,Strict,9,109,2,73,36,7,0.276249,-0.247,0.048,0.322,0.678
0,195,"[1, 0.67]",[0.33],0.10,0.5,Strict,15,103,9,59,44,6,0.513592,0.018,0.217,0.449,0.551
0,195,[1],[0.33],0.10,0.5,Exhaustive,9,103,4,62,41,5,0.421251,-0.087,0.107,0.402,0.598
0,195,[1],"[0.33, 0.67]",0.10,0.5,Exhaustive,9,109,3,60,49,6,0.391437,-0.116,0.083,0.441,0.559


## Performance with results from VCFgenie, but without a lower bound on VAF - <b> <font color='orange'> VM models </font> </b>

### Define input and output files

In [8]:
# CSV file with the SNVs and their features (input)
snv_sbs_file = '../data/SNV_data_with_vcf_genie.csv'
# CSV file that will receive the performance metrics (output)
perf_out_file_xgb = f"{outdir}performance_VM.csv"

### Define parameters and conditions

In [9]:
# Upper VAF limit
af_upper = [0.5]
# Names of the feature sets to evaluate
feature_cat = ['Moderate', 'Strict', 'Exhaustive']
# Replicate frequencies that define the true and the false variants.
# Combinations with 0.67 in both true_var and false_var are to be skipped.
true_var = [[1], [1, 0.67]]
false_var = [[0.33], [0.33, 0.67]]


### Define feature categories as a dictionary

In [10]:
# Feature columns belonging to each feature category.
# The order of the names within each list is kept as the column order.
feat_cat_dict = {
    # 20 features
    'Moderate': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB', 'FXX', 'GQ', 'MLLD',
        'QUAL', 'REFB', 'REVB', 'SAF', 'SAR', 'SRF', 'SRR', 'SSSB',
        'STB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    # 14 features
    'Strict': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB', 'FXX', 'MLLD', 'QUAL',
        'REFB', 'REVB', 'SSSB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    # 31 features
    'Exhaustive': [
        'AO', 'DP', 'FAO', 'FDP', 'FRO', 'FSAF', 'FSAR', 'FSRF',
        'FSRR', 'FWDB', 'FXX', 'GQ', 'HRUN', 'LEN', 'MLLD', 'QD',
        'QUAL', 'RBI', 'REFB', 'REVB', 'RO', 'SAF', 'SAR', 'SRF',
        'SRR', 'SSSB', 'STB', 'STBP', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
}


### Generate combinations of conditions

In [11]:
# Every combination of (upper VAF limit, feature category,
# true-variant definition, false-variant definition)
param_combinations = [
    *product(af_upper, feature_cat, true_var, false_var)
]
print(param_combinations)
print(f"Total combinations = {len(param_combinations)}")

[(0.5, 'Moderate', [1], [0.33]), (0.5, 'Moderate', [1], [0.33, 0.67]), (0.5, 'Moderate', [1, 0.67], [0.33]), (0.5, 'Moderate', [1, 0.67], [0.33, 0.67]), (0.5, 'Strict', [1], [0.33]), (0.5, 'Strict', [1], [0.33, 0.67]), (0.5, 'Strict', [1, 0.67], [0.33]), (0.5, 'Strict', [1, 0.67], [0.33, 0.67]), (0.5, 'Exhaustive', [1], [0.33]), (0.5, 'Exhaustive', [1], [0.33, 0.67]), (0.5, 'Exhaustive', [1, 0.67], [0.33]), (0.5, 'Exhaustive', [1, 0.67], [0.33, 0.67])]
Total combinations = 12


### Run Extreme Gradient Boosting iteratively on these conditions

In [12]:
df_snv_sbs = pd.read_csv(snv_sbs_file)

# The sample id is the part of the 'sample' value before the first underscore
# (i.e., the well id is excluded)
df_snv_sbs['sample_id'] = df_snv_sbs['sample'].apply(lambda well_name: well_name.split('_')[0])

# Load the sample selection list, discard its 'Sample_ID' column and expose
# 'Sample_ID_dummy' under the name 'sample_id', the key used for merging
df_sample_mut_cat = pd.read_csv('../data/sample_selection_list.csv')
df_sample_mut_cat = df_sample_mut_cat.drop(columns=['Sample_ID'])
df_sample_mut_cat = df_sample_mut_cat.rename(columns={'Sample_ID_dummy': 'sample_id'})

# Attach the columns of the sample selection list to every SNV row
df_snv_sbs = pd.merge(df_snv_sbs, df_sample_mut_cat, on='sample_id', how='left')

# Will collect the performance metrics
df_perf_out_xgb = pd.DataFrame()

# Keep only the variants with FAO >= 1, i.e. drop variants with FAO = 0
fao_cutoff = 1
df_snv_sbs_fao_filtered = remove_low_coverage_samples(df_snv_sbs, fao_cutoff, 'FAO')

# A set of seeds is used for the random sampling in order to eliminate the
# performance bias due to sampling: with a single seed the sampled testing
# data might be biased, and a very good performance on just one testing
# dataset is highly indicative of performance bias.
# In each iteration, 25 samples will be randomly selected for training and 6 for testing.

# 50 seeds => 50 iterations
seed_list = [
    10, 20, 3247, 24, 4501, 25, 79, 299, 1001, 287,
    497, 300, 999, 6, 1, 217, 21, 29, 47, 187,
    701, 702, 703, 704, 801, 802, 804, 810, 101, 102,
    103, 104, 105, 106, 121, 122, 123, 124, 125, 127,
    141, 142, 143, 144, 145, 191, 192, 193, 194, 195,
]


for seed_i in seed_list:
    # Randomly select 6 samples for testing 
    # Get all the sample IDs
    all_sample_ids = df_snv_sbs_fao_filtered['sample_id'].drop_duplicates().tolist()

    # Set the random sampling seed
    random.seed(seed_i)

    # We will pick 6 samples for testing
    testing_set_samples = random.sample(all_sample_ids, k=6)
    print (f"testing samples = {testing_set_samples}")
    df_snvs_testing = df_snv_sbs_fao_filtered.loc[df_snv_sbs_fao_filtered['sample_id'].isin(testing_set_samples)]
    df_snvs_training = df_snv_sbs_fao_filtered.loc[~df_snv_sbs_fao_filtered['sample_id'].isin(testing_set_samples)]
    df_snvs_testing.reset_index(drop=True, inplace=True)
    df_snvs_training.reset_index(drop=True, inplace=True)

    print (f"Running predictions for all parameter/feature combinations with seed {seed_i}...", end="", flush=True)
    df_perf_seed_i = pd.DataFrame()
    for comb_i in param_combinations:
        af_upper_i, feat_cat_i, true_var_i, false_var_i = comb_i
        print (f"Evaluating with parameters: af_upper_i={af_upper_i}, feat_cat_i={feat_cat_i}, true_var_i={true_var_i}, false_var_i={false_var_i}")

        # If the 2/3 variant is included in both the true variant and false variant
        # then exclude that combination
        if 0.67 in true_var_i and 0.67 in false_var_i:
            continue
        feature_cols = feat_cat_dict[feat_cat_i]

        # Prepare training data for this parameters combination
        df_snvs_training_comb_i = df_snvs_training.copy()
        df_snvs_training_comb_i = df_snvs_training_comb_i.loc[df_snvs_training_comb_i['AF'] < af_upper_i]
        if len(true_var_i) == 1 and len(false_var_i) == 1:
            df_snvs_training_comb_i = df_snvs_training_comb_i.loc[(df_snvs_training_comb_i['PERCENT_OVERLAP'] == true_var_i[0]) |
                                                (df_snvs_training_comb_i['PERCENT_OVERLAP'] == false_var_i[0])]

        df_snvs_training_comb_i.reset_index(drop=True, inplace=True)
        if len(true_var_i) == 1:
            overlap_cat = list(df_snvs_training_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if (x == true_var_i[0]) else 0 ))
        else:
            overlap_cat = list(df_snvs_training_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if (x == true_var_i[0] or x == true_var_i[1]) else 0 ))

        # Create a binary response column to identify a variant as true variant (1) or false variant (0)
        df_snvs_training_comb_i['OVERLAP_CATEGORY'] = overlap_cat
        target_cols = 'OVERLAP_CATEGORY' # Response column 

        # Define X and y for training
        X = df_snvs_training_comb_i[feature_cols]
        y = df_snvs_training_comb_i[target_cols].tolist()

        # Create balanced datasets
        train_sample, train_index_list = create_balanced_datasets(X,y)

        # Prepare the testing dataset for the current paramters combination
        df_snvs_testing_comb_i = df_snvs_testing.copy()
        df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[df_snvs_testing_comb_i['AF'] < af_upper_i]
        if len(true_var_i) == 1 and len(false_var_i) == 1:
            df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[(df_snvs_testing_comb_i['PERCENT_OVERLAP'] == true_var_i[0]) |
                                                (df_snvs_testing_comb_i['PERCENT_OVERLAP'] == false_var_i[0])]

        df_snvs_testing_comb_i.reset_index(drop=True, inplace=True)
        if len(true_var_i) == 1:
            overlap_cat = list(df_snvs_testing_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if (x == true_var_i[0]) else 0 ))
        else:
            overlap_cat = list(df_snvs_testing_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if (x == true_var_i[0] or x == true_var_i[1]) else 0 ))

        # Create a binary response column to identify a variant as true variant (1) or false variant (0)
        df_snvs_testing_comb_i['OVERLAP_CATEGORY'] = overlap_cat
        target_cols = 'OVERLAP_CATEGORY' # Response column 

        # Define X and y for testing
        X_test = df_snvs_testing_comb_i[feature_cols]
        y_test = df_snvs_testing_comb_i[target_cols].tolist()

        # Train XGB model
        num_jobs = 6
        n_trees = 100
        mdepth = 10
        xgb_model = XGBClassifier(use_label_encoder=False, 
                              booster='gbtree', # boosting algorithm to use, default gbtree, othera: gblinear, dart
                              n_estimators=n_trees, # number of trees, default = 100
                              eta=0.2, # this is learning rate, default = 0.3
                              max_depth=mdepth, # maximum depth of the tree, default = 6
                              gamma = 1, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                              reg_lambda = 1, # regularization parameter, defautl = 1
                              eval_metric = 'logloss')
        fpr_tpr = [] # Store all the fpr, tpr values for each model 
        auc_scores = []
        df_y_pred_all_models = pd.DataFrame()
        df_y_scores_all_models = pd.DataFrame()
        df_all_cv_scores = pd.DataFrame()
        df_feature_importances_all_models = pd.DataFrame()
        for model_i in list(train_sample.keys()):
            X_train_model_i = train_sample[model_i]['X_train']
            y_train_model_i = train_sample[model_i]['y_train']
    #         print (f"Running CV and predictions for model {model_i}...", end='', flush=True)
            X_train_model_i = X_train_model_i.apply(pd.to_numeric)
            xgb_model.fit(X_train_model_i,y_train_model_i)

            # Predict using model_i on the test data
            y_pred = xgb_model.predict(X_test)
            df_y_pred_all_models[f"Model_{model_i}"] = y_pred
            y_test_pred_scores = xgb_model.predict_proba(X_test)[:, 1]
            df_y_scores_all_models[f"Model_{model_i}"] = y_test_pred_scores
            false_positive, true_positive, _ = roc_curve(y_test, y_test_pred_scores)
            fpr_tpr.append([false_positive,true_positive])
            # Calculate ROC only if both true and false labels are present.
            # Otherwise, the function roc_auc_score returns error.
            if len(set(y_test)) == 2:
                auc_test = roc_auc_score(y_test, y_pred)
            else:
                auc_test = math.nan
            auc_scores.append(auc_test)
        # Calculate metrics only if y_test has both true and false iSNVs labels.
        # Otherwise, the AUC cannot be calculated as the function roc_auc_score returns error.
        if len(set(y_test)) == 2:
            # Get the consensus predictions from all models
            y_pred_consensus = get_ensemble_prediction(df_y_scores_all_models)

            # Calculate metrics on the consensus prediction
            auc_test = roc_auc_score(y_test, y_pred_consensus)

            # Calculate the F1 score
            F1_score_test = round(f1_score(y_test, y_pred_consensus), 3)

            # Calculate the mean-squared error
            mse_test = round(mean_squared_error(y_test, y_pred_consensus), 3)

            # Calculate the accuracy
            accuracy_test = round(accuracy_score(y_test, y_pred_consensus), 3)

            # Calculate the MCC
            mcc_test = round(matthews_corrcoef(y_test, y_pred_consensus), 3)

            # Print the performance metrics on test data
            # Uncomment to see the metrics for each combination
            print ("AUC =", round(auc_test, 3), ", F1 score =", F1_score_test, ", Mean-squared error =", mse_test, ", Accuracy =", accuracy_test, ", Matthews correlation coefficient =", mcc_test)

            # Get the confusion matrix
            cm = confusion_matrix(y_test, y_pred_consensus)

            # Get the indices of false positives
            false_positive_ind = list(set(np.where(np.array(y_test) == 0)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 1)[0])))
            # Get the indices of true positives
            true_positive_ind = list(set(np.where(np.array(y_test) == 1)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 1)[0])))
            # Get the indices of true negatives
            true_negative_ind = list(set(np.where(np.array(y_test) == 0)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 0)[0])))
            # Get the indices of false negatives
            false_negative_ind = list(set(np.where(np.array(y_test) == 1)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 0)[0])))

            fp = len(false_positive_ind)
            tp = len(true_positive_ind)
            tn = len(true_negative_ind)
            fn = len(false_negative_ind)
        else:
            # Get the indices of false positives
            false_positive_ind = list(set(np.where(np.array(y_test) == 0)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 1)[0])))
            # Get the indices of true positives
            true_positive_ind = list(set(np.where(np.array(y_test) == 1)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 1)[0])))
            # Get the indices of true negatives
            true_negative_ind = list(set(np.where(np.array(y_test) == 0)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 0)[0])))
            # Get the indices of false negatives
            false_negative_ind = list(set(np.where(np.array(y_test) == 1)[0]).intersection(set(np.where(np.array(y_pred_consensus) == 0)[0])))

            fp = len(false_positive_ind)
            tp = len(true_positive_ind)
            tn = len(true_negative_ind)
            fn = len(false_negative_ind)
            auc_test = math.nan
            mcc_test = math.nan
            F1_score_test = math.nan
            accuracy_test = math.nan
            mse_test = math.nan

        cols = ['Seed', 'True_Var_Def', 'False_Var_Def', 'VAF_Lower_Limit', 'VAF_Upper_Limit',  'Feature_Category',
                'Num_True_Var_Testing', 'Num_False_Var_Testing', 'TP', 'FP', 'TN', 'FN', 'AUC', 'MCC', 
                'F1_score', 'Accuracy', 'MSE']

        # Add an additional space before the true var def and false var def. Otherwise, excel formats it as date.
        df_tmp = pd.DataFrame(data=[[seed_i, true_var_i, false_var_i,math.nan,af_upper_i,feat_cat_i, 
                                     tp+fn, tn+fp, tp, fp, tn, fn, auc_test, mcc_test, F1_score_test,
                                     accuracy_test, mse_test]], columns=cols)
        df_perf_seed_i = df_perf_seed_i.append(df_tmp)
        #break
    df_perf_out_xgb = df_perf_out_xgb.append(df_perf_seed_i)
    print ("Done!")
df_perf_out_xgb.to_csv(perf_out_file_xgb, index=False)
df_perf_out_xgb 

testing samples = ['sample24', 'sample5', 'sample10', 'sample15', 'sample1', 'sample31']
Running predictions for all parameter/feature combinations with seed 10...Evaluating with parameters: af_upper_i=0.5, feat_cat_i=Moderate, true_var_i=[1], false_var_i=[0.33]
	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 2924
	Under-represented class = 1, count = 395
	Will proceed by creating balanced training data.
AUC = 0.782 , F1 score = 0.467 , Mean-squared error = 0.156 , Accuracy = 0.844 , Matthews correlation coefficient = 0.421
Evaluating with parameters: af_upper_i=0.5, feat_cat_i=Moderate, true_var_i=[1], false_var_i=[0.33, 0.67]
	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 3407
	Under-represented class = 1, count = 395
	Will proceed by creating balanced training data.
AUC = 0.755 , F1 score = 0.4 , Mean-squared error = 0.181 , Accuracy = 0.819 , Matthews correlation coefficient = 0.355
Evaluating 

Unnamed: 0,Seed,True_Var_Def,False_Var_Def,VAF_Lower_Limit,VAF_Upper_Limit,Feature_Category,Num_True_Var_Testing,Num_False_Var_Testing,TP,FP,TN,FN,AUC,MCC,F1_score,Accuracy,MSE
0,10,[1],[0.33],,0.5,Moderate,71,661,50,93,568,21,0.781765,0.421,0.467,0.844,0.156
0,10,[1],"[0.33, 0.67]",,0.5,Moderate,71,725,48,121,604,23,0.754580,0.355,0.400,0.819,0.181
0,10,"[1, 0.67]",[0.33],,0.5,Moderate,135,661,83,155,506,52,0.690161,0.312,0.445,0.740,0.260
0,10,[1],[0.33],,0.5,Strict,71,661,54,101,560,17,0.803882,0.440,0.478,0.839,0.161
0,10,[1],"[0.33, 0.67]",,0.5,Strict,71,725,53,126,599,18,0.786343,0.391,0.424,0.819,0.181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,195,[1],"[0.33, 0.67]",,0.5,Strict,71,443,51,166,277,20,0.671796,0.240,0.354,0.638,0.362
0,195,"[1, 0.67]",[0.33],,0.5,Strict,92,422,72,179,243,20,0.679219,0.275,0.420,0.613,0.387
0,195,[1],[0.33],,0.5,Exhaustive,71,422,52,152,270,19,0.686102,0.265,0.378,0.653,0.347
0,195,[1],"[0.33, 0.67]",,0.5,Exhaustive,71,443,46,157,286,25,0.646743,0.207,0.336,0.646,0.354


## Performance with results from VCFgenie and with lower and upper VAF cutoffs - <b><font color='royalblue'>FVM models</font></b>

### Define input and output files

In [13]:
# Directory that receives the performance table
outdir = '../results/'

# Input: the SNV .csv file (the "with VCFgenie" data set, going by its name)
snv_sbs_file = '../data/SNV_data_with_vcf_genie.csv'

# Output: per-seed, per-combination performance of the FVM models
perf_out_file_xgb = f"{outdir}performance_FVM.csv"

### Define parameter and conditions

In [14]:
# VAF window: several lower cutoffs are scanned against a single upper cutoff
af_lower = [0.01, 0.02, 0.05, 0.1]
af_upper = [0.5]

# Names of the feature sets to evaluate
feature_cat = ['Moderate', 'Strict', 'Exhaustive']

# Candidate definitions of true and false variants.
# Skip a combination if 0.67 is in both true_var and false_var.
true_var = [[1], [1, 0.67]]
false_var = [[0.33], [0.33, 0.67]]

### Define feature categories as a dictionary

In [15]:
# Feature columns used for model training, keyed by feature-category name.
# The order of the names inside each list is kept exactly as defined.
feat_cat_dict = {
    'Moderate': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB',
        'FXX', 'GQ', 'MLLD', 'QUAL', 'REFB',
        'REVB', 'SAF', 'SAR', 'SRF', 'SRR',
        'SSSB', 'STB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    'Strict': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB',
        'FXX', 'MLLD', 'QUAL', 'REFB', 'REVB',
        'SSSB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    'Exhaustive': [
        'AO', 'DP', 'FAO', 'FDP', 'FRO',
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB',
        'FXX', 'GQ', 'HRUN', 'LEN', 'MLLD',
        'QD', 'QUAL', 'RBI', 'REFB', 'REVB',
        'RO', 'SAF', 'SAR', 'SRF', 'SRR',
        'SSSB', 'STB', 'STBP', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
}


### Generate combinations of conditions

In [16]:
# Cartesian product of all parameter lists; each entry is a tuple
# (af_lower, af_upper, feature_cat, true_var, false_var).
param_combinations = [*product(af_lower, af_upper, feature_cat, true_var, false_var)]
print (param_combinations)
print (f"Total combinations = {len(param_combinations)}")

[(0.01, 0.5, 'Moderate', [1], [0.33]), (0.01, 0.5, 'Moderate', [1], [0.33, 0.67]), (0.01, 0.5, 'Moderate', [1, 0.67], [0.33]), (0.01, 0.5, 'Moderate', [1, 0.67], [0.33, 0.67]), (0.01, 0.5, 'Strict', [1], [0.33]), (0.01, 0.5, 'Strict', [1], [0.33, 0.67]), (0.01, 0.5, 'Strict', [1, 0.67], [0.33]), (0.01, 0.5, 'Strict', [1, 0.67], [0.33, 0.67]), (0.01, 0.5, 'Exhaustive', [1], [0.33]), (0.01, 0.5, 'Exhaustive', [1], [0.33, 0.67]), (0.01, 0.5, 'Exhaustive', [1, 0.67], [0.33]), (0.01, 0.5, 'Exhaustive', [1, 0.67], [0.33, 0.67]), (0.02, 0.5, 'Moderate', [1], [0.33]), (0.02, 0.5, 'Moderate', [1], [0.33, 0.67]), (0.02, 0.5, 'Moderate', [1, 0.67], [0.33]), (0.02, 0.5, 'Moderate', [1, 0.67], [0.33, 0.67]), (0.02, 0.5, 'Strict', [1], [0.33]), (0.02, 0.5, 'Strict', [1], [0.33, 0.67]), (0.02, 0.5, 'Strict', [1, 0.67], [0.33]), (0.02, 0.5, 'Strict', [1, 0.67], [0.33, 0.67]), (0.02, 0.5, 'Exhaustive', [1], [0.33]), (0.02, 0.5, 'Exhaustive', [1], [0.33, 0.67]), (0.02, 0.5, 'Exhaustive', [1, 0.67], [0.3

### Run Extreme Gradient Boosting (XGBoost) iteratively under these conditions

In [17]:
df_snv_sbs = pd.read_csv(snv_sbs_file)

# Create a column to include the sample id (excluding the well id)
df_snv_sbs['sample_id'] = df_snv_sbs['sample'].apply(lambda x: x.split('_')[0])

df_perf_out_xgb = pd.DataFrame()

# Remove variants with FAO = 0
fao_cutoff = 1
df_snv_sbs_fao_filtered = remove_low_coverage_samples(df_snv_sbs, fao_cutoff, 'FAO')

# To eliminate the performance bias due to sampling, we will use a set of seeds to perform
# random sampling. Using just a single seed might bias the testing data sampled
# and a very good performance on just one iteration of testing dataset is highly indicative of
# performance bias.
# In each iteration, 25 samples will be randomly selected for training and 6 for testing.

# Changing to perform 50 iterations
seed_list = [10,20,3247,24,4501,25,79,299,1001,287,497,300,999,6,1,217,21,29,47,187,701,702,703,704,801,802,804,810,101,102,103,104,105,106,121,
             122,123,124,125,127,141,142,143,144,145,191,192,193,194,195]

# The sample IDs do not depend on the seed, so collect them once up front.
all_sample_ids = df_snv_sbs_fao_filtered['sample_id'].drop_duplicates().tolist()

# Columns of the performance table: one row per seed and parameter/feature combination.
cols = ['Seed', 'True_Var_Def', 'False_Var_Def', 'VAF_Lower_Limit', 'VAF_Upper_Limit',  'Feature_Category',
        'Num_True_Var_Testing', 'Num_False_Var_Testing', 'TP', 'FP', 'TN', 'FN', 'AUC', 'MCC',
        'F1_score', 'Accuracy', 'MSE']

for seed_i in seed_list:
    # Set the random sampling seed so that the train/test split is reproducible
    random.seed(seed_i)

    # Randomly pick 6 samples for testing. The split is made per sample, so no SNV
    # of a testing sample ends up in the training data.
    testing_set_samples = random.sample(all_sample_ids, k=6)

    print (f"testing samples = {testing_set_samples}")
    in_testing_set = df_snv_sbs_fao_filtered['sample_id'].isin(testing_set_samples)
    df_snvs_testing = df_snv_sbs_fao_filtered.loc[in_testing_set].reset_index(drop=True)
    df_snvs_training = df_snv_sbs_fao_filtered.loc[~in_testing_set].reset_index(drop=True)

    print (f"Running predictions for all parameter/feature combinations with seed {seed_i}...", end="", flush=True)
    df_perf_seed_i = pd.DataFrame()
    for comb_i in param_combinations:
        af_lower_i, af_upper_i, feat_cat_i, true_var_i, false_var_i = comb_i
        print (f"Evaluating with parameters: af_lower_i={af_lower_i}, af_upper_i={af_upper_i}, feat_cat_i={feat_cat_i}, true_var_i={true_var_i}, false_var_i={false_var_i}")

        # If the 2/3 variant is included in both the true variant and false variant
        # then exclude that combination
        if 0.67 in true_var_i and 0.67 in false_var_i:
            continue
        feature_cols = feat_cat_dict[feat_cat_i]
        target_cols = 'OVERLAP_CATEGORY' # Response column

        # Prepare training data for this parameters combination:
        # keep only SNVs strictly inside the (lower, upper) VAF window.
        df_snvs_training_comb_i = df_snvs_training.loc[(df_snvs_training['AF'] > af_lower_i) &
                                                       (df_snvs_training['AF'] < af_upper_i)]
        if len(true_var_i) == 1 and len(false_var_i) == 1:
            # Exactly one category on each side: drop SNVs that belong to neither.
            df_snvs_training_comb_i = df_snvs_training_comb_i.loc[
                df_snvs_training_comb_i['PERCENT_OVERLAP'].isin([true_var_i[0], false_var_i[0]])]
        df_snvs_training_comb_i = df_snvs_training_comb_i.reset_index(drop=True)

        # Create a binary response column to identify a variant as true variant (1) or false variant (0)
        overlap_cat = df_snvs_training_comb_i['PERCENT_OVERLAP'].isin(true_var_i).astype(int).tolist()
        df_snvs_training_comb_i[target_cols] = overlap_cat

        # Define X and y for training
        X = df_snvs_training_comb_i[feature_cols]
        y = df_snvs_training_comb_i[target_cols].tolist()

        # Create balanced datasets
        train_sample, train_index_list = create_balanced_datasets(X,y)

        # Prepare the testing dataset for the current parameters combination
        # (same filtering and labelling as for the training data).
        df_snvs_testing_comb_i = df_snvs_testing.loc[(df_snvs_testing['AF'] > af_lower_i) &
                                                     (df_snvs_testing['AF'] < af_upper_i)]
        if len(true_var_i) == 1 and len(false_var_i) == 1:
            df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[
                df_snvs_testing_comb_i['PERCENT_OVERLAP'].isin([true_var_i[0], false_var_i[0]])]
        df_snvs_testing_comb_i = df_snvs_testing_comb_i.reset_index(drop=True)

        overlap_cat = df_snvs_testing_comb_i['PERCENT_OVERLAP'].isin(true_var_i).astype(int).tolist()
        df_snvs_testing_comb_i[target_cols] = overlap_cat

        # Define X and y for testing
        X_test = df_snvs_testing_comb_i[feature_cols]
        y_test = df_snvs_testing_comb_i[target_cols].tolist()

        # AUC-type metrics are only defined when the test labels contain both classes.
        both_classes_present = len(set(y_test)) == 2

        # Train XGB model
        num_jobs = 6 # NOTE: defined but not passed to XGBClassifier
        n_trees = 100
        mdepth = 10
        xgb_model = XGBClassifier(use_label_encoder=False, 
                              booster='gbtree', # boosting algorithm to use, default gbtree, others: gblinear, dart
                              n_estimators=n_trees, # number of trees, default = 100
                              eta=0.2, # this is learning rate, default = 0.3
                              max_depth=mdepth, # maximum depth of the tree, default = 6
                              gamma = 1, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                              reg_lambda = 1, # regularization parameter, default = 1
                              eval_metric = 'logloss')
        fpr_tpr = [] # Store all the fpr, tpr values for each model 
        auc_scores = []
        df_y_pred_all_models = pd.DataFrame()
        df_y_scores_all_models = pd.DataFrame()
        df_all_cv_scores = pd.DataFrame()
        df_feature_importances_all_models = pd.DataFrame()
        # One model is fitted per balanced training subset; each is scored on the same test data.
        for model_i in list(train_sample.keys()):
            X_train_model_i = train_sample[model_i]['X_train']
            y_train_model_i = train_sample[model_i]['y_train']
            X_train_model_i = X_train_model_i.apply(pd.to_numeric)
            xgb_model.fit(X_train_model_i,y_train_model_i)

            # Predict using model_i on the test data
            y_pred = xgb_model.predict(X_test)
            df_y_pred_all_models[f"Model_{model_i}"] = y_pred
            y_test_pred_scores = xgb_model.predict_proba(X_test)[:, 1]
            df_y_scores_all_models[f"Model_{model_i}"] = y_test_pred_scores
            false_positive, true_positive, _ = roc_curve(y_test, y_test_pred_scores)
            fpr_tpr.append([false_positive,true_positive])
            # Calculate ROC only if both true and false labels are present.
            # Otherwise, the function roc_auc_score returns error.
            if both_classes_present:
                auc_test = roc_auc_score(y_test, y_pred)
            else:
                auc_test = math.nan
            auc_scores.append(auc_test)

        # Get the consensus predictions from all models. This is done for every
        # combination (not only when both classes are present) because the
        # confusion counts below always need the predictions of THIS combination.
        y_pred_consensus = get_ensemble_prediction(df_y_scores_all_models)

        # Indices of false positives, true positives, true negatives and false negatives
        y_test_arr = np.asarray(y_test).ravel()
        y_cons_arr = np.asarray(y_pred_consensus).ravel()
        false_positive_ind = list(np.where((y_test_arr == 0) & (y_cons_arr == 1))[0])
        true_positive_ind = list(np.where((y_test_arr == 1) & (y_cons_arr == 1))[0])
        true_negative_ind = list(np.where((y_test_arr == 0) & (y_cons_arr == 0))[0])
        false_negative_ind = list(np.where((y_test_arr == 1) & (y_cons_arr == 0))[0])

        fp = len(false_positive_ind)
        tp = len(true_positive_ind)
        tn = len(true_negative_ind)
        fn = len(false_negative_ind)

        # Calculate metrics only if y_test has both true and false iSNVs labels.
        # Otherwise, the AUC cannot be calculated as the function roc_auc_score returns error.
        if both_classes_present:
            # NOTE: the AUC is computed on the hard 0/1 consensus calls, not on
            # prediction scores, so it equals the balanced accuracy of the consensus.
            auc_test = roc_auc_score(y_test, y_pred_consensus)
            F1_score_test = round(f1_score(y_test, y_pred_consensus), 3)
            mse_test = round(mean_squared_error(y_test, y_pred_consensus), 3)
            accuracy_test = round(accuracy_score(y_test, y_pred_consensus), 3)
            mcc_test = round(matthews_corrcoef(y_test, y_pred_consensus), 3)

            # Print the performance metrics on test data for this combination
            print ("AUC =", round(auc_test, 3), ", F1 score =", F1_score_test, ", Mean-squared error =", mse_test, ", Accuracy =", accuracy_test, ", Matthews correlation coefficient =", mcc_test)

            # Get the confusion matrix
            cm = confusion_matrix(y_test, y_pred_consensus)
        else:
            auc_test = math.nan
            mcc_test = math.nan
            F1_score_test = math.nan
            accuracy_test = math.nan
            mse_test = math.nan

        # One performance row for this seed/combination.
        df_tmp = pd.DataFrame(data=[[seed_i, true_var_i, false_var_i,af_lower_i,af_upper_i,feat_cat_i, 
                                     tp+fn, tn+fp, tp, fp, tn, fn, auc_test, mcc_test, F1_score_test,
                                     accuracy_test, mse_test]], columns=cols)
        # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
        df_perf_seed_i = pd.concat([df_perf_seed_i, df_tmp])
    df_perf_out_xgb = pd.concat([df_perf_out_xgb, df_perf_seed_i])
    print ("Done!")
df_perf_out_xgb.to_csv(perf_out_file_xgb, index=False)
df_perf_out_xgb 

testing samples = ['sample24', 'sample5', 'sample10', 'sample15', 'sample1', 'sample31']
Running predictions for all parameter/feature combinations with seed 10...Evaluating with parameters: af_lower_i=0.01, af_upper_i=0.5, feat_cat_i=Moderate, true_var_i=[1], false_var_i=[0.33]
	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 2923
	Under-represented class = 1, count = 377
	Will proceed by creating balanced training data.
AUC = 0.78 , F1 score = 0.439 , Mean-squared error = 0.171 , Accuracy = 0.829 , Matthews correlation coefficient = 0.398
Evaluating with parameters: af_lower_i=0.01, af_upper_i=0.5, feat_cat_i=Moderate, true_var_i=[1], false_var_i=[0.33, 0.67]
	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 3404
	Under-represented class = 1, count = 377
	Will proceed by creating balanced training data.
AUC = 0.748 , F1 score = 0.386 , Mean-squared error = 0.181 , Accuracy = 0.819 , Matthews correlat

Unnamed: 0,Seed,True_Var_Def,False_Var_Def,VAF_Lower_Limit,VAF_Upper_Limit,Feature_Category,Num_True_Var_Testing,Num_False_Var_Testing,TP,FP,TN,FN,AUC,MCC,F1_score,Accuracy,MSE
0,10,[1],[0.33],0.01,0.5,Moderate,68,661,49,106,555,19,0.780113,0.398,0.439,0.829,0.171
0,10,[1],"[0.33, 0.67]",0.01,0.5,Moderate,68,724,45,120,604,23,0.748009,0.342,0.386,0.819,0.181
0,10,"[1, 0.67]",[0.33],0.01,0.5,Moderate,131,661,77,160,501,54,0.672864,0.281,0.418,0.730,0.270
0,10,[1],[0.33],0.01,0.5,Strict,68,661,53,98,563,15,0.815576,0.453,0.484,0.845,0.155
0,10,[1],"[0.33, 0.67]",0.01,0.5,Strict,68,724,48,125,599,20,0.766615,0.362,0.398,0.817,0.183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,195,[1],"[0.33, 0.67]",0.10,0.5,Strict,9,109,2,73,36,7,0.276249,-0.247,0.048,0.322,0.678
0,195,"[1, 0.67]",[0.33],0.10,0.5,Strict,15,103,9,59,44,6,0.513592,0.018,0.217,0.449,0.551
0,195,[1],[0.33],0.10,0.5,Exhaustive,9,103,4,62,41,5,0.421251,-0.087,0.107,0.402,0.598
0,195,[1],"[0.33, 0.67]",0.10,0.5,Exhaustive,9,109,3,60,49,6,0.391437,-0.116,0.083,0.441,0.559


## Calculate the cumulative performance metrics for each model separately

In [18]:
inputdir = '../results/'
outdir = '../results/cumulative_perf_metrics/'
Path(outdir).mkdir(parents=True, exist_ok=True)

file_list = glob.glob(inputdir + '/performance*')
print (file_list)

# Column groups of the performance files
param_cols = ['True_Var_Def', 'False_Var_Def', 'VAF_Lower_Limit', 'VAF_Upper_Limit', 'Feature_Category']
count_cols = ['Num_True_Var_Testing', 'Num_False_Var_Testing', 'TP', 'FP', 'TN', 'FN']
score_cols = ['AUC', 'MCC', 'F1_score', 'Accuracy', 'MSE']

for file_i in file_list:
    # Path(...).name extracts the file name regardless of the path separator
    infile_name = Path(file_i).name
    outfile = outdir + infile_name
    # Read the file
    df_file_i = pd.read_csv(file_i)

    # First, get the distinct parameter combinations in this file.
    # We will do this by choosing all the rows for a single seed.
    seed_val = 10
    df_param_uniq = df_file_i.loc[df_file_i['Seed'] == seed_val].reset_index(drop=True)

    # Now iterate over the rows of df_param_uniq and get the median of the
    # performance metrics for each combination across all the seeds in the file.
    cumul_rows = []
    for idx_i, row_i in df_param_uniq.iterrows():
        tv_def, fv_def, vaf_up, feat_cat = row_i['True_Var_Def'], row_i['False_Var_Def'], row_i['VAF_Upper_Limit'], row_i['Feature_Category']

        # The lower VAF limit is NaN in files written without one. An explicit
        # flag is used instead of truthiness so that a limit of 0.0 still counts.
        has_vaf_low = 'VAF_Lower_Limit' in row_i.index and not math.isnan(row_i['VAF_Lower_Limit'])
        vaf_low = row_i['VAF_Lower_Limit'] if has_vaf_low else ''

        # Select the rows of this parameter combination across all seeds
        same_comb = ((df_file_i['True_Var_Def'] == tv_def) &
                     (df_file_i['False_Var_Def'] == fv_def) &
                     (df_file_i['VAF_Upper_Limit'] == vaf_up) &
                     (df_file_i['Feature_Category'] == feat_cat))
        if has_vaf_low:
            same_comb = same_comb & (df_file_i['VAF_Lower_Limit'] == vaf_low)
        df_comb_i_all_perf = df_file_i.loc[same_comb].reset_index(drop=True)

        # Get the cumulative (median) performance metrics and also the counts.
        # The medians stay in a Series on purpose: unpacking them into names such
        # as f1_score would overwrite the metric function of the same name that
        # the training cells call.
        medians = df_comb_i_all_perf[count_cols + score_cols].median()
        count_medians = [round(val) for val in medians[count_cols].tolist()]
        score_medians = medians[score_cols].tolist()
        cumul_rows.append([tv_def, fv_def, vaf_low, vaf_up, feat_cat] + count_medians + score_medians)

    df_cumul_perf = pd.DataFrame(data=cumul_rows, columns=param_cols + count_cols + score_cols)
    df_cumul_perf.to_csv(outfile, index=False)



['../results/performance_FM.csv', '../results/performance_FVM.csv', '../results/performance_VM.csv']


## Save the best performing models for each prediction strategy

For FM, VM and FVM prediction strategies, save the models corresponding to the optimum parameter/feature combination and the optimum hyperparameters that result in best performance over the 50 iterations of random training/testing.

In [4]:
# Feature columns used for model training, keyed by feature-category name.
# The order of the names inside each list is kept exactly as defined.
feat_cat_dict = {
    'Moderate': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB',
        'FXX', 'GQ', 'MLLD', 'QUAL', 'REFB',
        'REVB', 'SAF', 'SAR', 'SRF', 'SRR',
        'SSSB', 'STB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    'Strict': [
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB',
        'FXX', 'MLLD', 'QUAL', 'REFB', 'REVB',
        'SSSB', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
    'Exhaustive': [
        'AO', 'DP', 'FAO', 'FDP', 'FRO',
        'FSAF', 'FSAR', 'FSRF', 'FSRR', 'FWDB',
        'FXX', 'GQ', 'HRUN', 'LEN', 'MLLD',
        'QD', 'QUAL', 'RBI', 'REFB', 'REVB',
        'RO', 'SAF', 'SAR', 'SRF', 'SRR',
        'SSSB', 'STB', 'STBP', 'VARB',
        '5_PRIME_NUCLEOTIDE_CONTEXT', '3_PRIME_NUCLEOTIDE_CONTEXT',
    ],
}

### Save models for FM strategy

In [7]:
# Train and save the XGBoost models for the FM strategy (VAF filters, input
# data without VCFgenie). Steps:
#   1. Load the SNV table and hold out 6 whole samples for testing.
#   2. Keep SNVs with af_lower < AF < af_upper whose PERCENT_OVERLAP equals the
#      true-variant or false-variant definition, and label them 1 / 0.
#   3. Fit one XGBoost model per balanced training subset and save each model.
#   4. Score the consensus prediction of all models on the held-out samples and
#      collect the results in the one-row table `df_tmp`.
model_dir = '../FM_models/'
Path(model_dir).mkdir(parents=True, exist_ok=True)

# The input vcf .csv file
snv_sbs_file = '../data/SNV_data_wo_vcf_genie.csv'

af_lower = 0.01
af_upper = 0.5 # Updated to only consider a single, strict upper VAF limit of 50%.
                 # Any SNV with VAF > 50% is a major variant and will be excluded.
                 # We will only consider SNVs with VAF < 50%.
feature_cat = 'Strict'
true_var = [1]      # PERCENT_OVERLAP value that defines a true variant
false_var = [0.33]  # PERCENT_OVERLAP value that defines a false variant

df_snv_sbs = pd.read_csv(snv_sbs_file)

# Create a column to include the sample id (excluding the well id)
df_snv_sbs['sample_id'] = df_snv_sbs['sample'].apply(lambda x: x.split('_')[0])

# We will consider the SNVs in each replicate set only once and perform
# training and testing for each replicate set independently.
df_perf_out_xgb = pd.DataFrame()

# Remove variants with FAO = 0
fao_cutoff = 1
df_snv_sbs_fao_filtered = remove_low_coverage_samples(df_snv_sbs, fao_cutoff, 'FAO')


seed_i = 252 # The seed corresponding to the train/test iteration of hyperparameter tuning that
             # resulted in best performance.

# Get all the sample IDs
all_sample_ids = df_snv_sbs_fao_filtered['sample_id'].drop_duplicates().tolist()

# Set the random sampling seed
random.seed(seed_i)

# We will pick 6 testing samples and make sure snvs from these samples
# are not included in the training set.
testing_set_samples = random.sample(all_sample_ids, k=6)
print (f"testing samples = {testing_set_samples}")
is_testing_sample = df_snv_sbs_fao_filtered['sample_id'].isin(testing_set_samples)
df_snvs_testing = df_snv_sbs_fao_filtered.loc[is_testing_sample]
df_snvs_training = df_snv_sbs_fao_filtered.loc[~is_testing_sample]
df_snvs_testing.reset_index(drop=True, inplace=True)
df_snvs_training.reset_index(drop=True, inplace=True)

print (f"Running predictions for all parameter/feature combinations with seed {seed_i} ...", end="", flush=True)
feature_cols = feat_cat_dict[feature_cat]
target_cols = 'OVERLAP_CATEGORY' # Response column

# Prepare training data for this parameters combination
df_snvs_training_comb_i = df_snvs_training.copy()
df_snvs_training_comb_i = df_snvs_training_comb_i.loc[(df_snvs_training_comb_i['AF'] > af_lower) &
                                                      (df_snvs_training_comb_i['AF'] < af_upper)]
df_snvs_training_comb_i = df_snvs_training_comb_i.loc[(df_snvs_training_comb_i['PERCENT_OVERLAP'] == true_var[0]) |
                                                      (df_snvs_training_comb_i['PERCENT_OVERLAP'] == false_var[0])]
df_snvs_training_comb_i.reset_index(drop=True, inplace=True)

# Create a binary response column to identify a variant as true variant (1) or false variant (0)
overlap_cat = list(df_snvs_training_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if x == true_var[0] else 0))
df_snvs_training_comb_i[target_cols] = overlap_cat

# Define X and y for training
X = df_snvs_training_comb_i[feature_cols]
y = df_snvs_training_comb_i[target_cols].tolist()

# Create balanced datasets. train_sample is a dictionary keyed by model id;
# each entry holds the 'X_train' and 'y_train' of one balanced subset.
train_sample, train_index_list = create_balanced_datasets(X,y)

# Prepare the testing dataset for the current parameters combination
df_snvs_testing_comb_i = df_snvs_testing.copy()
df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[(df_snvs_testing_comb_i['AF'] > af_lower) &
                                                    (df_snvs_testing_comb_i['AF'] < af_upper)]
df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[(df_snvs_testing_comb_i['PERCENT_OVERLAP'] == true_var[0]) |
                                                    (df_snvs_testing_comb_i['PERCENT_OVERLAP'] == false_var[0])]
df_snvs_testing_comb_i.reset_index(drop=True, inplace=True)

# Create a binary response column to identify a variant as true variant (1) or false variant (0)
overlap_cat = list(df_snvs_testing_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if x == true_var[0] else 0))
df_snvs_testing_comb_i[target_cols] = overlap_cat

# Define X and y for testing
X_test = df_snvs_testing_comb_i[feature_cols]
y_test = df_snvs_testing_comb_i[target_cols].tolist()

# Train XGB model with optimum model hyperparameters
eta_opt = 0.3
num_trees_opt = 150
gamma_opt = 1
max_depth_opt = 6
subsample_opt = 1
colsample_opt = 0.5
reg_lambda_opt = 1
reg_alpha_opt = 0

xgb_model = XGBClassifier(use_label_encoder=False,
                      booster='gbtree', # boosting algorithm to use, default gbtree, others: gblinear, dart
                      n_estimators=num_trees_opt, # number of trees, default = 100
                      eta=eta_opt, # this is learning rate, default = 0.3
                      max_depth=max_depth_opt, # maximum depth of the tree, default = 6
                      gamma = gamma_opt, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                      reg_lambda = reg_lambda_opt, # regularization parameter, default = 1
                      eval_metric = 'logloss',
                      colsample_bytree = colsample_opt,
                      alpha = reg_alpha_opt,
                      subsample = subsample_opt)
fpr_tpr = [] # Store all the fpr, tpr values for each model
auc_scores = []
df_y_pred_all_models = pd.DataFrame()
df_y_scores_all_models = pd.DataFrame()
df_all_cv_scores = pd.DataFrame()
df_feature_importances_all_models = pd.DataFrame()
for model_i in list(train_sample.keys()):
    X_train_model_i = train_sample[model_i]['X_train']
    y_train_model_i = train_sample[model_i]['y_train']
    X_train_model_i = X_train_model_i.apply(pd.to_numeric)
    # The same estimator object is refit on every balanced subset; the fitted
    # state is written to disk at the end of each iteration.
    xgb_model.fit(X_train_model_i,y_train_model_i)

    # Predict using model_i on the test data
    y_pred = xgb_model.predict(X_test)
    df_y_pred_all_models[f"Model_{model_i}"] = y_pred
    y_test_pred_scores = xgb_model.predict_proba(X_test)[:, 1]
    df_y_scores_all_models[f"Model_{model_i}"] = y_test_pred_scores
    false_positive, true_positive, _ = roc_curve(y_test, y_test_pred_scores)
    fpr_tpr.append([false_positive,true_positive])
    # Calculate ROC only if both true and false labels are present.
    # Otherwise, the function roc_auc_score returns error.
    # NOTE(review): the per-model AUC is computed from the hard class labels
    # (y_pred), not from the predicted probabilities.
    if len(set(y_test)) == 2:
        auc_test = roc_auc_score(y_test, y_pred)
    else:
        auc_test = math.nan
    auc_scores.append(auc_test)

    # Save the model with joblib
    joblib.dump(xgb_model, model_dir + "_".join(["xgb_model", str(model_i)]))

# Get the consensus predictions from all models. This is done before the class
# check below so that the TP/FP/TN/FN counts can be computed in both cases
# (previously the single-class branch used y_pred_consensus without defining it).
y_pred_consensus = get_ensemble_prediction(df_y_scores_all_models)

# Indices of the test SNVs by actual label and by consensus call
y_test_arr = np.array(y_test)
y_cons_arr = np.array(y_pred_consensus)
actual_true_ind = set(np.where(y_test_arr == 1)[0])
actual_false_ind = set(np.where(y_test_arr == 0)[0])
called_true_ind = set(np.where(y_cons_arr == 1)[0])
called_false_ind = set(np.where(y_cons_arr == 0)[0])

# Get the indices of false positives
false_positive_ind = list(actual_false_ind.intersection(called_true_ind))
# Get the indices of true positives
true_positive_ind = list(actual_true_ind.intersection(called_true_ind))
# Get the indices of true negatives
true_negative_ind = list(actual_false_ind.intersection(called_false_ind))
# Get the indices of false negatives
false_negative_ind = list(actual_true_ind.intersection(called_false_ind))

fp = len(false_positive_ind)
tp = len(true_positive_ind)
tn = len(true_negative_ind)
fn = len(false_negative_ind)

# Calculate metrics only if y_test has both true and false iSNVs labels.
# Otherwise, the AUC cannot be calculated as the function roc_auc_score returns error.
if len(set(y_test)) == 2:
    # Calculate metrics on the consensus prediction
    auc_test = roc_auc_score(y_test, y_pred_consensus)

    # Calculate the F1 score
    F1_score_test = round(f1_score(y_test, y_pred_consensus), 3)

    # Calculate the mean-squared error
    mse_test = round(mean_squared_error(y_test, y_pred_consensus), 3)

    # Calculate the accuracy
    accuracy_test = round(accuracy_score(y_test, y_pred_consensus), 3)

    # Calculate the MCC
    mcc_test = round(matthews_corrcoef(y_test, y_pred_consensus), 3)

    # Print the performance metrics on test data
    print ("AUC =", round(auc_test, 3), ", F1 score =", F1_score_test, ", Mean-squared error =", mse_test, ", Accuracy =", accuracy_test, ", Matthews correlation coefficient =", mcc_test)

    # Get the confusion matrix
    cm = confusion_matrix(y_test, y_pred_consensus)
else:
    # Only one class is present in the test labels: report the counts but
    # mark every score as undefined.
    auc_test = math.nan
    mcc_test = math.nan
    F1_score_test = math.nan
    accuracy_test = math.nan
    mse_test = math.nan

cols = ['Seed', 'True_Var_Def', 'False_Var_Def', 'VAF_Lower_Limit', 'VAF_Upper_Limit',  'Feature_Category',
        'Num_True_Var_Testing', 'Num_False_Var_Testing', 'TP', 'FP', 'TN', 'FN', 'AUC', 'MCC',
        'F1_score', 'Accuracy', 'MSE']

# One-row summary of this run. The true/false variant definitions are stored
# as lists, so they appear as [1] and [0.33] in the table.
df_tmp = pd.DataFrame(data=[[seed_i, true_var, false_var,af_lower,af_upper,feature_cat,
                             tp+fn, tn+fp, tp, fp, tn, fn, auc_test, mcc_test, F1_score_test,
                             accuracy_test, mse_test]], columns=cols)


testing samples = ['sample6', 'sample21', 'sample31', 'sample14', 'sample7', 'sample29']
Running predictions for all parameter/feature combinations with seed 252 ...	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 3385
	Under-represented class = 1, count = 402
	Will proceed by creating balanced training data.
AUC = 0.873 , F1 score = 0.735 , Mean-squared error = 0.104 , Accuracy = 0.896 , Matthews correlation coefficient = 0.679


In [8]:
# Display the one-row performance summary table (df_tmp)
df_tmp

Unnamed: 0,Seed,True_Var_Def,False_Var_Def,VAF_Lower_Limit,VAF_Upper_Limit,Feature_Category,Num_True_Var_Testing,Num_False_Var_Testing,TP,FP,TN,FN,AUC,MCC,F1_score,Accuracy,MSE
0,252,[1],[0.33],0.01,0.5,Strict,43,207,36,19,188,7,0.872711,0.679,0.735,0.896,0.104


### Save models for VM strategy

In [10]:
# Train and save the XGBoost models for the VM strategy (VCFgenie input data,
# no lower VAF filter). Steps:
#   1. Load the SNV table and hold out 6 whole samples for testing.
#   2. Keep SNVs with AF < af_upper whose PERCENT_OVERLAP equals the
#      true-variant or false-variant definition, and label them 1 / 0.
#   3. Fit one XGBoost model per balanced training subset and save each model.
#   4. Score the consensus prediction of all models on the held-out samples and
#      collect the results in the one-row table `df_tmp`.
model_dir = '../VM_models/'
Path(model_dir).mkdir(parents=True, exist_ok=True)

# The input vcf .csv file
snv_sbs_file = '../data/SNV_data_with_vcf_genie.csv'

# No lower VAF limit is applied in this cell. af_lower is set explicitly so the
# summary table below neither fails with a NameError when the cell is run on
# its own nor reports a lower limit left over from another cell.
af_lower = math.nan
af_upper = 0.5 # Updated to only consider a single, strict upper VAF limit of 50%.
                 # Any SNV with VAF > 50% is a major variant and will be excluded.
                 # We will only consider SNVs with VAF < 50%.
feature_cat = 'Moderate'
true_var = [1]      # PERCENT_OVERLAP value that defines a true variant
false_var = [0.33]  # PERCENT_OVERLAP value that defines a false variant

df_snv_sbs = pd.read_csv(snv_sbs_file)

# Create a column to include the sample id (excluding the well id)
df_snv_sbs['sample_id'] = df_snv_sbs['sample'].apply(lambda x: x.split('_')[0])

# We will consider the SNVs in each replicate set only once and perform
# training and testing for each replicate set independently.
df_perf_out_xgb = pd.DataFrame()

# Remove variants with FAO = 0
fao_cutoff = 1
df_snv_sbs_fao_filtered = remove_low_coverage_samples(df_snv_sbs, fao_cutoff, 'FAO')


seed_i = 252 # The seed corresponding to the train/test iteration of hyperparameter tuning that
             # resulted in best performance.

# Randomly select 6 samples for testing
# Get all the sample IDs
all_sample_ids = df_snv_sbs_fao_filtered['sample_id'].drop_duplicates().tolist()

# Set the random sampling seed
random.seed(seed_i)

# We will pick 6 testing samples and make sure snvs from these samples
# are not included in the training set.
testing_set_samples = random.sample(all_sample_ids, k=6)
print (f"testing samples = {testing_set_samples}")
is_testing_sample = df_snv_sbs_fao_filtered['sample_id'].isin(testing_set_samples)
df_snvs_testing = df_snv_sbs_fao_filtered.loc[is_testing_sample]
df_snvs_training = df_snv_sbs_fao_filtered.loc[~is_testing_sample]
df_snvs_testing.reset_index(drop=True, inplace=True)
df_snvs_training.reset_index(drop=True, inplace=True)

print (f"Running predictions for all parameter/feature combinations with seed {seed_i} ...", end="", flush=True)
feature_cols = feat_cat_dict[feature_cat]
target_cols = 'OVERLAP_CATEGORY' # Response column

# Prepare training data for this parameters combination (upper VAF limit only)
df_snvs_training_comb_i = df_snvs_training.copy()
df_snvs_training_comb_i = df_snvs_training_comb_i.loc[(df_snvs_training_comb_i['AF'] < af_upper)]
df_snvs_training_comb_i = df_snvs_training_comb_i.loc[(df_snvs_training_comb_i['PERCENT_OVERLAP'] == true_var[0]) |
                                                      (df_snvs_training_comb_i['PERCENT_OVERLAP'] == false_var[0])]
df_snvs_training_comb_i.reset_index(drop=True, inplace=True)

# Create a binary response column to identify a variant as true variant (1) or false variant (0)
overlap_cat = list(df_snvs_training_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if x == true_var[0] else 0))
df_snvs_training_comb_i[target_cols] = overlap_cat

# Define X and y for training
X = df_snvs_training_comb_i[feature_cols]
y = df_snvs_training_comb_i[target_cols].tolist()

# Create balanced datasets. train_sample is a dictionary keyed by model id;
# each entry holds the 'X_train' and 'y_train' of one balanced subset.
train_sample, train_index_list = create_balanced_datasets(X,y)

# Prepare the testing dataset for the current parameters combination
df_snvs_testing_comb_i = df_snvs_testing.copy()
df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[(df_snvs_testing_comb_i['AF'] < af_upper)]
df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[(df_snvs_testing_comb_i['PERCENT_OVERLAP'] == true_var[0]) |
                                                    (df_snvs_testing_comb_i['PERCENT_OVERLAP'] == false_var[0])]
df_snvs_testing_comb_i.reset_index(drop=True, inplace=True)

# Create a binary response column to identify a variant as true variant (1) or false variant (0)
overlap_cat = list(df_snvs_testing_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if x == true_var[0] else 0))
df_snvs_testing_comb_i[target_cols] = overlap_cat

# Define X and y for testing
X_test = df_snvs_testing_comb_i[feature_cols]
y_test = df_snvs_testing_comb_i[target_cols].tolist()

# Train XGB model with optimum model hyperparameters
eta_opt = 0.2
num_trees_opt = 50
gamma_opt = 0
max_depth_opt = 10
subsample_opt = 1
colsample_opt = 0.5
reg_lambda_opt = 2
reg_alpha_opt = 1

xgb_model = XGBClassifier(use_label_encoder=False,
                      booster='gbtree', # boosting algorithm to use, default gbtree, others: gblinear, dart
                      n_estimators=num_trees_opt, # number of trees, default = 100
                      eta=eta_opt, # this is learning rate, default = 0.3
                      max_depth=max_depth_opt, # maximum depth of the tree, default = 6
                      gamma = gamma_opt, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                      reg_lambda = reg_lambda_opt, # regularization parameter, default = 1
                      eval_metric = 'logloss',
                      colsample_bytree = colsample_opt,
                      alpha = reg_alpha_opt,
                      subsample = subsample_opt)
fpr_tpr = [] # Store all the fpr, tpr values for each model
auc_scores = []
df_y_pred_all_models = pd.DataFrame()
df_y_scores_all_models = pd.DataFrame()
df_all_cv_scores = pd.DataFrame()
df_feature_importances_all_models = pd.DataFrame()
for model_i in list(train_sample.keys()):
    X_train_model_i = train_sample[model_i]['X_train']
    y_train_model_i = train_sample[model_i]['y_train']
    X_train_model_i = X_train_model_i.apply(pd.to_numeric)
    # The same estimator object is refit on every balanced subset; the fitted
    # state is written to disk at the end of each iteration.
    xgb_model.fit(X_train_model_i,y_train_model_i)

    # Predict using model_i on the test data
    y_pred = xgb_model.predict(X_test)
    df_y_pred_all_models[f"Model_{model_i}"] = y_pred
    y_test_pred_scores = xgb_model.predict_proba(X_test)[:, 1]
    df_y_scores_all_models[f"Model_{model_i}"] = y_test_pred_scores
    false_positive, true_positive, _ = roc_curve(y_test, y_test_pred_scores)
    fpr_tpr.append([false_positive,true_positive])
    # Calculate ROC only if both true and false labels are present.
    # Otherwise, the function roc_auc_score returns error.
    # NOTE(review): the per-model AUC is computed from the hard class labels
    # (y_pred), not from the predicted probabilities.
    if len(set(y_test)) == 2:
        auc_test = roc_auc_score(y_test, y_pred)
    else:
        auc_test = math.nan
    auc_scores.append(auc_test)

    # Save the model with joblib
    joblib.dump(xgb_model, model_dir + "_".join(["xgb_model", str(model_i)]))

# Get the consensus predictions from all models. This is done before the class
# check below so that the TP/FP/TN/FN counts can be computed in both cases
# (previously the single-class branch used y_pred_consensus without defining it).
y_pred_consensus = get_ensemble_prediction(df_y_scores_all_models)

# Indices of the test SNVs by actual label and by consensus call
y_test_arr = np.array(y_test)
y_cons_arr = np.array(y_pred_consensus)
actual_true_ind = set(np.where(y_test_arr == 1)[0])
actual_false_ind = set(np.where(y_test_arr == 0)[0])
called_true_ind = set(np.where(y_cons_arr == 1)[0])
called_false_ind = set(np.where(y_cons_arr == 0)[0])

# Get the indices of false positives
false_positive_ind = list(actual_false_ind.intersection(called_true_ind))
# Get the indices of true positives
true_positive_ind = list(actual_true_ind.intersection(called_true_ind))
# Get the indices of true negatives
true_negative_ind = list(actual_false_ind.intersection(called_false_ind))
# Get the indices of false negatives
false_negative_ind = list(actual_true_ind.intersection(called_false_ind))

fp = len(false_positive_ind)
tp = len(true_positive_ind)
tn = len(true_negative_ind)
fn = len(false_negative_ind)

# Calculate metrics only if y_test has both true and false iSNVs labels.
# Otherwise, the AUC cannot be calculated as the function roc_auc_score returns error.
if len(set(y_test)) == 2:
    # Calculate metrics on the consensus prediction
    auc_test = roc_auc_score(y_test, y_pred_consensus)

    # Calculate the F1 score
    F1_score_test = round(f1_score(y_test, y_pred_consensus), 3)

    # Calculate the mean-squared error
    mse_test = round(mean_squared_error(y_test, y_pred_consensus), 3)

    # Calculate the accuracy
    accuracy_test = round(accuracy_score(y_test, y_pred_consensus), 3)

    # Calculate the MCC
    mcc_test = round(matthews_corrcoef(y_test, y_pred_consensus), 3)

    # Print the performance metrics on test data
    print ("AUC =", round(auc_test, 3), ", F1 score =", F1_score_test, ", Mean-squared error =", mse_test, ", Accuracy =", accuracy_test, ", Matthews correlation coefficient =", mcc_test)

    # Get the confusion matrix
    cm = confusion_matrix(y_test, y_pred_consensus)
else:
    # Only one class is present in the test labels: report the counts but
    # mark every score as undefined.
    auc_test = math.nan
    mcc_test = math.nan
    F1_score_test = math.nan
    accuracy_test = math.nan
    mse_test = math.nan

cols = ['Seed', 'True_Var_Def', 'False_Var_Def', 'VAF_Lower_Limit', 'VAF_Upper_Limit',  'Feature_Category',
        'Num_True_Var_Testing', 'Num_False_Var_Testing', 'TP', 'FP', 'TN', 'FN', 'AUC', 'MCC',
        'F1_score', 'Accuracy', 'MSE']

# One-row summary of this run. The true/false variant definitions are stored
# as lists; VAF_Lower_Limit is NaN because no lower limit is applied here.
df_tmp = pd.DataFrame(data=[[seed_i, true_var, false_var,af_lower,af_upper,feature_cat,
                             tp+fn, tn+fp, tp, fp, tn, fn, auc_test, mcc_test, F1_score_test,
                             accuracy_test, mse_test]], columns=cols)


testing samples = ['sample6', 'sample21', 'sample31', 'sample14', 'sample7', 'sample29']
Running predictions for all parameter/feature combinations with seed 252 ...	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 3378
	Under-represented class = 1, count = 419
	Will proceed by creating balanced training data.
AUC = 0.865 , F1 score = 0.771 , Mean-squared error = 0.087 , Accuracy = 0.913 , Matthews correlation coefficient = 0.718


### Save models for FVM strategy

In [11]:
# Train and save the XGBoost models for the FVM strategy (VCFgenie input data
# combined with VAF filters). Steps:
#   1. Load the SNV table and hold out 6 whole samples for testing.
#   2. Keep SNVs with af_lower < AF < af_upper whose PERCENT_OVERLAP equals the
#      true-variant or false-variant definition, and label them 1 / 0.
#   3. Fit one XGBoost model per balanced training subset and save each model.
#   4. Score the consensus prediction of all models on the held-out samples and
#      collect the results in the one-row table `df_tmp`.
model_dir = '../FVM_models/'
Path(model_dir).mkdir(parents=True, exist_ok=True)

# The input vcf .csv file
snv_sbs_file = '../data/SNV_data_with_vcf_genie.csv'

af_lower = 0.01
af_upper = 0.5 # Updated to only consider a single, strict upper VAF limit of 50%.
                 # Any SNV with VAF > 50% is a major variant and will be excluded.
                 # We will only consider SNVs with VAF < 50%.
feature_cat = 'Strict'
true_var = [1]      # PERCENT_OVERLAP value that defines a true variant
false_var = [0.33]  # PERCENT_OVERLAP value that defines a false variant

df_snv_sbs = pd.read_csv(snv_sbs_file)

# Create a column to include the sample id (excluding the well id)
df_snv_sbs['sample_id'] = df_snv_sbs['sample'].apply(lambda x: x.split('_')[0])

# We will consider the SNVs in each replicate set only once and perform
# training and testing for each replicate set independently.
df_perf_out_xgb = pd.DataFrame()

# Remove variants with FAO = 0
fao_cutoff = 1
df_snv_sbs_fao_filtered = remove_low_coverage_samples(df_snv_sbs, fao_cutoff, 'FAO')


seed_i = 252 # The seed corresponding to the train/test iteration of hyperparameter tuning that
             # resulted in best performance.

# Randomly select 6 samples for testing
# Get all the sample IDs
all_sample_ids = df_snv_sbs_fao_filtered['sample_id'].drop_duplicates().tolist()

# Set the random sampling seed
random.seed(seed_i)

# We will pick 6 testing samples and make sure snvs from these samples
# are not included in the training set.
testing_set_samples = random.sample(all_sample_ids, k=6)
print (f"testing samples = {testing_set_samples}")
is_testing_sample = df_snv_sbs_fao_filtered['sample_id'].isin(testing_set_samples)
df_snvs_testing = df_snv_sbs_fao_filtered.loc[is_testing_sample]
df_snvs_training = df_snv_sbs_fao_filtered.loc[~is_testing_sample]
df_snvs_testing.reset_index(drop=True, inplace=True)
df_snvs_training.reset_index(drop=True, inplace=True)

print (f"Running predictions for all parameter/feature combinations with seed {seed_i} ...", end="", flush=True)
feature_cols = feat_cat_dict[feature_cat]
target_cols = 'OVERLAP_CATEGORY' # Response column

# Prepare training data for this parameters combination
df_snvs_training_comb_i = df_snvs_training.copy()
df_snvs_training_comb_i = df_snvs_training_comb_i.loc[(df_snvs_training_comb_i['AF'] > af_lower) &
                                                      (df_snvs_training_comb_i['AF'] < af_upper)]
df_snvs_training_comb_i = df_snvs_training_comb_i.loc[(df_snvs_training_comb_i['PERCENT_OVERLAP'] == true_var[0]) |
                                                      (df_snvs_training_comb_i['PERCENT_OVERLAP'] == false_var[0])]
df_snvs_training_comb_i.reset_index(drop=True, inplace=True)

# Create a binary response column to identify a variant as true variant (1) or false variant (0)
overlap_cat = list(df_snvs_training_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if x == true_var[0] else 0))
df_snvs_training_comb_i[target_cols] = overlap_cat

# Define X and y for training
X = df_snvs_training_comb_i[feature_cols]
y = df_snvs_training_comb_i[target_cols].tolist()

# Create balanced datasets. train_sample is a dictionary keyed by model id;
# each entry holds the 'X_train' and 'y_train' of one balanced subset.
train_sample, train_index_list = create_balanced_datasets(X,y)

# Prepare the testing dataset for the current parameters combination
df_snvs_testing_comb_i = df_snvs_testing.copy()
df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[(df_snvs_testing_comb_i['AF'] > af_lower) &
                                                    (df_snvs_testing_comb_i['AF'] < af_upper)]
df_snvs_testing_comb_i = df_snvs_testing_comb_i.loc[(df_snvs_testing_comb_i['PERCENT_OVERLAP'] == true_var[0]) |
                                                    (df_snvs_testing_comb_i['PERCENT_OVERLAP'] == false_var[0])]
df_snvs_testing_comb_i.reset_index(drop=True, inplace=True)

# Create a binary response column to identify a variant as true variant (1) or false variant (0)
overlap_cat = list(df_snvs_testing_comb_i['PERCENT_OVERLAP'].apply(lambda x: 1 if x == true_var[0] else 0))
df_snvs_testing_comb_i[target_cols] = overlap_cat

# Define X and y for testing
X_test = df_snvs_testing_comb_i[feature_cols]
y_test = df_snvs_testing_comb_i[target_cols].tolist()

# Train XGB model with optimum model hyperparameters
eta_opt = 0.2
num_trees_opt = 150
gamma_opt = 2
max_depth_opt = 10
subsample_opt = 1
colsample_opt = 0.5
reg_lambda_opt = 0
reg_alpha_opt = 0

xgb_model = XGBClassifier(use_label_encoder=False,
                      booster='gbtree', # boosting algorithm to use, default gbtree, others: gblinear, dart
                      n_estimators=num_trees_opt, # number of trees, default = 100
                      eta=eta_opt, # this is learning rate, default = 0.3
                      max_depth=max_depth_opt, # maximum depth of the tree, default = 6
                      gamma = gamma_opt, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                      reg_lambda = reg_lambda_opt, # regularization parameter, default = 1
                      eval_metric = 'logloss',
                      colsample_bytree = colsample_opt,
                      alpha = reg_alpha_opt,
                      subsample = subsample_opt)
fpr_tpr = [] # Store all the fpr, tpr values for each model
auc_scores = []
df_y_pred_all_models = pd.DataFrame()
df_y_scores_all_models = pd.DataFrame()
df_all_cv_scores = pd.DataFrame()
df_feature_importances_all_models = pd.DataFrame()
for model_i in list(train_sample.keys()):
    X_train_model_i = train_sample[model_i]['X_train']
    y_train_model_i = train_sample[model_i]['y_train']
    X_train_model_i = X_train_model_i.apply(pd.to_numeric)
    # The same estimator object is refit on every balanced subset; the fitted
    # state is written to disk at the end of each iteration.
    xgb_model.fit(X_train_model_i,y_train_model_i)

    # Predict using model_i on the test data
    y_pred = xgb_model.predict(X_test)
    df_y_pred_all_models[f"Model_{model_i}"] = y_pred
    y_test_pred_scores = xgb_model.predict_proba(X_test)[:, 1]
    df_y_scores_all_models[f"Model_{model_i}"] = y_test_pred_scores
    false_positive, true_positive, _ = roc_curve(y_test, y_test_pred_scores)
    fpr_tpr.append([false_positive,true_positive])
    # Calculate ROC only if both true and false labels are present.
    # Otherwise, the function roc_auc_score returns error.
    # NOTE(review): the per-model AUC is computed from the hard class labels
    # (y_pred), not from the predicted probabilities.
    if len(set(y_test)) == 2:
        auc_test = roc_auc_score(y_test, y_pred)
    else:
        auc_test = math.nan
    auc_scores.append(auc_test)

    # Save the model with joblib
    joblib.dump(xgb_model, model_dir + "_".join(["xgb_model", str(model_i)]))

# Get the consensus predictions from all models. This is done before the class
# check below so that the TP/FP/TN/FN counts can be computed in both cases
# (previously the single-class branch used y_pred_consensus without defining it).
y_pred_consensus = get_ensemble_prediction(df_y_scores_all_models)

# Indices of the test SNVs by actual label and by consensus call
y_test_arr = np.array(y_test)
y_cons_arr = np.array(y_pred_consensus)
actual_true_ind = set(np.where(y_test_arr == 1)[0])
actual_false_ind = set(np.where(y_test_arr == 0)[0])
called_true_ind = set(np.where(y_cons_arr == 1)[0])
called_false_ind = set(np.where(y_cons_arr == 0)[0])

# Get the indices of false positives
false_positive_ind = list(actual_false_ind.intersection(called_true_ind))
# Get the indices of true positives
true_positive_ind = list(actual_true_ind.intersection(called_true_ind))
# Get the indices of true negatives
true_negative_ind = list(actual_false_ind.intersection(called_false_ind))
# Get the indices of false negatives
false_negative_ind = list(actual_true_ind.intersection(called_false_ind))

fp = len(false_positive_ind)
tp = len(true_positive_ind)
tn = len(true_negative_ind)
fn = len(false_negative_ind)

# Calculate metrics only if y_test has both true and false iSNVs labels.
# Otherwise, the AUC cannot be calculated as the function roc_auc_score returns error.
if len(set(y_test)) == 2:
    # Calculate metrics on the consensus prediction
    auc_test = roc_auc_score(y_test, y_pred_consensus)

    # Calculate the F1 score
    F1_score_test = round(f1_score(y_test, y_pred_consensus), 3)

    # Calculate the mean-squared error
    mse_test = round(mean_squared_error(y_test, y_pred_consensus), 3)

    # Calculate the accuracy
    accuracy_test = round(accuracy_score(y_test, y_pred_consensus), 3)

    # Calculate the MCC
    mcc_test = round(matthews_corrcoef(y_test, y_pred_consensus), 3)

    # Print the performance metrics on test data
    print ("AUC =", round(auc_test, 3), ", F1 score =", F1_score_test, ", Mean-squared error =", mse_test, ", Accuracy =", accuracy_test, ", Matthews correlation coefficient =", mcc_test)

    # Get the confusion matrix
    cm = confusion_matrix(y_test, y_pred_consensus)
else:
    # Only one class is present in the test labels: report the counts but
    # mark every score as undefined.
    auc_test = math.nan
    mcc_test = math.nan
    F1_score_test = math.nan
    accuracy_test = math.nan
    mse_test = math.nan

cols = ['Seed', 'True_Var_Def', 'False_Var_Def', 'VAF_Lower_Limit', 'VAF_Upper_Limit',  'Feature_Category',
        'Num_True_Var_Testing', 'Num_False_Var_Testing', 'TP', 'FP', 'TN', 'FN', 'AUC', 'MCC',
        'F1_score', 'Accuracy', 'MSE']

# One-row summary of this run. The true/false variant definitions are stored
# as lists, so they appear as [1] and [0.33] in the table.
df_tmp = pd.DataFrame(data=[[seed_i, true_var, false_var,af_lower,af_upper,feature_cat,
                             tp+fn, tn+fp, tp, fp, tn, fn, auc_test, mcc_test, F1_score_test,
                             accuracy_test, mse_test]], columns=cols)


testing samples = ['sample6', 'sample21', 'sample31', 'sample14', 'sample7', 'sample29']
Running predictions for all parameter/feature combinations with seed 252 ...	Train-test split resulted in imbalanced training data
	Over-represented class = 0, count = 3377
	Under-represented class = 1, count = 402
	Will proceed by creating balanced training data.
AUC = 0.847 , F1 score = 0.68 , Mean-squared error = 0.132 , Accuracy = 0.868 , Matthews correlation coefficient = 0.613
