In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import re

In [2]:
def smote(X,y):
    """Oversample ``(X, y)`` with imbalanced-learn's SMOTE.

    The sampler is built with ``random_state=123`` so repeated calls on the
    same input are reproducible.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
    """
    X_balanced, y_balanced = SMOTE(random_state=123).fit_resample(X, y)
    return X_balanced, y_balanced

In [3]:
# Load the training table and split it into a label vector and a feature matrix.
data = pd.read_csv("C:/Users/Li Xiaokang/Desktop/null_importance_result/win31_RF_filtered_0.05_train_PCC_1.csv")
data_val = data.values
# Column 0 is taken as the target; every remaining column is a feature.
y = data_val[:, 0]
X = data_val[:, 1:]

In [4]:
def xgb_kfold(input_dir, output_path, k_fold_cv, n_estimators, gamma, subsample, is_sample_aug):
    """Cross-validate an XGBoost classifier with SMOTE and append the metrics to a workbook.

    The CSV at ``input_dir`` is split with a shuffled ``StratifiedKFold``
    (``random_state=123``). Inside every fold only the training part is
    oversampled with ``smote``; the held-out part is predicted as-is. The
    predictions of all folds are pooled and ACC, MCC, sensitivity,
    specificity and AUC are computed once on the pooled vectors.

    Args:
        input_dir: Path of the CSV to evaluate. Column 0 is used as the label,
            all remaining columns as features. The text between ``PCC_`` and
            ``.csv`` in this path is used as a tag in the ``method`` label.
        output_path: Excel workbook that collects one row per run. Existing
            rows are kept and the new row is appended; if the workbook does
            not exist yet it is created with this single row.
        k_fold_cv: Number of folds.
        n_estimators: Forwarded to ``xgb.XGBClassifier``.
        gamma: Forwarded to ``xgb.XGBClassifier``.
        subsample: Forwarded to ``xgb.XGBClassifier``.
        is_sample_aug: Accepted for signature compatibility but not consulted;
            SMOTE is always applied to the training folds.

    Returns:
        None. The result is written to ``output_path``.
    """
    # The label needs a tag; previously `name_result` was read without ever
    # being defined here, which raises NameError on a fresh kernel. Derive it
    # from the input file name instead.
    tag_match = re.search(r'PCC_(.*?)\.csv', input_dir)
    name_result = tag_match.group(1) if tag_match else 'unknown'

    smote_result_dict = {}
    smote_result_dict['method'] = 'XGB'+'_'+str(int(k_fold_cv))+'_kfold'+'_smote_'+name_result

    data_val = pd.read_csv(input_dir).values
    X = data_val[:,1:]
    y = data_val[:,0]

    skf = StratifiedKFold(n_splits = k_fold_cv, random_state = 123, shuffle = True)

    smote_xgb_model = xgb.XGBClassifier(learning_rate = 0.01,
                                        n_estimators = n_estimators,
                                        gamma = gamma,
                                        subsample = subsample,
                                        random_state = 123)

    # Pooled out-of-fold ground truth, hard predictions and class-1 probabilities.
    y_test_array = np.array([])
    y_pred_array = np.array([])
    y_pred_proba_1 = np.array([])

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Oversample the training fold only, so the held-out fold stays untouched.
        smote_x, smote_y = smote(X_train, y_train)

        # Fit once per fold; the same fitted model serves both predict calls.
        smote_xgb_model.fit(smote_x, smote_y)
        smote_y_pred = smote_xgb_model.predict(X_test)
        smote_y_pred_proba_1 = smote_xgb_model.predict_proba(X_test)[:,1]

        y_test_array = np.concatenate((y_test_array, y_test))
        y_pred_array = np.concatenate((y_pred_array, smote_y_pred))
        y_pred_proba_1 = np.concatenate((y_pred_proba_1, smote_y_pred_proba_1))

    smote_result_dict['ACC'] = accuracy_score(y_test_array, y_pred_array)
    smote_result_dict['MCC'] = matthews_corrcoef(y_test_array, y_pred_array)

    # The four-way unpack requires a 2x2 matrix, i.e. a binary problem.
    smote_tn, smote_fp, smote_fn, smote_tp = confusion_matrix(y_test_array, y_pred_array).ravel()
    smote_result_dict['Sensitivity'] = smote_tp / (smote_tp + smote_fn)
    smote_result_dict['Specificity'] = smote_tn / (smote_tn + smote_fp)

    smote_result_dict['AUC'] = roc_auc_score(y_test_array, y_pred_proba_1)

    smote_result_df = pd.DataFrame(smote_result_dict, index=[0])

    # Append to the existing workbook; start a new one if it is not there yet
    # so a finished CV run is not lost to a missing file.
    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        smote_save_result = smote_result_df
    else:
        smote_save_result = pd.concat([original_data, smote_result_df], axis=0)

    smote_save_result.to_excel(output_path, index=False)

In [9]:
# 5-fold CV run on the win31 training table; metrics are appended to the k-fold workbook.
xgb_kfold(
    input_dir="C:/Users/Li Xiaokang/Desktop/null_importance_result/win31_RF_filtered_0.05_train_PCC_1.csv",
    output_path="C:/Users/Li Xiaokang/Desktop/win31_kfold_result.xlsx",
    k_fold_cv=5,
    n_estimators=150,
    gamma=0.4,
    subsample=0.9,
    is_sample_aug=1,
)

In [7]:
def _fit_score_and_append(model, X_train, y_train, X_test, y_test, method, output_path):
    """Fit ``model``, score it on the test split and append one metrics row to ``output_path``."""
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # The four-way unpack requires a 2x2 matrix, i.e. a binary problem.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    result_dict = {
        'method': method,
        'ACC': accuracy_score(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred),
        'Sensitivity': tp / (tp + fn),
        'Specificity': tn / (tn + fp),
        # AUC is computed from the probability of the second class column.
        'AUC': roc_auc_score(y_test, model.predict_proba(X_test)[:,1]),
    }
    result_df = pd.DataFrame(result_dict, index=[0])

    # Append to the existing workbook; start a new one if it is not there yet
    # so a finished evaluation is not lost to a missing file.
    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        save_result = result_df
    else:
        save_result = pd.concat([original_data, result_df], axis=0)

    save_result.to_excel(output_path, index=False)


def xgb_independent_test(train_path, test_path, output_path, is_sample_aug, n_estimators, gamma, subsample):
    """Train an XGBoost classifier on one CSV, evaluate it on another and log the metrics.

    In both CSVs column 0 is used as the label and the remaining columns as
    features. ACC, MCC, sensitivity, specificity and AUC are computed on the
    test file and appended as one row to the workbook at ``output_path``
    (created if it does not exist).

    Args:
        train_path: CSV used for fitting.
        test_path: CSV used for evaluation.
        output_path: Excel workbook that collects one row per run.
        is_sample_aug: If truthy, the training data is oversampled with
            ``smote`` and the model uses ``learning_rate=0.01`` together with
            ``n_estimators``, ``gamma`` and ``subsample``; the row is labelled
            ``'win31_XGB_smote'``. If falsy, no oversampling is done and the
            model is built with ``n_estimators`` only; the row is labelled
            ``'RF_imb'``.
        n_estimators: Forwarded to ``xgb.XGBClassifier`` in both modes.
        gamma: Forwarded to ``xgb.XGBClassifier`` only when ``is_sample_aug`` is truthy.
        subsample: Forwarded to ``xgb.XGBClassifier`` only when ``is_sample_aug`` is truthy.

    Returns:
        None. The result is written to ``output_path``.
    """
    train_val = pd.read_csv(train_path).values
    test_val = pd.read_csv(test_path).values

    X_train, y_train = train_val[:,1:], train_val[:,0]
    X_test, y_test = test_val[:,1:], test_val[:,0]

    if is_sample_aug:
        # Balance the training set only; the test set is evaluated as-is.
        X_train, y_train = smote(X_train, y_train)
        model = xgb.XGBClassifier(learning_rate = 0.01,
                                  n_estimators = n_estimators,
                                  gamma = gamma,
                                  subsample = subsample,
                                  random_state = 123)
        method = 'win31_XGB_smote'
    else:
        # Baseline on the imbalanced data. The label says "RF" but the model
        # is an XGBClassifier; the label is kept so existing result rows and
        # anything that filters on them stay comparable.
        model = xgb.XGBClassifier(n_estimators=n_estimators, random_state=123)
        method = 'RF_imb'

    _fit_score_and_append(model, X_train, y_train, X_test, y_test, method, output_path)

In [10]:
# Independent-test run with SMOTE enabled; metrics are appended to the independent-test workbook.
xgb_independent_test(
    train_path="C:/Users/Li Xiaokang/Desktop/null_importance_result/win31_RF_filtered_0.05_train_PCC_1.csv",
    test_path="C:/Users/Li Xiaokang/Desktop/null_importance_result/test31_RF_filtered_0.05_null_importances_1.csv",
    output_path="C:/Users/Li Xiaokang/Desktop/win31_independent_test_result.xlsx",
    is_sample_aug=1,
    n_estimators=150,
    gamma=0.4,
    subsample=0.9,
)