In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

In [2]:
def smote(X, y, random_state=123):
    """Resample ``(X, y)`` with imbalanced-learn's ``SMOTE`` over-sampler.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``SMOTE.fit_resample``.
    y : array-like
        Labels handed to ``SMOTE.fit_resample``.
    random_state : int, optional
        Seed forwarded to ``SMOTE``. Defaults to 123, the value that used to
        be hard-coded here, so existing two-argument calls behave as before.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` exactly as produced by
        ``SMOTE.fit_resample``.
    """
    # Named `sampler` rather than `smote` so the local does not shadow this
    # function's own name.
    sampler = SMOTE(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [5]:
# Read the whitespace-delimited train / test matrices from disk.
train_data = np.loadtxt("../ubiquitination/null_importances_result/train_test/train_pca.txt")
test_data = np.loadtxt("../ubiquitination/null_importances_result/train_test/test_pca.txt")

# Column 0 is used as the label vector; every remaining column is a feature.
train_y, train_X = train_data[:, 0], train_data[:, 1:]
test_y, test_X = test_data[:, 0], test_data[:, 1:]

In [8]:
def dt_kfold(X, y, output_path, k_fold_cv, max_depth):
    """Cross-validate a SMOTE-balanced decision tree and append the metrics to an Excel file.

    For every stratified fold the training part is resampled with ``smote``,
    a ``DecisionTreeClassifier`` is refit on the resampled data, and the
    predictions for the untouched held-out part are collected. Metrics are
    computed once over the pooled out-of-fold predictions.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; must support indexing with integer index arrays.
    y : numpy.ndarray
        Label vector aligned with ``X``. The confusion matrix is unpacked
        into four cells and column 1 of ``predict_proba`` is used as the
        score, so exactly two classes are expected.
    output_path : str
        Excel file the result row is appended to. If the file does not exist
        yet it is created containing only the new row.
    k_fold_cv : int
        Number of stratified folds.
    max_depth : int or None
        ``max_depth`` passed to the decision tree.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    result_row = {'method': f"DT_{int(k_fold_cv)}_kfold_smote"}

    skf = StratifiedKFold(n_splits=k_fold_cv, random_state=123, shuffle=True)
    model = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=123)

    # Collect per-fold pieces in lists and concatenate once after the loop
    # instead of re-allocating the pooled arrays on every iteration.
    fold_true, fold_pred, fold_proba = [], [], []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Resample only the training part; the held-out part stays untouched.
        smote_x, smote_y = smote(X_train, y_train)
        model.fit(smote_x, smote_y)

        fold_true.append(y_test)
        fold_pred.append(model.predict(X_test))
        fold_proba.append(model.predict_proba(X_test)[:, 1])

    y_test_array = np.concatenate(fold_true)
    y_pred_array = np.concatenate(fold_pred)
    y_pred_proba_1 = np.concatenate(fold_proba)

    result_row['ACC'] = accuracy_score(y_test_array, y_pred_array)
    result_row['MCC'] = matthews_corrcoef(y_test_array, y_pred_array)

    tn, fp, fn, tp = confusion_matrix(y_test_array, y_pred_array).ravel()
    result_row['Sensitivity'] = tp / (tp + fn)
    result_row['Specificity'] = tn / (tn + fp)

    result_row['AUC'] = roc_auc_score(y_test_array, y_pred_proba_1)

    new_row_df = pd.DataFrame(result_row, index=[0])

    # A missing results file used to raise here, after all the CV work was
    # already done; start a fresh sheet in that case instead.
    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        save_result = new_row_df
    else:
        save_result = pd.concat([original_data, new_row_df], axis=0)

    save_result.to_excel(output_path, index=False)

In [10]:
# 5-fold cross-validation on the training split with a depth-3 tree.
# NOTE(review): the output path is absolute and user-specific; consider
# moving it to a configurable location.
dt_kfold(
    X=train_X,
    y=train_y,
    output_path="C:/Users/Li Xiaokang/Desktop/kfold_result.xlsx",
    k_fold_cv=5,
    max_depth=3,
)

In [11]:
def dt_independent_test(X_train, y_train, X_test, y_test, output_path, max_depth):
    """Train a SMOTE-balanced decision tree and append its test-set metrics to an Excel file.

    The training data is resampled with ``smote``, a
    ``DecisionTreeClassifier`` is fit on the resampled data, and the metrics
    are computed on the supplied test set, which is not resampled.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels; passed through ``smote`` before fitting.
    X_test, y_test : array-like
        Held-out features and labels used for evaluation. The confusion
        matrix is unpacked into four cells and column 1 of ``predict_proba``
        is used as the score, so exactly two classes are expected.
    output_path : str
        Excel file the result row is appended to. If the file does not exist
        yet it is created containing only the new row.
    max_depth : int or None
        ``max_depth`` passed to the decision tree.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    result_row = {'method': 'win31_DT_smote'}

    # Only the training data is resampled; the test set is evaluated as given.
    smote_x, smote_y = smote(X_train, y_train)

    model = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=123)
    model.fit(smote_x, smote_y)

    y_pred = model.predict(X_test)

    result_row['ACC'] = accuracy_score(y_test, y_pred)
    result_row['MCC'] = matthews_corrcoef(y_test, y_pred)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    result_row['Sensitivity'] = tp / (tp + fn)
    result_row['Specificity'] = tn / (tn + fp)

    y_pred_proba = model.predict_proba(X_test)
    result_row['AUC'] = roc_auc_score(y_test, y_pred_proba[:, 1])

    new_row_df = pd.DataFrame(result_row, index=[0])

    # A missing results file used to raise here, after training and scoring
    # were already done; start a fresh sheet in that case instead.
    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        save_result = new_row_df
    else:
        save_result = pd.concat([original_data, new_row_df], axis=0)

    save_result.to_excel(output_path, index=False)

In [12]:
# Fit on the full training split and evaluate once on the independent test
# split with a depth-3 tree.
# NOTE(review): the output path is absolute and user-specific; consider
# moving it to a configurable location.
dt_independent_test(
    X_train=train_X,
    y_train=train_y,
    X_test=test_X,
    y_test=test_y,
    output_path="C:/Users/Li Xiaokang/Desktop/independent_test_result.xlsx",
    max_depth=3,
)