In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import re

In [2]:
def smote(X, y):
    """Resample ``(X, y)`` with imblearn's ``SMOTE``.

    A new ``SMOTE`` sampler seeded with ``random_state=123`` is created on
    every call, so repeated calls on the same input use the same seed.

    Parameters
    ----------
    X : features passed straight to ``SMOTE.fit_resample``.
    y : labels passed straight to ``SMOTE.fit_resample``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    # Named `sampler` so the local does not shadow this function's own name.
    sampler = SMOTE(random_state=123)
    features_balanced, labels_balanced = sampler.fit_resample(X, y)
    return features_balanced, labels_balanced

In [3]:
def rf_kfold(input_dir, output_path, k_fold_cv, is_sample_aug, n_estimators):
    """Cross-validate a random forest on one CSV and append the metrics to an Excel sheet.

    The CSV is read with its first column as the label and all remaining
    columns as features. Out-of-fold predictions from every fold are pooled
    and a single set of metrics (ACC, MCC, Sensitivity, Specificity, AUC) is
    computed on the pooled predictions, then appended as one row to the
    workbook at ``output_path``.

    The confusion matrix is unpacked into four cells and ``predict_proba``
    column 1 is used as the score, so the label column must contain exactly
    two classes.

    Parameters
    ----------
    input_dir : str
        Path of the input CSV file (despite the name, a file, not a folder).
        If the name contains ``filtered_<tag>.csv``, ``<tag>`` is used in the
        ``method`` label; otherwise the file name without extension is used.
    output_path : str
        Excel workbook the result row is appended to. It is created when it
        does not exist yet.
    k_fold_cv : int
        Number of folds.
    is_sample_aug : bool or int
        Truthy: stratified folds, and each training fold is resampled with
        ``smote`` before fitting (the test fold is never resampled).
        Falsy: plain shuffled ``KFold`` on the data as-is.
    n_estimators : int
        Number of trees in the random forest.

    Returns
    -------
    None
        The result is written to ``output_path``.
    """
    import os  # function-scope import so this cell does not depend on an edited import cell

    # Tag for the result label. The original left `name_result` unbound when
    # the pattern did not match; fall back to the bare file name instead.
    match = re.search(r'filtered_(.*?)\.csv', input_dir)
    if match:
        name_result = match.group(1)
    else:
        name_result = os.path.splitext(os.path.basename(input_dir))[0]

    if is_sample_aug:
        method = 'RF' + '_' + str(int(k_fold_cv)) + '_fold' + '_smote_' + name_result
        splitter = StratifiedKFold(n_splits=k_fold_cv, random_state=123, shuffle=True)
    else:
        method = 'RF' + '_' + str(int(k_fold_cv)) + '_fold_' + name_result
        # NOTE(review): the non-augmented run uses plain KFold, so the two modes
        # are scored on different fold assignments. Kept unchanged so earlier
        # results stay reproducible; confirm whether stratification is wanted here.
        splitter = KFold(n_splits=k_fold_cv, random_state=123, shuffle=True)

    data_val = pd.read_csv(input_dir).values
    X = data_val[:, 1:]
    y = data_val[:, 0]

    # One estimator object is reused; every `fit` call retrains it from scratch.
    model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=4, random_state=123)

    # Collect per-fold outputs in lists and concatenate once after the loop.
    y_test_parts, y_pred_parts, y_proba_parts = [], [], []
    for train_index, test_index in splitter.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if is_sample_aug:
            # Resample the training fold only, so the test fold stays untouched.
            X_train, y_train = smote(X_train, y_train)

        model.fit(X_train, y_train)
        y_test_parts.append(y_test)
        y_pred_parts.append(model.predict(X_test))
        y_proba_parts.append(model.predict_proba(X_test)[:, 1])

    y_test_array = np.concatenate(y_test_parts)
    y_pred_array = np.concatenate(y_pred_parts)
    y_pred_proba_array = np.concatenate(y_proba_parts)

    # Requires a 2x2 matrix, i.e. exactly two classes across labels and predictions.
    tn, fp, fn, tp = confusion_matrix(y_test_array, y_pred_array).ravel()

    result_dict = {
        'method': method,
        'ACC': accuracy_score(y_test_array, y_pred_array),
        'MCC': matthews_corrcoef(y_test_array, y_pred_array),
        'Sensitivity': tp / (tp + fn),
        'Specificity': tn / (tn + fp),
        'AUC': roc_auc_score(y_test_array, y_pred_proba_array),
    }
    result_df = pd.DataFrame(result_dict, index=[0])

    # Append to the existing sheet, or start a new one on the first run.
    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        save_result = result_df
    else:
        save_result = pd.concat([original_data, result_df], axis=0)
    save_result.to_excel(output_path, index=False)

In [19]:
# Append a 5-fold, SMOTE-augmented random-forest run (140 trees) to the k-fold results workbook.
rf_kfold(
    input_dir="C:/Users/Li Xiaokang/Desktop/null_importance_result/train31_RF_filtered_null_importances_0.09.csv",
    output_path="C:/Users/Li Xiaokang/Desktop/win31_kfold_result.xlsx",
    k_fold_cv=5,
    is_sample_aug=True,
    n_estimators=140,
)

In [4]:
def rf_independent_test(train_path, test_path, output_path, is_sample_aug, n_estimators):
    """Train a random forest on one CSV, score it on another, and append the metrics to an Excel sheet.

    Both CSVs are read with the first column as the label and all remaining
    columns as features. ACC, MCC, Sensitivity, Specificity and AUC are
    computed on the test set and appended as one row to the workbook at
    ``output_path``.

    The confusion matrix is unpacked into four cells and ``predict_proba``
    column 1 is used as the score, so the label column must contain exactly
    two classes.

    Parameters
    ----------
    train_path : str
        CSV file used for fitting.
    test_path : str
        CSV file used for evaluation only.
    output_path : str
        Excel workbook the result row is appended to. It is created when it
        does not exist yet.
    is_sample_aug : bool or int
        Truthy: the training set is resampled with ``smote`` before fitting
        and the row is labelled ``'win31_RF_smote'``.
        Falsy: the training set is used as-is and the row is labelled ``'RF_imb'``.
    n_estimators : int
        Number of trees in the random forest.

    Returns
    -------
    None
        The result is written to ``output_path``.
    """
    train_val = pd.read_csv(train_path).values
    test_val = pd.read_csv(test_path).values

    X_train, y_train = train_val[:, 1:], train_val[:, 0]
    X_test, y_test = test_val[:, 1:], test_val[:, 0]

    # The two modes differ only in the row label and the optional resampling step.
    if is_sample_aug:
        method = 'win31_RF_' + 'smote'
        # Only the training data is resampled; the test set is left untouched.
        X_train, y_train = smote(X_train, y_train)
    else:
        method = 'RF_imb'

    model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=4, random_state=123)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Requires a 2x2 matrix, i.e. exactly two classes across labels and predictions.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    result_dict = {
        'method': method,
        'ACC': accuracy_score(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred),
        'Sensitivity': tp / (tp + fn),
        'Specificity': tn / (tn + fp),
        'AUC': roc_auc_score(y_test, y_pred_proba[:, 1]),
    }
    result_df = pd.DataFrame(result_dict, index=[0])

    # Append to the existing sheet, or start a new one on the first run.
    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        save_result = result_df
    else:
        save_result = pd.concat([original_data, result_df], axis=0)
    save_result.to_excel(output_path, index=False)

In [5]:
# Append an independent-test run without SMOTE (140 trees) to the independent-test results workbook.
rf_independent_test(
    train_path="C:/Users/Li Xiaokang/Desktop/null_importances_result/train_test/train_human_0.05.csv",
    test_path="C:/Users/Li Xiaokang/Desktop/null_importances_result/train_test/test_human.csv",
    output_path="C:/Users/Li Xiaokang/Desktop/independent_test_result.xlsx",
    is_sample_aug=False,
    n_estimators=140,
)