In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import re

In [2]:
def smote(X, y):
    """Balance a training set by oversampling it with SMOTE.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``SMOTE.fit_resample``.
    y : array-like
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as produced by ``SMOTE.fit_resample``.
    """
    # Fixed seed so repeated runs generate the same synthetic samples.
    oversampler = SMOTE(random_state=123)
    balanced_X, balanced_y = oversampler.fit_resample(X, y)
    return balanced_X, balanced_y

In [14]:
def lr_kfold(input_dir, output_path, k_fold_cv):
    """Cross-validate a SMOTE + logistic-regression model and log the mean metrics.

    The CSV at ``input_dir`` is read with the first column as the class label
    and the remaining columns as features. For each stratified fold the
    training part is oversampled with ``smote`` (the test part is left
    untouched), a logistic regression is fitted, and accuracy, MCC,
    sensitivity, specificity and ROC AUC are computed on the test part.
    The fold-averaged metrics are appended as one row to the Excel workbook at
    ``output_path``; the workbook is created if it does not exist yet.

    Parameters
    ----------
    input_dir : str
        Path of the input CSV file (despite the name, a file, not a directory).
    output_path : str
        Path of the Excel workbook that accumulates result rows.
    k_fold_cv : int
        Number of stratified folds.

    Notes
    -----
    Assumes a binary classification problem: the confusion matrix is unpacked
    into exactly four cells and the second ``predict_proba`` column is used as
    the positive-class score.
    """
    name_result = _result_name_from_path(input_dir)

    data = pd.read_csv(input_dir)
    data_val = data.values
    X = data_val[:, 1:]
    y = data_val[:, 0]

    skf = StratifiedKFold(n_splits=k_fold_cv, random_state=123, shuffle=True)
    smote_lr_model = linear_model.LogisticRegression(max_iter=100000, n_jobs=4, random_state=123)

    # One row per fold; columns: ACC, MCC, sensitivity, specificity, AUC.
    smote_metrics_result = np.zeros((k_fold_cv, 5))

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Oversample the training part only, so no synthetic samples leak
        # into the evaluation data.
        smote_x, smote_y = smote(X_train, y_train)

        # Fit once per fold; the original fitted the same model twice here.
        smote_lr_model.fit(smote_x, smote_y)
        smote_y_pred = smote_lr_model.predict(X_test)
        smote_y_pred_proba = smote_lr_model.predict_proba(X_test)

        smote_tn, smote_fp, smote_fn, smote_tp = confusion_matrix(y_test, smote_y_pred).ravel()

        smote_metrics_result[fold, 0] = accuracy_score(y_test, smote_y_pred)
        smote_metrics_result[fold, 1] = matthews_corrcoef(y_test, smote_y_pred)
        smote_metrics_result[fold, 2] = smote_tp / (smote_tp + smote_fn)
        smote_metrics_result[fold, 3] = smote_tn / (smote_tn + smote_fp)
        smote_metrics_result[fold, 4] = roc_auc_score(y_test, smote_y_pred_proba[:, 1])

    smote_metrics_results_mean = smote_metrics_result.mean(axis=0)

    smote_result_dict = {
        'method': 'LR' + '_' + str(int(k_fold_cv)) + '_fold' + '_smote_' + name_result,
        'ACC': smote_metrics_results_mean[0],
        'MCC': smote_metrics_results_mean[1],
        'Sensitivity': smote_metrics_results_mean[2],
        'Specificity': smote_metrics_results_mean[3],
        'AUC': smote_metrics_results_mean[4],
    }
    smote_result_df = pd.DataFrame(smote_result_dict, index=[0])

    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        # First run: no workbook yet, so the new row becomes the whole sheet
        # instead of losing the computed metrics to an exception.
        smote_save_result = smote_result_df
    else:
        smote_save_result = pd.concat([original_data, smote_result_df], axis=0)

    smote_save_result.to_excel(output_path, index=False)


def _result_name_from_path(input_dir):
    """Return the label embedded in a ``filtered_<label>.csv`` file path.

    When the path does not follow that pattern, fall back to the file name
    without its directory and extension so the caller always gets a label.
    """
    match = re.search(r'filtered_(.*?)\.csv', input_dir)
    if match:
        return match.group(1)
    base_name = re.split(r'[\\/]', input_dir)[-1]
    return base_name.rsplit('.', 1)[0]

In [17]:
# 5-fold cross-validation on the filtered training set; the averaged metrics
# are appended to the k-fold results workbook.
lr_kfold(
    input_dir="C:/Users/Li Xiaokang/Desktop/null_importance_result/win31_RF_filtered_0.05_train_PCC_1.csv",
    output_path="C:/Users/Li Xiaokang/Desktop/win31_kfold_result.xlsx",
    k_fold_cv=5,
)

In [4]:
def lr_independent_test(train_path, test_path, output_path, method_name='win31_LR_' + 'smote'):
    """Train SMOTE + logistic regression on one CSV and evaluate on another.

    Both CSV files are read with the first column as the class label and the
    remaining columns as features. The training set is oversampled with
    ``smote``, a logistic regression is fitted, and accuracy, MCC,
    sensitivity, specificity and ROC AUC on the test set are appended as one
    row to the Excel workbook at ``output_path``; the workbook is created if
    it does not exist yet.

    Parameters
    ----------
    train_path : str
        Path of the training CSV file.
    test_path : str
        Path of the independent test CSV file.
    output_path : str
        Path of the Excel workbook that accumulates result rows.
    method_name : str, optional
        Value written to the ``method`` column of the result row. Defaults to
        the label that was previously hard-coded.

    Notes
    -----
    Assumes a binary classification problem: the confusion matrix is unpacked
    into exactly four cells and the second ``predict_proba`` column is used as
    the positive-class score.
    """
    train_val = pd.read_csv(train_path).values
    test_val = pd.read_csv(test_path).values

    X_train, y_train = train_val[:, 1:], train_val[:, 0]
    X_test, y_test = test_val[:, 1:], test_val[:, 0]

    # Only the training data is oversampled; the test set stays as recorded.
    smote_x, smote_y = smote(X_train, y_train)

    smote_lr_model = linear_model.LogisticRegression(max_iter=100000, n_jobs=4, random_state=123)
    smote_lr_model.fit(smote_x, smote_y)

    smote_y_pred = smote_lr_model.predict(X_test)
    smote_y_pred_proba = smote_lr_model.predict_proba(X_test)

    smote_tn, smote_fp, smote_fn, smote_tp = confusion_matrix(y_test, smote_y_pred).ravel()

    smote_result_dict = {
        'method': method_name,
        'ACC': accuracy_score(y_test, smote_y_pred),
        'MCC': matthews_corrcoef(y_test, smote_y_pred),
        'Sensitivity': smote_tp / (smote_tp + smote_fn),
        'Specificity': smote_tn / (smote_tn + smote_fp),
        'AUC': roc_auc_score(y_test, smote_y_pred_proba[:, 1]),
    }
    smote_result_df = pd.DataFrame(smote_result_dict, index=[0])

    try:
        original_data = pd.read_excel(output_path)
    except FileNotFoundError:
        # First run: no workbook yet, so the new row becomes the whole sheet
        # instead of losing the computed metrics to an exception.
        smote_save_result = smote_result_df
    else:
        smote_save_result = pd.concat([original_data, smote_result_df], axis=0)

    smote_save_result.to_excel(output_path, index=False)

In [6]:
# Fit on the filtered training set, score on the independent test set, and
# append the metrics to the independent-test results workbook.
lr_independent_test(
    train_path="C:/Users/Li Xiaokang/Desktop/null_importance_result/win31_RF_filtered_0.05_train_PCC_1.csv",
    test_path="C:/Users/Li Xiaokang/Desktop/null_importance_result/test31_RF_filtered_0.05_null_importances_1.csv",
    output_path="C:/Users/Li Xiaokang/Desktop/win31_independent_test_result.xlsx",
)