In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
import xgboost as xgb

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_AGTTC.csv')

# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        results[metric_name][i] = metric_fn(y_test, y_pred, average='binary')
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


ValueError: average has to be one of (None, 'micro', 'macro', 'weighted', 'samples')

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
import xgboost as xgb

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_AGTTC.csv')

In [7]:
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)


In [8]:
# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        results[metric_name][i] = metric_fn(y_test, y_pred, average='binary')
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


ValueError: average has to be one of (None, 'micro', 'macro', 'weighted', 'samples')

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score
import xgboost as xgb

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
#data = pd.read_csv('your_data.csv')
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_AGTTC.csv')
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    #X_train, y_train = data.iloc[train_idx, :-1], data.iloc[train_idx, -1]
    #X_test, y_test = data.iloc[test_idx, :-1], data.iloc[test_idx, -1]
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            results[metric_name][i] = metric_fn(y_test, y_pred, average='macro')
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


TypeError: accuracy_score() got an unexpected keyword argument 'average'

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score
import xgboost as xgb

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
#data = pd.read_csv('your_data.csv')
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_AGTTC.csv')
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='macro'),
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            results[metric_name][i] = metric_fn(y_test, y_pred)
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


AUC: 0.9567 +/- 0.0070
Accuracy: 0.9852 +/- 0.0025
Precision: 0.9544 +/- 0.0339
Recall: 0.9567 +/- 0.0070
AUPRC: 0.9810 +/- 0.0074
F1 score: 0.9358 +/- 0.0143
MCC: 0.9280 +/- 0.0158
