In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score
import xgboost as xgb

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
#data = pd.read_csv('your_data.csv')
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_TGTGC.csv')
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='macro'),
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            results[metric_name][i] = metric_fn(y_test, y_pred)
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


AUC: 0.8113 +/- 0.0156
Accuracy: 0.9338 +/- 0.0053
Precision: 0.7472 +/- 0.0314
Recall: 0.8113 +/- 0.0156
AUPRC: 0.7746 +/- 0.0252
F1 score: 0.6960 +/- 0.0278
MCC: 0.6611 +/- 0.0305


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score
#import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
#data = pd.read_csv('your_data.csv')
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_TGTGC.csv')
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='macro'),
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            results[metric_name][i] = metric_fn(y_test, y_pred)
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


AUC: 0.7689 +/- 0.0101
Accuracy: 0.9298 +/- 0.0030
Precision: 0.7756 +/- 0.0249
Recall: 0.7689 +/- 0.0101
AUPRC: 0.7478 +/- 0.0242
F1 score: 0.6494 +/- 0.0156
MCC: 0.6219 +/- 0.0159


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score
#import xgboost as xgb
from sklearn.svm import SVC

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
#data = pd.read_csv('your_data.csv')
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_TGTGC.csv')
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='macro'),
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = SVC(probability=True)
    model.fit(X_train, y_train)
    
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            results[metric_name][i] = metric_fn(y_test, y_pred)
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


  _warn_prf(average, modifier, msg_start, len(result))


TypeError: average_precision_score() got an unexpected keyword argument 'zero_division'

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
#from sklearn.svm import SVC
#from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_TGTGC.csv')

# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average='binary', zero_division=1),
    'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='macro'),
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train a SVM model on the training data for this fold
   # model = SVC(probability=True)
    #model=LogisticRegression(random_state=42)
    model=KNeighborsClassifier()
    model.fit(X_train, y_train)
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            if np.sum(y_pred) == 0:  # If there are no positive predictions, return the default value
                results[metric_name][i] = 0.0
            else:
                results[metric_name][i] = metric_fn(y_test, y_pred)
                
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


AUC: 0.5075 +/- 0.0054
Accuracy: 0.8740 +/- 0.0030
Precision: 0.2097 +/- 0.0729
Recall: 0.5075 +/- 0.0054
AUPRC: 0.1307 +/- 0.0114
F1 score: 0.0519 +/- 0.0181
MCC: 0.0376 +/- 0.0275


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score
#import xgboost as xgb
from sklearn.svm import SVC

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
#data = pd.read_csv('your_data.csv')
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/all_GATGA.csv')
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='macro'),
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = SVC(probability=True)
    model.fit(X_train, y_train)
    
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            results[metric_name][i] = metric_fn(y_test, y_pred)
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')


AUC: 0.5196 +/- 0.0139
Accuracy: 0.9033 +/- 0.0154
Precision: 0.7667 +/- 0.2906
Recall: 0.5196 +/- 0.0139
AUPRC: 0.3400 +/- 0.0568
F1 score: 0.0763 +/- 0.0485
MCC: 0.1576 +/- 0.0848


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, average_precision_score
#import xgboost as xgb
from sklearn.svm import SVC

# Load your data into a pandas DataFrame, assuming that the target variable is in the 'target' column
#data = pd.read_csv('your_data.csv')
data = pd.read_csv('/home/jiayi/5moU/data/DL_fromTombo/AGTTC_0.csv')
# Define the number of folds for cross-validation
n_folds = 5

# Define the evaluation metrics to use
metrics = {
    'AUC': roc_auc_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='macro'),
    'AUPRC': average_precision_score,
    'F1 score': f1_score,
    'MCC': matthews_corrcoef
}

# Initialize arrays to store the evaluation metrics for each fold
results = {}
for metric in metrics:
    results[metric] = np.zeros(n_folds)

# Define the K-fold cross-validation object
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Loop over the folds
for i, (train_idx, test_idx) in enumerate(kf.split(data)):
    # Split the data into training and testing sets for this fold
    X_train, y_train = data.iloc[train_idx, 2:-1], data.iloc[train_idx, -1]
    X_test, y_test = data.iloc[test_idx, 2:-1], data.iloc[test_idx, -1]
    
    # Train an XGBoost model on the training data for this fold
    model = SVC(probability=True)
    model.fit(X_train, y_train)
    
    
    # Predict the probabilities and labels for the testing data
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate the evaluation metrics for this fold
    for metric_name, metric_fn in metrics.items():
        if metric_name == 'AUPRC':
            results[metric_name][i] = metric_fn(y_test, y_prob)
        else:
            results[metric_name][i] = metric_fn(y_test, y_pred)
    
# Print the mean and standard deviation of each evaluation metric across all folds
for metric_name in metrics:
    print(f'{metric_name}: {results[metric_name].mean():.4f} +/- {results[metric_name].std():.4f}')
