In [None]:
# Random Forest
#
# Loads the positive/negative encodings, makes an 80/20 train/test split,
# estimates validation metrics with 5-fold cross-validation on the training
# split, then refits on the whole training split and scores the test split.

# Step 1: Import the required libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score

# Step 2: Load the data and build features (X) and labels (y)
DATA_DIR = Path('/Users/jiaming/Desktop/f5c')  # single place to repoint the dataset

pos1 = pd.read_csv(DATA_DIR / 'pos_encoding_NCP_ND.csv')
neg1 = pd.read_csv(DATA_DIR / 'neg_encoding_NCP_ND.csv')

pos2 = pd.read_csv(DATA_DIR / 'pos_domain_encoding.csv')
neg2 = pd.read_csv(DATA_DIR / 'neg_domain_encoding.csv')

# Put the two encodings side by side (column-wise) for each class.
pos = pd.concat([pos1, pos2], axis=1)
neg = pd.concat([neg1, neg2], axis=1)

# Rows with any missing value are discarded (this includes rows that exist in
# only one of the two concatenated files).
pos = pos.dropna()
neg = neg.dropna()

# Positives are labelled 1, negatives 0.
raw_datas = np.concatenate((pos, neg), axis=0)
raw_labels = np.concatenate(([1] * pos.shape[0], [0] * neg.shape[0]), axis=0)

# Fixed-seed shuffle so the class blocks are interleaved reproducibly.
np.random.seed(1)
indices = np.random.permutation(raw_labels.shape[0])

X = raw_datas[indices, :]
y = raw_labels[indices]

# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Create the Random Forest model
# NOTE(review): n_estimators=1 builds a single tree - confirm this is intended.
rf_model = RandomForestClassifier(n_estimators=1, random_state=42)

# Step 4: Five-fold cross-validation; out-of-fold probability of the positive class
predictions_val = cross_val_predict(rf_model, X_train, y_train, cv=5, method='predict_proba')[:, 1]

# Step 5: Calculate evaluation metrics for the validation predictions
labels_val = (predictions_val > 0.5).astype(int)  # threshold once, reuse below
AUC_val = roc_auc_score(y_train, predictions_val)
ACC_val = accuracy_score(y_train, labels_val)
MCC_val = matthews_corrcoef(y_train, labels_val)
Sn_val = recall_score(y_train, labels_val)
# Specificity is the recall of the negative class (TN / (TN + FP)); precision
# would report TP / (TP + FP) instead.
Sp_val = recall_score(y_train, labels_val, pos_label=0)

# Now, fit the model on the full training set
rf_model.fit(X_train, y_train)

# Step 6: Get predictions for the test set
predictions_test = rf_model.predict_proba(X_test)[:, 1]

# Step 7: Calculate evaluation metrics for the test set
labels_test = (predictions_test > 0.5).astype(int)
AUC_test = roc_auc_score(y_test, predictions_test)
ACC_test = accuracy_score(y_test, labels_test)
MCC_test = matthews_corrcoef(y_test, labels_test)
Sn_test = recall_score(y_test, labels_test)
Sp_test = recall_score(y_test, labels_test, pos_label=0)

# Step 8: Print the results
print("Validation Metrics:")
print("AUC_val: {:.4f}".format(AUC_val))
print("ACC_val: {:.4f}".format(ACC_val))
print("MCC_val: {:.4f}".format(MCC_val))
print("Sn_val: {:.4f}".format(Sn_val))
print("Sp_val: {:.4f}".format(Sp_val))

print("Test Metrics:")
print("AUC_test: {:.4f}".format(AUC_test))
print("ACC_test: {:.4f}".format(ACC_test))
print("MCC_test: {:.4f}".format(MCC_test))
print("Sn_test: {:.4f}".format(Sn_test))
print("Sp_test: {:.4f}".format(Sp_test))

In [None]:
# Naive Bayes
#
# Loads the positive/negative encodings, makes an 80/20 train/test split,
# standardises the features, estimates validation metrics with 5-fold
# cross-validation on the training split, then refits on the whole training
# split and scores the test split.

# Step 1: Import the required libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score
from sklearn.preprocessing import StandardScaler

# Step 2: Load the data and build features (X) and labels (y)
DATA_DIR = Path('/Users/jiaming/Desktop/f5c')  # single place to repoint the dataset

pos1 = pd.read_csv(DATA_DIR / 'pos_encoding_NCP_ND.csv')
neg1 = pd.read_csv(DATA_DIR / 'neg_encoding_NCP_ND.csv')

pos2 = pd.read_csv(DATA_DIR / 'pos_domain_encoding.csv')
neg2 = pd.read_csv(DATA_DIR / 'neg_domain_encoding.csv')

# Put the two encodings side by side (column-wise) for each class.
pos = pd.concat([pos1, pos2], axis=1)
neg = pd.concat([neg1, neg2], axis=1)

# Rows with any missing value are discarded.
pos = pos.dropna()
neg = neg.dropna()

# Positives are labelled 1, negatives 0.
raw_datas = np.concatenate((pos, neg), axis=0)
raw_labels = np.concatenate(([1] * pos.shape[0], [0] * neg.shape[0]), axis=0)

# Fixed-seed shuffle so the class blocks are interleaved reproducibly.
np.random.seed(1)
indices = np.random.permutation(raw_labels.shape[0])

X = raw_datas[indices, :]
y = raw_labels[indices]

# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the data (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Create the Naive Bayes model
model = GaussianNB()

# Step 5: Five-fold cross-validation; out-of-fold probability of the positive class.
# Scoring the fitted model on its own training rows would report training
# performance, not validation performance, so out-of-fold predictions are used.
# NOTE: the scaler above was fit on the whole training split, so held-out fold
# rows contributed to the scaling statistics; wrap scaler + model in a Pipeline
# if strictly fold-clean estimates are required.
predictions_val = cross_val_predict(model, X_train_scaled, y_train, cv=5, method='predict_proba')[:, 1]

# Step 6: Calculate evaluation metrics for the validation predictions
labels_val = (predictions_val > 0.5).astype(int)  # threshold once, reuse below
AUC_val = roc_auc_score(y_train, predictions_val)
ACC_val = accuracy_score(y_train, labels_val)
MCC_val = matthews_corrcoef(y_train, labels_val)
Sn_val = recall_score(y_train, labels_val)
# Specificity is the recall of the negative class (TN / (TN + FP)); precision
# would report TP / (TP + FP) instead.
Sp_val = recall_score(y_train, labels_val, pos_label=0)

# Step 7: Train the model on the full training set
model.fit(X_train_scaled, y_train)

# Step 8: Get predictions for the test set
predictions_test = model.predict_proba(X_test_scaled)[:, 1]

# Step 9: Calculate evaluation metrics for the test set
labels_test = (predictions_test > 0.5).astype(int)
AUC_test = roc_auc_score(y_test, predictions_test)
ACC_test = accuracy_score(y_test, labels_test)
MCC_test = matthews_corrcoef(y_test, labels_test)
Sn_test = recall_score(y_test, labels_test)
Sp_test = recall_score(y_test, labels_test, pos_label=0)

# Step 10: Print the results
print("Validation Metrics:")
print("AUC_val: {:.4f}".format(AUC_val))
print("ACC_val: {:.4f}".format(ACC_val))
print("MCC_val: {:.4f}".format(MCC_val))
print("Sn_val: {:.4f}".format(Sn_val))
print("Sp_val: {:.4f}".format(Sp_val))

print("Test Metrics:")
print("AUC_test: {:.4f}".format(AUC_test))
print("ACC_test: {:.4f}".format(ACC_test))
print("MCC_test: {:.4f}".format(MCC_test))
print("Sn_test: {:.4f}".format(Sn_test))
print("Sp_test: {:.4f}".format(Sp_test))

In [None]:
# Logistic Regression
#
# Loads the positive/negative encodings, makes an 80/20 train/test split,
# standardises the features, estimates validation metrics with 5-fold
# cross-validation on the training split, then refits on the whole training
# split and scores the test split.

# Step 1: Import the required libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score
from sklearn.preprocessing import StandardScaler  # used to standardise the features

# Step 2: Load the data and build features (X) and labels (y)
DATA_DIR = Path('/Users/jiaming/Desktop/f5c')  # single place to repoint the dataset

pos1 = pd.read_csv(DATA_DIR / 'pos_encoding_NCP_ND.csv')
neg1 = pd.read_csv(DATA_DIR / 'neg_encoding_NCP_ND.csv')

pos2 = pd.read_csv(DATA_DIR / 'pos_domain_encoding.csv')
neg2 = pd.read_csv(DATA_DIR / 'neg_domain_encoding.csv')

# Put the two encodings side by side (column-wise) for each class.
pos = pd.concat([pos1, pos2], axis=1)
neg = pd.concat([neg1, neg2], axis=1)

# Rows with any missing value are discarded.
pos = pos.dropna()
neg = neg.dropna()

# Positives are labelled 1, negatives 0.
raw_datas = np.concatenate((pos, neg), axis=0)
raw_labels = np.concatenate(([1] * pos.shape[0], [0] * neg.shape[0]), axis=0)

# Fixed-seed shuffle so the class blocks are interleaved reproducibly.
np.random.seed(1)
indices = np.random.permutation(raw_labels.shape[0])

X = raw_datas[indices, :]
y = raw_labels[indices]

# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the data (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Create the Logistic Regression model and set max_iter
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Step 5: Five-fold cross-validation; out-of-fold probability of the positive class
# NOTE: the scaler above was fit on the whole training split, so held-out fold
# rows contributed to the scaling statistics; wrap scaler + model in a Pipeline
# if strictly fold-clean estimates are required.
predictions_val = cross_val_predict(lr_model, X_train_scaled, y_train, cv=5, method='predict_proba')[:, 1]

# Step 6: Calculate evaluation metrics for the validation predictions
labels_val = (predictions_val > 0.5).astype(int)  # threshold once, reuse below
AUC_val = roc_auc_score(y_train, predictions_val)
ACC_val = accuracy_score(y_train, labels_val)
MCC_val = matthews_corrcoef(y_train, labels_val)
Sn_val = recall_score(y_train, labels_val)
# Specificity is the recall of the negative class (TN / (TN + FP)); precision
# would report TP / (TP + FP) instead.
Sp_val = recall_score(y_train, labels_val, pos_label=0)

# Now, fit the model on the full training set.
# The scaled features must be used here and at prediction time so the final
# model sees the same representation that was cross-validated above.
lr_model.fit(X_train_scaled, y_train)

# Step 7: Get predictions for the test set
predictions_test = lr_model.predict_proba(X_test_scaled)[:, 1]

# Step 8: Calculate evaluation metrics for the test set
labels_test = (predictions_test > 0.5).astype(int)
AUC_test = roc_auc_score(y_test, predictions_test)
ACC_test = accuracy_score(y_test, labels_test)
MCC_test = matthews_corrcoef(y_test, labels_test)
Sn_test = recall_score(y_test, labels_test)
Sp_test = recall_score(y_test, labels_test, pos_label=0)

# Step 9: Print the results
print("Validation Metrics:")
print("AUC_val: {:.4f}".format(AUC_val))
print("ACC_val: {:.4f}".format(ACC_val))
print("MCC_val: {:.4f}".format(MCC_val))
print("Sn_val: {:.4f}".format(Sn_val))
print("Sp_val: {:.4f}".format(Sp_val))

print("Test Metrics:")
print("AUC_test: {:.4f}".format(AUC_test))
print("ACC_test: {:.4f}".format(ACC_test))
print("MCC_test: {:.4f}".format(MCC_test))
print("Sn_test: {:.4f}".format(Sn_test))
print("Sp_test: {:.4f}".format(Sp_test))

In [None]:
# XGBoost optimized
#
# Loads the positive/negative encodings, makes an 80/20 train/test split,
# standardises the features, carves a validation hold-out from the training
# split for early stopping and validation metrics, and scores the untouched
# test split at the end.

# Step 1: Import the required libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, matthews_corrcoef, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Step 2: Load the data and build features (X) and labels (y)
DATA_DIR = Path('/Users/jiaming/Desktop/f5c')  # single place to repoint the dataset

pos1 = pd.read_csv(DATA_DIR / 'pos_encoding_NCP_ND.csv')
neg1 = pd.read_csv(DATA_DIR / 'neg_encoding_NCP_ND.csv')

pos2 = pd.read_csv(DATA_DIR / 'pos_domain_encoding.csv')
neg2 = pd.read_csv(DATA_DIR / 'neg_domain_encoding.csv')

# Put the two encodings side by side (column-wise) for each class.
pos = pd.concat([pos1, pos2], axis=1)
neg = pd.concat([neg1, neg2], axis=1)

# Rows with any missing value are discarded.
pos = pos.dropna()
neg = neg.dropna()

# Positives are labelled 1, negatives 0.
raw_datas = np.concatenate((pos, neg), axis=0)
raw_labels = np.concatenate(([1] * pos.shape[0], [0] * neg.shape[0]), axis=0)

# Fixed-seed shuffle so the class blocks are interleaved reproducibly.
np.random.seed(1)
indices = np.random.permutation(raw_labels.shape[0])

X = raw_datas[indices, :]
y = raw_labels[indices]

# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Scale the data (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Carve a validation hold-out from the training split. It is used both for
# early stopping and for the validation metrics, so the test split stays
# unseen until the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# Step 4: Create the XGBoost model
# NOTE(review): with n_estimators=1 there is only one boosting round, so
# early_stopping_rounds has nothing to act on - confirm these values are intended.
xgb_model = xgb.XGBClassifier(
    n_estimators=1,
    max_depth=2,
    learning_rate=0.0001,
    gamma=0.1,
    reg_alpha=10.0,      # L1 regularization
    reg_lambda=100.0,     # L2 regularization
    subsample=0.5,
    colsample_bytree=0.2,
    random_state=42,
    eval_metric=["auc", "logloss"],
    early_stopping_rounds=1
)

# Step 5: Train the model
# The last eval_set entry drives early stopping, so it must be the validation
# hold-out and never the test split.
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_fit, y_fit), (X_val, y_val)],
    verbose=1
)

# Step 6: Get predictions for the validation hold-out
predictions_val = xgb_model.predict_proba(X_val)[:, 1]

# Step 7: Calculate evaluation metrics for the validation hold-out
labels_val = (predictions_val > 0.5).astype(int)  # threshold once, reuse below
AUC_val = roc_auc_score(y_val, predictions_val)
ACC_val = accuracy_score(y_val, labels_val)
MCC_val = matthews_corrcoef(y_val, labels_val)
Sn_val = recall_score(y_val, labels_val)
# Specificity is the recall of the negative class (TN / (TN + FP)); precision
# would report TP / (TP + FP) instead.
Sp_val = recall_score(y_val, labels_val, pos_label=0)

# Step 8: Get predictions for the test set
predictions_test = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Step 9: Calculate evaluation metrics for the test set
labels_test = (predictions_test > 0.5).astype(int)
AUC_test = roc_auc_score(y_test, predictions_test)
ACC_test = accuracy_score(y_test, labels_test)
MCC_test = matthews_corrcoef(y_test, labels_test)
Sn_test = recall_score(y_test, labels_test)
Sp_test = recall_score(y_test, labels_test, pos_label=0)

# Step 10: Print the results
print("Validation Metrics:")
print("AUC_val: {:.4f}".format(AUC_val))
print("ACC_val: {:.4f}".format(ACC_val))
print("MCC_val: {:.4f}".format(MCC_val))
print("Sn_val: {:.4f}".format(Sn_val))
print("Sp_val: {:.4f}".format(Sp_val))

print("Test Metrics:")
print("AUC_test: {:.4f}".format(AUC_test))
print("ACC_test: {:.4f}".format(ACC_test))
print("MCC_test: {:.4f}".format(MCC_test))
print("Sn_test: {:.4f}".format(Sn_test))
print("Sp_test: {:.4f}".format(Sp_test))