In [40]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
import math
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, accuracy_score, f1_score, matthews_corrcoef,recall_score, precision_score
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import TensorDataset, DataLoader, random_split

In [41]:
loaded_datasets_info = torch.load('/root/autodl-tmp/imgs/SWE/saved_datasets_SWE.pth')
loaded_train_dataset = loaded_datasets_info['train_dataset']
loaded_val_dataset = loaded_datasets_info['val_dataset']
loaded_test_dataset = loaded_datasets_info['test_dataset']

  loaded_datasets_info = torch.load('/root/autodl-tmp/imgs/SWE/saved_datasets_SWE.pth')


In [42]:
from torch.utils.data import DataLoader

def extract_features_labels_from_subset(subset):
    
    loader = DataLoader(subset, batch_size=len(subset))
    
    for features, labels in loader:
        features = features.numpy().reshape(features.shape[0], -1)
        labels = labels.squeeze(-1).numpy()
        return features, labels

X_train, y_train = extract_features_labels_from_subset(loaded_train_dataset)
X_val, y_val = extract_features_labels_from_subset(loaded_val_dataset)
X_test, y_test = extract_features_labels_from_subset(loaded_test_dataset)

# 1. XGB

In [43]:
import xgboost as xgb
import matplotlib.pyplot as plt

In [44]:
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',  
    learning_rate=0.1,
    max_depth=5,
    n_estimators=30)

xgb_model.fit(X_train, y_train)

In [45]:
# torch.save(xgb_model.state_dict(), f'/Users/jiaming/Desktop/Lab2/datas/ROC/xgb_model.pth') # only used for nn
# xgb_model.save_model('/Users/jiaming/Desktop/Lab2/datas/ROC/models/xgb_model.json') 

In [46]:
y_pred = xgb_model.predict(X_val)
AUC_val = roc_auc_score(y_val, y_pred)
ACC_val = accuracy_score(y_val, (y_pred > 0.5).astype(int))
MCC_val = matthews_corrcoef(y_val, (y_pred > 0.5).astype(int))
Sn_val = recall_score(y_val, (y_pred > 0.5).astype(int))
Sp_val = precision_score(y_val, (y_pred > 0.5).astype(int))
F1_val = f1_score(y_val, (y_pred > 0.5).astype(int))
print(AUC_val, ACC_val, MCC_val, Sn_val, Sp_val, F1_val)

0.9090909090909091 0.828125 0.6598147103950808 0.7878787878787878 0.8666666666666667 0.8253968253968254


In [47]:
np.save('/root/autodl-tmp/ROC/SWE/ML/XGB/y_val_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/XGB/y_val.npy', y_val)

In [48]:
y_pred = xgb_model.predict(X_test)
AUC_test = roc_auc_score(y_test, y_pred)
ACC_test = accuracy_score(y_test, (y_pred > 0.5).astype(int))
MCC_test = matthews_corrcoef(y_test, (y_pred > 0.5).astype(int))
Sn_test = recall_score(y_test, (y_pred > 0.5).astype(int))
Sp_test = precision_score(y_test, (y_pred > 0.5).astype(int))
F1_test = f1_score(y_test, (y_pred > 0.5).astype(int))
print(AUC_test, ACC_test, MCC_test, Sn_test, Sp_test, F1_test)

0.9568480300187617 0.9 0.8006294769296238 0.926829268292683 0.8837209302325582 0.9047619047619047


In [49]:
np.save('/root/autodl-tmp/ROC/SWE/ML/XGB/y_test_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/XGB/y_test.npy', y_test)

# 2. SVM

In [50]:
from sklearn.svm import SVC

In [51]:
svm_model = SVC(C=3.0, probability=True)
svm_model.fit(X_train, y_train)

In [52]:
# from joblib import dump, load

# Assuming svm_model is your trained SVM model
# dump(svm_model, '/Users/jiaming/Desktop/Lab2/datas/ROC/models/svm_model.joblib')

# To load the model back from the file
# svm_model = load('/Users/jiaming/Desktop/Lab2/datas/ROC/models/svm_model.joblib')

In [53]:
y_pred = svm_model.predict_proba(X_val)[:, 1]
AUC_val = roc_auc_score(y_val, y_pred)
ACC_val = accuracy_score(y_val, (y_pred > 0.5).astype(int))
MCC_val = matthews_corrcoef(y_val, (y_pred > 0.5).astype(int))
Sn_val = recall_score(y_val, (y_pred > 0.5).astype(int))
Sp_val = precision_score(y_val, (y_pred > 0.5).astype(int))
F1_val = f1_score(y_val, (y_pred > 0.5).astype(int))
print(AUC_val, ACC_val, MCC_val, Sn_val, Sp_val, F1_val)

0.9765395894428153 0.9375 0.8813677252609423 1.0 0.8918918918918919 0.9428571428571428


In [54]:
np.save('/root/autodl-tmp/ROC/SWE/ML/SVM/y_val_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/SVM/y_val.npy', y_val)

In [55]:
y_pred = svm_model.predict_proba(X_test)[:, 1]
AUC_test = roc_auc_score(y_test, y_pred)
ACC_test = accuracy_score(y_test, (y_pred > 0.5).astype(int))
MCC_test = matthews_corrcoef(y_test, (y_pred > 0.5).astype(int))
Sn_test = recall_score(y_test, (y_pred > 0.5).astype(int))
Sp_test = precision_score(y_test, (y_pred > 0.5).astype(int))
F1_test = f1_score(y_test, (y_pred > 0.5).astype(int))
print(AUC_test, ACC_test, MCC_test, Sn_test, Sp_test, F1_test)

0.9968730456535335 0.95 0.90094327200303 0.975609756097561 0.9302325581395349 0.9523809523809524


In [56]:
np.save('/root/autodl-tmp/ROC/SWE/ML/SVM/y_test_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/SVM/y_test.npy', y_test)

# 3. RF

In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [59]:
# rf_model.save_model('/Users/jiaming/Desktop/Lab2/datas/ROC/models/rf_model.json') 
# from joblib import dump, load

# dump(rf_model, '/Users/jiaming/Desktop/Lab2/datas/ROC/models/rf_model.joblib')

# To load the model back from the file
# rf_model = load('/Users/jiaming/Desktop/Lab2/datas/ROC/models/rf_model.joblib')

In [60]:
y_pred = rf_model.predict_proba(X_val)[:, 1]
AUC_val = roc_auc_score(y_val, y_pred)
ACC_val = accuracy_score(y_val, (y_pred > 0.5).astype(int))
MCC_val = matthews_corrcoef(y_val, (y_pred > 0.5).astype(int))
Sn_val = recall_score(y_val, (y_pred > 0.5).astype(int))
Sp_val = precision_score(y_val, (y_pred > 0.5).astype(int))
F1_val = f1_score(y_val, (y_pred > 0.5).astype(int))
print(AUC_val, ACC_val, MCC_val, Sn_val, Sp_val, F1_val)

0.9604105571847508 0.953125 0.9099214192705388 1.0 0.9166666666666666 0.9565217391304348


In [61]:
np.save('/root/autodl-tmp/ROC/SWE/ML/RF/y_val_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/RF/y_val.npy', y_val)

In [62]:
y_pred = rf_model.predict_proba(X_test)[:, 1]
AUC_test = roc_auc_score(y_test, y_pred)
ACC_test = accuracy_score(y_test, (y_pred > 0.5).astype(int))
MCC_test = matthews_corrcoef(y_test, (y_pred > 0.5).astype(int))
Sn_test = recall_score(y_test, (y_pred > 0.5).astype(int))
Sp_test = precision_score(y_test, (y_pred > 0.5).astype(int))
F1_test = f1_score(y_test, (y_pred > 0.5).astype(int))
print(AUC_test, ACC_test, MCC_test, Sn_test, Sp_test, F1_test)

0.9884302689180737 0.9375 0.8751177566699507 0.9512195121951219 0.9285714285714286 0.9397590361445782


In [63]:
np.save('/root/autodl-tmp/ROC/SWE/ML/RF/y_test_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/RF/y_test.npy', y_test)

# 4. NB 

In [64]:
from sklearn.naive_bayes import GaussianNB

In [65]:
nb_model = GaussianNB()
# nb_model.fit(X_train[:, 10000:10200], y_train)
nb_model.fit(X_train, y_train)

In [66]:
# from joblib import dump, load
# dump(nb_model, '/Users/jiaming/Desktop/Lab2/datas/ROC/models/nb_model.joblib')

In [67]:
# y_pred = nb_model.predict_proba(X_val[:, 10000:10200])[:, 1]
y_pred = nb_model.predict_proba(X_val)[:, 1]
AUC_val = roc_auc_score(y_val, y_pred)
ACC_val = accuracy_score(y_val, (y_pred > 0.5).astype(int))
MCC_val = matthews_corrcoef(y_val, (y_pred > 0.5).astype(int))
Sn_val = recall_score(y_val, (y_pred > 0.5).astype(int))
Sp_val = precision_score(y_val, (y_pred > 0.5).astype(int))
F1_val = f1_score(y_val, (y_pred > 0.5).astype(int))
print(AUC_val, ACC_val, MCC_val, Sn_val, Sp_val, F1_val)

0.9032258064516129 0.90625 0.8260642432614047 1.0 0.8461538461538461 0.9166666666666666


In [68]:
np.save('/root/autodl-tmp/ROC/SWE/ML/NB/y_val_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/NB/y_val.npy', y_val)

In [69]:
# y_pred = nb_model.predict_proba(X_test[:, 10000:10200])[:, 1]
y_pred = nb_model.predict_proba(X_test)[:, 1]
AUC_test = roc_auc_score(y_test, y_pred)
ACC_test = accuracy_score(y_test, (y_pred > 0.5).astype(int))
MCC_test = matthews_corrcoef(y_test, (y_pred > 0.5).astype(int))
Sn_test = recall_score(y_test, (y_pred > 0.5).astype(int))
Sp_test = precision_score(y_test, (y_pred > 0.5).astype(int))
F1_test = f1_score(y_test, (y_pred > 0.5).astype(int))
print(AUC_test, ACC_test, MCC_test, Sn_test, Sp_test, F1_test)

0.8980612883051907 0.9 0.8083478227014413 0.975609756097561 0.851063829787234 0.9090909090909092


In [70]:
np.save('/root/autodl-tmp/ROC/SWE/ML/NB/y_test_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/NB/y_test.npy', y_test)

# 5. LR (Logistic Regresssion)

In [71]:
from sklearn.linear_model import LogisticRegression

In [72]:
lr_model = LogisticRegression(max_iter=100, 
                              C=0.001, 
                              penalty='l2', 
                              solver='liblinear', 
                              random_state=42)
lr_model.fit(X_train, y_train)

In [73]:
# from joblib import dump, load
# dump(lr_model, '/Users/jiaming/Desktop/Lab2/datas/ROC/models/lr_model.joblib')

In [74]:
y_pred = lr_model.predict_proba(X_val)[:, 1]
AUC_val = roc_auc_score(y_val, y_pred)
ACC_val = accuracy_score(y_val, (y_pred > 0.5).astype(int))
MCC_val = matthews_corrcoef(y_val, (y_pred > 0.5).astype(int))
Sn_val = recall_score(y_val, (y_pred > 0.5).astype(int))
Sp_val = precision_score(y_val, (y_pred > 0.5).astype(int))
F1_val = f1_score(y_val, (y_pred > 0.5).astype(int))
print(AUC_val, ACC_val,MCC_val,Sn_val,Sp_val, F1_val)

0.9833822091886608 0.953125 0.9099214192705388 1.0 0.9166666666666666 0.9565217391304348


In [75]:
np.save('/root/autodl-tmp/ROC/SWE/ML/LR/y_val_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/LR/y_val.npy', y_val)

In [76]:
y_pred = lr_model.predict_proba(X_test)[:, 1]
AUC_test = roc_auc_score(y_test, y_pred)
ACC_test = accuracy_score(y_test, (y_pred > 0.5).astype(int))
MCC_test = matthews_corrcoef(y_test, (y_pred > 0.5).astype(int))
Sn_test = recall_score(y_test, (y_pred > 0.5).astype(int))
Sp_test = precision_score(y_test, (y_pred > 0.5).astype(int))
F1_test = f1_score(y_test, (y_pred > 0.5).astype(int))
print(AUC_test, ACC_test, MCC_test,Sn_test,Sp_test, F1_test)

0.9937460913070669 0.975 0.9499687304565353 0.975609756097561 0.975609756097561 0.975609756097561


In [77]:
np.save('/root/autodl-tmp/ROC/SWE/ML/LR/y_test_pred.npy', y_pred)
np.save('/root/autodl-tmp/ROC/SWE/ML/LR/y_test.npy', y_test)

In [78]:
# check the weight distribution 
# weights = lr_model.coef_
# intercept = lr_model.intercept_
# 
# print(f"Weights (coefficients): {weights}")
# print(f"Intercepts: {intercept}")