In [None]:
import os
import torch
import gpytorch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from gpytorch.models import ApproximateGP
from gpytorch.variational import BatchDecoupledVariationalStrategy
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import PredictiveLogLikelihood
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score,recall_score,precision_score
import torch.optim as optim
from sklearn.model_selection import KFold
import joblib

In [None]:
# Switch the working directory so the files this notebook writes with relative names
# (the saved scaler and the per-fold checkpoints) are created in this folder.
# The string is a placeholder; replace it with a real directory before running.
os.chdir("path to save the trained models")

In [None]:
# Read the training table (the path is a placeholder to be replaced).
df=pd.read_csv('path to training dataset')
# Features are every column from the fifth up to, but excluding, the last one;
# the last column is the target.
X = df.iloc[:, 4:-1]
y = df.iloc[:, -1].values
# NOTE(review): the scaler is fitted on all rows, i.e. before the K-fold split performed
# later in the notebook, so every validation fold contributes to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Save the fitted scaler for later use during prediction
joblib.dump(scaler, 'scaler.gz')

In [None]:
class ResidualBlock(nn.Module):
    """Two-convolution 1-D residual unit with an optional projection shortcut.

    Main path: Conv1d(k=3) -> norm -> ReLU -> Conv1d(k=3) -> norm. The shortcut is
    added to that result and a final ReLU is applied.

    Args:
        in_channels: Channel count of the incoming tensor.
        out_channels: Channel count produced by both convolutions.
        stride: Stride of the first convolution and of the shortcut projection.
        use_instance_norm: When true every normalisation layer is ``nn.InstanceNorm1d``,
            otherwise ``nn.BatchNorm1d``.
    """

    def __init__(self, in_channels, out_channels, stride=1, use_instance_norm=False):
        super().__init__()
        # A single switch picks the normalisation class used throughout the block.
        norm_cls = nn.InstanceNorm1d if use_instance_norm else nn.BatchNorm1d

        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.bn1 = norm_cls(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = norm_cls(out_channels)

        # A plain identity shortcut needs matching shapes; otherwise project the input
        # with a strided 1x1 convolution followed by the same kind of normalisation.
        needs_projection = stride != 1 or in_channels != out_channels
        self.downsample = (
            nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
                norm_cls(out_channels),
            )
            if needs_projection
            else None
        )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class ResNet1D(nn.Module):
    """1-D ResNet-style network mapping a single-channel sequence to one value per sample.

    Pipeline: stem convolution (k=7, stride 2) -> batch norm -> ReLU -> max-pool, then four
    ``ResidualBlock`` stages (40 -> 40 -> 64 -> 128 -> 256 channels), dropout, global
    average pooling and a linear layer with a single output.
    """

    def __init__(self):
        super().__init__()
        # Stem: initial convolution, normalisation, activation and pooling.
        self.conv1 = nn.Conv1d(1, 40, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm1d(40)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

        # Residual stages; all but the first use stride 2.
        self.layer1 = ResidualBlock(40, 40)
        self.layer2 = ResidualBlock(40, 64, stride=2)
        self.layer3 = ResidualBlock(64, 128, stride=2)
        self.layer4 = ResidualBlock(128, 256, stride=2)

        # Head: dropout, pooling down to length 1, fully connected output layer.
        self.dropout = nn.Dropout(0.5)
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(256, 1)

    def forward(self, x):
        """Run the stem, the four residual stages and the head on ``x``."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        # Dropout acts on the output of the last residual stage, before global pooling.
        x = self.dropout(x)

        pooled = self.global_pool(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

In [None]:
class ResNetGP(nn.Module):
    """Feeds the output of a feature extractor into a ``GPModel``.

    Args:
        feature_extractor: Module exposing its last layer as ``.fc``; the GP input
            dimension is read from ``feature_extractor.fc.out_features``.
    """

    def __init__(self, feature_extractor):
        super().__init__()
        self.feature_extractor = feature_extractor
        gp_input_dim = self.feature_extractor.fc.out_features
        self.gp = GPModel(gp_input_dim)

    def forward(self, x):
        """Return the GP output evaluated on the extracted features of ``x``."""
        features = self.feature_extractor(x)
        # Collapse any trailing dimensions so the GP receives one row per sample.
        has_extra_dims = features.dim() > 2
        if has_extra_dims:
            features = features.view(features.size(0), -1)
        return self.gp(features)

class GPModel(ApproximateGP):
    """Approximate GP with a constant mean and a scaled RBF kernel.

    Built from a ``CholeskyVariationalDistribution`` and a
    ``BatchDecoupledVariationalStrategy`` with learnable inducing locations.

    Args:
        feature_dim: Dimensionality of the inputs the GP receives.
        num_inducing: Number of inducing points (512 by default).
    """

    def __init__(self, feature_dim, num_inducing=512):
        variational_distribution = CholeskyVariationalDistribution(num_inducing)
        # Start from distinct random locations. Identical (all-zero) inducing points make
        # the inducing covariance matrix rank one, so it would only be invertible through
        # the jitter added to its diagonal.
        inducing_points = torch.randn(num_inducing, feature_dim)
        variational_strategy = BatchDecoupledVariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        """Return the multivariate normal defined by the mean and kernel modules at ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# Build one extractor + GP pair at notebook level (requires ResNet1D to be defined already).
# The cross-validation cell further down creates fresh instances for every fold and rebinds
# these two names, so this particular pair is not the one trained there.
feature_extractor = ResNet1D()
model = ResNetGP(feature_extractor)

In [None]:
# Prepare data for PyTorch
# Features: float32, with a singleton channel axis inserted at dim 1 for the Conv1d stem.
X_k = torch.tensor(X_scaled, dtype=torch.float32).unsqueeze(1)  # Add channel dimension
# Targets are stored with an integer dtype (torch.long).
y_k = torch.tensor(y, dtype=torch.long)
# Pair every feature row with its target so folds can be selected by index.
dataset = TensorDataset(X_k, y_k)

def train_and_evaluate(model, train_loader, val_loader, likelihood, optimizer, epochs=30,
                       mll=None, threshold=0.7):
    """Train ``model`` on ``train_loader`` and score it on ``val_loader``.

    Args:
        model: Module whose call returns the GP output for a batch. It must expose
            ``model.gp`` when ``mll`` is not supplied.
        train_loader: Iterable of ``(inputs, targets)`` training mini-batches. Its
            ``.dataset`` length is used when ``mll`` is not supplied.
        val_loader: Iterable of ``(inputs, targets)`` validation mini-batches.
        likelihood: Likelihood applied to the model output at evaluation time.
        optimizer: Optimizer already holding the parameters to update.
        epochs: Number of passes over ``train_loader``.
        mll: Objective called as ``mll(output, targets)``; its negation is minimised.
            When ``None``, a ``PredictiveLogLikelihood`` is built from ``likelihood`` and
            ``model.gp`` with ``num_data=len(train_loader.dataset)``.
        threshold: Predicted mean at or above which a sample is labelled positive.

    Returns:
        Tuple ``(accuracy, auc, f1, recall, precision)`` computed on the validation data.
    """
    # Build the objective locally when none is given, so the function does not depend on
    # a notebook-level variable having been defined by some other cell.
    if mll is None:
        mll = PredictiveLogLikelihood(likelihood, model.gp, num_data=len(train_loader.dataset))

    # Train the model
    model.train()
    likelihood.train()
    for _ in range(epochs):
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = -mll(output, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    likelihood.eval()
    true_labels = []
    pred_probs = []  # predictive means of the likelihood, used as scores
    with torch.no_grad():
        for val_x, val_y in val_loader:
            predictive_mean = likelihood(model(val_x)).mean
            true_labels.extend(val_y.numpy())
            # Flatten and convert to plain floats before extending the list.
            pred_probs.extend(predictive_mean.view(-1).tolist())

    # Threshold once and reuse the labels for every label-based metric; AUC uses raw scores.
    pred_labels = (np.array(pred_probs) >= threshold).astype(int)
    accuracy = accuracy_score(true_labels, pred_labels)
    auc = roc_auc_score(true_labels, pred_probs)
    f1 = f1_score(true_labels, pred_labels)
    recall = recall_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels)
    return accuracy, auc, f1, recall, precision

In [None]:
# Setup K-Fold Cross-Validation: train a fresh ResNet + GP per fold, report validation
# metrics and save one checkpoint per fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_k)):
    # Seed PyTorch per fold so weight initialisation and mini-batch shuffling repeat
    # from one notebook run to the next.
    torch.manual_seed(42 + fold)

    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)
    # Stochastic minibatching
    train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=128, shuffle=False)

    # Instantiate the model, likelihood, and optimizer
    feature_extractor = ResNet1D()
    model = ResNetGP(feature_extractor)
    likelihood = GaussianLikelihood()
    # Training objective. It is bound at notebook level as `model_likelihood` rather than
    # being passed to train_and_evaluate.
    model_likelihood = PredictiveLogLikelihood(likelihood, model.gp, num_data=len(train_loader.dataset))
    optimizer = torch.optim.Adam([
        {'params': model.parameters()},
        {'params': likelihood.parameters()}
    ], lr=0.001,weight_decay=1e-4)

    # Train and evaluate the model
    accuracy, auc, f1,recall, precision = train_and_evaluate(model, train_loader, val_loader, likelihood, optimizer)
    results.append((accuracy, auc, f1,recall, precision))
    print(f"Fold {fold+1}: Accuracy={accuracy:.4f}, AUC={auc:.4f}, F1={f1:.4f},recall={recall:.4f}, precision={precision:.4f}")

    # Save the model and likelihood parameters (file name is relative to the working directory)
    torch.save({
        'model_state_dict': model.state_dict(),
        'likelihood_state_dict': likelihood.state_dict()
    }, f"model_and_likelihood_fold_{fold+1}.pth")

# Calculate average metrics across folds
average_accuracy = np.mean([res[0] for res in results])
average_auc = np.mean([res[1] for res in results])
average_f1 = np.mean([res[2] for res in results])
average_recall = np.mean([res[3] for res in results])
average_precision = np.mean([res[4] for res in results])
print(f"Average Accuracy: {average_accuracy:.4f}, Average AUC: {average_auc:.4f}, Average F1: {average_f1:.4f}, Average recall: {average_recall:.4f}, Average precision: {average_precision:.4f}")

In [None]:
# Setup K-Fold Cross-Validation
# NOTE(review): this cell repeats the cross-validation procedure of the preceding cell and
# writes to the same checkpoint file names, so running both trains every fold twice and the
# files from the earlier run are overwritten. Consider keeping only one of the two cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_k)):
    # Seed PyTorch per fold so weight initialisation and mini-batch shuffling repeat
    # from one notebook run to the next.
    torch.manual_seed(42 + fold)

    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)
    # Stochastic minibatching
    train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=128, shuffle=False)

    # Instantiate the model, likelihood, and optimizer
    feature_extractor = ResNet1D()
    model = ResNetGP(feature_extractor)
    likelihood = GaussianLikelihood()
    # Training objective. It is bound at notebook level as `model_likelihood` rather than
    # being passed to train_and_evaluate.
    model_likelihood = PredictiveLogLikelihood(likelihood, model.gp, num_data=len(train_loader.dataset))
    optimizer = torch.optim.Adam([
        {'params': model.parameters()},
        {'params': likelihood.parameters()}
    ], lr=0.001,weight_decay=1e-4)

    # Train and evaluate the model
    accuracy, auc, f1,recall, precision = train_and_evaluate(model, train_loader, val_loader, likelihood, optimizer)
    results.append((accuracy, auc, f1,recall, precision))
    print(f"Fold {fold+1}: Accuracy={accuracy:.4f}, AUC={auc:.4f}, F1={f1:.4f},recall={recall:.4f}, precision={precision:.4f}")

    # Save the model and likelihood parameters (file name is relative to the working directory)
    torch.save({
        'model_state_dict': model.state_dict(),
        'likelihood_state_dict': likelihood.state_dict()
    }, f"model_and_likelihood_fold_{fold+1}.pth")

# Calculate average metrics across folds
average_accuracy = np.mean([res[0] for res in results])
average_auc = np.mean([res[1] for res in results])
average_f1 = np.mean([res[2] for res in results])
average_recall = np.mean([res[3] for res in results])
average_precision = np.mean([res[4] for res in results])
print(f"Average Accuracy: {average_accuracy:.4f}, Average AUC: {average_auc:.4f}, Average F1: {average_f1:.4f}, Average recall: {average_recall:.4f}, Average precision: {average_precision:.4f}")

In [None]:
# Directory holding the per-fold checkpoint files (placeholder; replace with a real path).
model_folder = 'path to load trained models'  # On Windows, write the real path as a raw string (r'...') so backslashes are not read as escapes

def load_model_and_likelihood(model_path):
    """Rebuild a ``ResNetGP`` and a ``GaussianLikelihood`` and restore them from a checkpoint.

    Args:
        model_path: File readable by ``torch.load`` that contains the keys
            ``'model_state_dict'`` and ``'likelihood_state_dict'``.

    Returns:
        Tuple ``(model, likelihood)``, both switched to evaluation mode.
    """
    # Fresh, untrained instances with the same architecture as at training time.
    model = ResNetGP(ResNet1D())
    likelihood = GaussianLikelihood()

    checkpoint = torch.load(model_path)

    # Restore each module from its own entry, then put it in evaluation mode.
    for module, state_key in ((model, 'model_state_dict'), (likelihood, 'likelihood_state_dict')):
        module.load_state_dict(checkpoint[state_key])
        module.eval()

    return model, likelihood

# Load the checkpoints of folds 1 through 5 from model_folder; each list entry is a
# (model, likelihood) tuple as returned by load_model_and_likelihood.
models_and_likelihoods = [load_model_and_likelihood(os.path.join(model_folder, f'model_and_likelihood_fold_{fold}.pth')) for fold in range(1, 6)]

In [None]:
def prepare_data(file_path, scaler_path='scaler.gz'):
    """Read a CSV and wrap its scaled feature columns in a non-shuffling DataLoader.

    Args:
        file_path: CSV file to read.
        scaler_path: joblib file containing the already-fitted scaler.

    Returns:
        Tuple ``(data, loader)``: ``data`` is the DataFrame exactly as read, and ``loader``
        yields single-element batches of up to 128 rows, shaped
        ``(batch, 1, n_features)``, in file order.
    """
    frame = pd.read_csv(file_path)

    # Feature block: everything after the first four columns and before the last ten.
    # NOTE(review): training slices columns 4:-1; confirm these files really carry the
    # additional trailing columns, otherwise the feature count will not match the scaler.
    raw_features = frame.iloc[:, 4:-10].values

    # Only transform here: the scaler must keep the statistics it was fitted with.
    fitted_scaler = joblib.load(scaler_path)
    scaled_features = fitted_scaler.transform(raw_features)

    # Insert the channel axis expected by the convolutional network.
    inputs = torch.tensor(scaled_features, dtype=torch.float32).unsqueeze(1)
    loader = DataLoader(TensorDataset(inputs), batch_size=128, shuffle=False)
    return frame, loader

def predict_with_models(models_and_likelihoods, data_loader, threshold=0.7):
    """Average the models' predictive means and threshold them into class labels.

    Args:
        models_and_likelihoods: Sequence of ``(model, likelihood)`` pairs. For every batch
            ``likelihood(model(batch)).mean`` is evaluated for each pair.
        data_loader: Iterable yielding batches whose first element is the input tensor.
        threshold: Averaged mean at or above which a row is labelled 1 (default 0.7).

    Returns:
        List with one integer label (0 or 1) per input row, in loader order.
    """
    all_preds = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch[0]
            # One predictive mean per ensemble member for this batch.
            member_means = [
                likelihood(model(inputs)).mean
                for model, likelihood in models_and_likelihoods
            ]
            # Ensemble by averaging across members, then binarise.
            ensemble_mean = torch.stack(member_means).mean(dim=0)
            predicted_classes = (ensemble_mean >= threshold).int()
            all_preds.extend(predicted_classes.cpu().numpy())
    return all_preds

def process_and_predict(folder_path, output_folder):
    """Predict every ``.csv`` file in ``folder_path`` and save a copy with an ``'FR'`` column.

    Uses the notebook-level ``models_and_likelihoods`` list as the ensemble.

    Args:
        folder_path: Directory scanned (non-recursively) for files ending in ``.csv``.
        output_folder: Directory receiving ``<name>_with_predictions.csv`` files; it is
            created if it does not exist yet.
    """
    # Create the destination up front so to_csv does not fail on a missing directory.
    # An empty string is left alone: os.path.join then resolves to the current directory.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are handled in a stable order regardless of the filesystem.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, filename)
        original_data, data_loader = prepare_data(file_path)
        predictions = predict_with_models(models_and_likelihoods, data_loader)
        original_data['FR'] = predictions  # Append predictions as a new column

        # Save updated DataFrame; filename[:-4] strips the '.csv' suffix.
        save_path = os.path.join(output_folder, f'{filename[:-4]}_with_predictions.csv')
        original_data.to_csv(save_path, index=False)

In [None]:
# Folder scanned for input CSV files and folder receiving the prediction CSVs
# (both strings are placeholders to be replaced with real paths).
input_folder = 'path to load historical csv files'
output_folder = 'path to save the predict results'
# Run the ensemble over every CSV in input_folder and write the augmented files.
process_and_predict(input_folder,output_folder)