In [None]:
import pandas as pd
import numpy as np
import sklearn
import torch
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, LeakyReLU, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split

In [None]:
class BLSTM:
    """Bidirectional-LSTM binary classifier trained on features read from a CSV file.

    Constructing an instance loads the CSV, makes an 80/20 train/test split and
    builds + compiles the Keras model. Call `train()` to fit and `test()` to
    evaluate the best checkpoint.

    Parameters
    ----------
    data_path : path of the CSV file passed to `pandas.read_csv`.
    name : prefix used for the checkpoint file name.
    batch_size : mini-batch size passed to `model.fit`.
    epochs : maximum number of epochs passed to `model.fit`.
    """

    def __init__(self, data_path, name="graph2vec_BLSTM", batch_size=64, epochs=20):
        self.name = name
        self.batch_size = batch_size
        self.epochs = epochs

        # Data must be loaded first: the model's input shape is read from X_train.
        self._load_data(data_path)

        # Build and compile model
        self.model = self._build_model()

    @staticmethod
    def _as_sequences(features):
        """Return `features` as a float32 array shaped (samples, timesteps, features).

        A 2-D table (one row per sample) gains a singleton timestep axis so that
        it can be fed to an LSTM; a 3-D array is passed through unchanged.

        Raises
        ------
        ValueError : if `features` is neither 2-D nor 3-D.
        """
        arr = np.asarray(features, dtype=np.float32)
        if arr.ndim == 2:
            # NOTE(review): every row is treated as a length-1 sequence; confirm
            # this is the intended layout (vs. features-as-timesteps).
            arr = arr[:, np.newaxis, :]
        if arr.ndim != 3:
            raise ValueError(
                f"expected 2-D or 3-D features, got an array with {arr.ndim} dimension(s)"
            )
        return arr

    def _load_data(self, data_path):
        """Read the CSV and populate X_train / X_test / y_train / y_test."""
        data = pd.read_csv(data_path)  # Load graph2vec features
        print("Dataset Preview:")
        print(data.head())

        # NOTE(review): labels are taken from the column at position 4 and then
        # re-ordered positionally by the values of the 'type' column. This
        # presumably assumes 'type' holds integer row ids -- confirm against the
        # CSV schema, it cannot be verified from this file.
        indices = data['type'].values
        y = data.iloc[:, 4]
        y_labels = y.iloc[indices]

        # NOTE(review): only 'type' is dropped, so the column used for the labels
        # above may still be present among the features -- confirm.
        X = data.drop(columns="type")

        X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.2, random_state=42)

        # The LSTM layers need 3-D input; the raw table is 2-D, which previously
        # made `_build_model` fail on `X_train.shape[2]`.
        self.X_train = self._as_sequences(X_train)
        self.X_test = self._as_sequences(X_test)
        self.y_train = np.asarray(y_train)
        self.y_test = np.asarray(y_test)
        print("Train data shape:", self.X_train.shape)
        print("Test data shape:", self.X_test.shape)

    def _build_model(self):
        """Build and compile the two-layer bidirectional LSTM with a sigmoid output."""
        model = Sequential()
        # Input shape is (timesteps, features), taken from the prepared training data.
        model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(self.X_train.shape[1], self.X_train.shape[2])))
        model.add(Dropout(0.5))
        model.add(BatchNormalization())
        model.add(Bidirectional(LSTM(64)))
        model.add(Dropout(0.5))
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer=Adam(learning_rate=0.02), loss='binary_crossentropy', metrics=['accuracy', 'AUC'])
        return model

    def train(self):
        """Fit the model with early stopping and checkpointing, then plot the curves.

        Returns the Keras `History` object produced by `model.fit`.

        Note: the held-out split stored in X_test / y_test is used as validation
        data here, so early stopping and checkpoint selection see the same data
        that `test()` later reports on.
        """
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model_checkpoint = ModelCheckpoint(self.name + "_best_model.weights.keras", save_best_only=True, monitor='val_loss')

        history = self.model.fit(
            self.X_train, self.y_train,
            validation_data=(self.X_test, self.y_test),
            batch_size=self.batch_size,
            epochs=self.epochs,
            callbacks=[early_stopping, model_checkpoint]
        )

        self._plot_learning_curve(history)
        return history

    def _plot_learning_curve(self, history):
        """Plot train/validation accuracy and loss per epoch in two figures."""
        plt.figure(figsize=(10, 6))
        plt.plot(history.history['accuracy'], label='Train Accuracy')
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
        plt.ylim(0, 1)
        plt.title('Model Accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(loc='upper left')
        plt.show()

        plt.figure(figsize=(10, 6))
        plt.plot(history.history['loss'], label='Train Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.ylim(0, 2)  # fixed range; larger losses are clipped from view
        plt.title('Model Loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(loc='upper left')
        plt.show()

    def test(self):
        """Load the checkpoint written by `train()` and print loss, accuracy and AUC."""
        self.model.load_weights(self.name + "_best_model.weights.keras")
        # Order of `results` follows compile(): loss, then 'accuracy', then 'AUC'.
        results = self.model.evaluate(self.X_test, self.y_test)
        print("Test loss:", results[0])
        print("Test accuracy:", results[1])
        print("Test AUC:", results[2])

# Usage:
# blstm = BLSTM("graph2vec_features.csv")
# blstm.train()
# blstm.test()

In [None]:
import torch
from torch.nn import Linear, ReLU, Sigmoid
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp

# Width of every GCN hidden layer in the GNN class defined below.
embedding_size = 64

class GNN(torch.nn.Module):
    """Graph-level binary classifier: three GCN layers, pooled, then a small MLP.

    Parameters
    ----------
    num_features : int, optional
        Number of input features per node. When omitted, the value is read from
        a module-level `dataset.num_features`, which keeps `GNN()` working for
        notebooks that define `dataset` before this cell.

    Raises
    ------
    NameError : if `num_features` is omitted and no global `dataset` exists.
    """

    def __init__(self, num_features=None):
        super(GNN, self).__init__()
        # Reseeds the global torch RNG so layer initialisation is repeatable.
        torch.manual_seed(32)

        if num_features is None:
            try:
                num_features = dataset.num_features
            except NameError as exc:
                raise NameError(
                    "GNN() needs the input feature size: pass `num_features=...` "
                    "or define a global `dataset` with a `num_features` attribute "
                    "before constructing the model."
                ) from exc

        # GCN layers
        self.initial_conv = GCNConv(num_features, embedding_size)  # To transform input features to the size of the embeddings
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)

        # Output layer
        self.lin1 = Linear(embedding_size*2, 128)  # Concatenation of global pooling results
        self.lin2 = Linear(128, 64)
        self.lin3 = Linear(64, 1)

        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()

    def forward(self, x, edge_index, batch_index):
        """Return sigmoid outputs with a trailing dimension of size 1.

        `x`, `edge_index` and `batch_index` are forwarded to the GCN and global
        pooling layers (presumably node features, edge list and node-to-graph
        assignment as in PyTorch Geometric batches -- confirm against the loader).
        """
        # First Conv layer
        hidden = self.initial_conv(x, edge_index)
        hidden = F.relu(hidden)

        # Other Conv layers
        hidden = self.conv1(hidden, edge_index)
        hidden = F.relu(hidden)

        hidden = self.conv2(hidden, edge_index)
        hidden = F.relu(hidden)

        # Global Pooling: max- and mean-pooled embeddings are concatenated,
        # which is why lin1 expects embedding_size * 2 inputs.
        hidden = torch.cat([gmp(hidden, batch_index),
                            gap(hidden, batch_index)],
                           dim=1)

        # Apply some linear layers
        out = self.lin1(hidden)
        out = self.act1(out)
        out = self.lin2(out)
        out = self.act2(out)
        out = self.lin3(out)
        out = torch.sigmoid(out)

        return out

# Build one instance purely to inspect the architecture and its size.
model = GNN()
print(model)
print(f"Number of parameters: {sum(map(torch.numel, model.parameters()))}")

def train():
    """Run one optimisation pass over `train_dataset_loader`.

    Uses the module-level `model`, `optimizer`, `loss_fn` and `device`.

    Returns
    -------
    float : mean of the per-batch losses for this pass.
    """
    model.train()
    loss_all = 0.0

    for data in train_dataset_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x.float(), data.edge_index, data.batch)
        # Flatten both sides: the model emits a trailing dimension of size 1 and
        # BCELoss requires input and target to have identical shapes.
        loss = loss_fn(output.reshape(-1), data.y.float().reshape(-1))
        loss.backward()
        optimizer.step()
        loss_all += loss.item()
    return loss_all / len(train_dataset_loader)

def evaluate(loader):
    """Score the module-level `model` on every batch of `loader` without gradients.

    Predictions are the model outputs rounded with `np.rint`, compared
    element-wise with the labels.

    Parameters
    ----------
    loader : iterable of batches exposing `x`, `edge_index`, `batch`, `y` and `.to()`.

    Returns
    -------
    (float, float) : accuracy over all samples, and the mean of the per-batch
    losses computed with the module-level `loss_fn`.

    Raises
    ------
    ValueError : if `loader` yields no batches.
    """
    model.eval()

    predictions = []
    labels = []
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            # Flatten instead of squeeze(): squeeze() would turn a batch of one
            # into a 0-d tensor that no longer matches the target shape.
            probs = model(data.x.float(), data.edge_index, data.batch).reshape(-1)
            targets = data.y.reshape(-1)

            total_loss += loss_fn(probs, targets.float()).item()
            num_batches += 1

            predictions.append(np.rint(probs.cpu().numpy()))
            labels.append(targets.cpu().numpy())

    if num_batches == 0:
        raise ValueError("evaluate() received an empty loader")

    predictions = np.concatenate(predictions)
    labels = np.concatenate(labels)

    # Fraction of exact matches; computed with numpy so no extra import is needed.
    accuracy = float(np.mean(predictions == labels))
    return accuracy, total_loss / num_batches

    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

loss_fn = torch.nn.BCELoss()


print("Starting training...")
# Per-epoch history, stored as plain Python numbers so it can be plotted or
# serialised later without touching tensors or the GPU.
train_losses = []
val_losses = []
val_acc_list = []
train_acc_list = []
best_loss = float("inf")
best_state = None  # snapshot of the weights that produced `best_loss`
early_stopping_counter = 0
for epoch in range(200):
    # Stop once validation loss has failed to improve for more than 5
    # consecutive epochs (checked at the start of the following epoch).
    if early_stopping_counter > 5:
        print("Early stopping due to no improvement.")
        break

    loss = train()
    train_losses.append(loss)
    train_acc, _ = evaluate(train_dataset_loader)
    val_acc, val_loss = evaluate(val_dataset_loader)
    val_loss = float(val_loss)  # accept either a tensor or a float from evaluate()
    val_losses.append(val_loss)
    val_acc_list.append(val_acc)
    train_acc_list.append(train_acc)

    if val_loss < best_loss:
        best_loss = val_loss
        # Save the currently best model
        best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
    print(f"Epoch {epoch} | Train Loss {loss} | Train Accuracy{train_acc} | Validation Accuracy{val_acc} | Validation loss{val_loss}")

# Leave `model` holding the best-validation weights, mirroring
# EarlyStopping(restore_best_weights=True) used for the Keras model.
if best_state is not None:
    model.load_state_dict(best_state)
print(f"Finishing training with best val loss: {best_loss}")

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'dataset' is not defined