In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, AdamW, get_linear_schedule_with_warmup , BertModel
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import roc_auc_score
import os


In [None]:
# Interaction table that the feature-merging cells below operate on.
dataset = pd.read_csv(
    "/sise/home/adamu/thesis_new/datasets/PPIMI_datastes_my_samping/final_dataset_2_0.8_Multi_PPIMI.csv"
)

# Per-protein feature table; merge_datasets joins it on its `UniProt_ID` column.
esm_features = pd.read_csv(
    "/sise/home/adamu/thesis_new/feature_extraction/outputs/esm_features.csv"
)

# Tab-separated id mapping; convert_uniprot_ids reads its `From` and `Entry` columns.
uniprot_mapping = pd.read_csv(
    "/sise/home/adamu/thesis_new/feature_extraction/data/idmapping_2024_06_04.tsv",
    delimiter="\t",
)

# Last expression of the cell: render the mapping table as the cell output.
uniprot_mapping

In [None]:
def convert_uniprot_ids(dataset, mapping_df):
    """Translate both protein-id columns through a From -> Entry mapping.

    Args:
        dataset: DataFrame with `uniprot_id1` and `uniprot_id2` columns.
        mapping_df: DataFrame with `From` (id as it appears in `dataset`) and
            `Entry` (id to translate to) columns.

    Returns:
        A new DataFrame with both id columns translated and exact duplicate
        rows removed. Ids that have no entry in the mapping become NaN.

    The input `dataset` is left untouched: the previous implementation assigned
    the mapped columns onto the caller's frame, which silently rewrote the
    caller's data and made re-running the calling cell map already-mapped ids.
    """
    # Lookup table from the original id to its mapped entry
    # (if `From` repeats, the dict keeps a single Entry per id).
    mapping_dict = mapping_df.set_index('From')['Entry'].to_dict()

    # `assign` returns a copy and keeps the existing column positions.
    converted = dataset.assign(
        uniprot_id1=dataset['uniprot_id1'].map(mapping_dict),
        uniprot_id2=dataset['uniprot_id2'].map(mapping_dict),
    )
    return converted.drop_duplicates()

def merge_datasets(dataset, features_df):
    """Attach per-protein features for both proteins of every row.

    Args:
        dataset: DataFrame with `uniprot_id1` and `uniprot_id2` key columns.
        features_df: DataFrame with a `UniProt_ID` key column plus feature columns.

    Returns:
        DataFrame holding the non-key columns of `dataset`, the features of
        protein 1 under their original names and the features of protein 2
        under `<name>_id2`, with exact duplicate rows removed. The three key
        columns (`UniProt_ID`, `uniprot_id1`, `uniprot_id2`) are dropped.
        Rows whose id has no match in `features_df` keep NaN features
        (left join).
    """
    # Protein 1: join on uniprot_id1, then discard the right-hand key.
    with_first = dataset.merge(
        features_df,
        how='left',
        left_on='uniprot_id1',
        right_on='UniProt_ID',
        suffixes=('', '_id1'),
    ).drop(columns=['UniProt_ID'])

    # Protein 2: suffix every feature column with `_id2`, restoring the key name
    # so the same join key can be used.
    second_features = features_df.add_suffix('_id2').rename(
        columns={'UniProt_ID_id2': 'UniProt_ID'}
    )
    with_both = with_first.merge(
        second_features,
        how='left',
        left_on='uniprot_id2',
        right_on='UniProt_ID',
        suffixes=('', '_id2'),
    )

    # Only features (and the non-key dataset columns) remain downstream.
    key_columns = ['UniProt_ID', "uniprot_id1", "uniprot_id2"]
    return with_both.drop(columns=key_columns).drop_duplicates()


In [None]:
def add_features_to_dataframe(dataset):
    """Map the protein ids of `dataset` and join the ESM features for both proteins.

    Uses the notebook-level `uniprot_mapping` and `esm_features` frames:
    ids are first translated with `convert_uniprot_ids`, then the features are
    attached with `merge_datasets`. Returns the resulting DataFrame.
    """
    return merge_datasets(
        convert_uniprot_ids(dataset, uniprot_mapping),
        esm_features,
    )

In [None]:
# Replace the raw interaction table with its feature-augmented version.
# NOTE: this cell is not idempotent. merge_datasets drops the `uniprot_id1` /
# `uniprot_id2` columns that convert_uniprot_ids reads, so running it a second
# time without reloading `dataset` raises a KeyError.
dataset = add_features_to_dataframe(dataset)

In [None]:
# Directory holding the pre-computed fold CSVs (train_fold<k>.csv / test_fold<k>.csv).
my_splits_path = "/sise/home/adamu/thesis_new/datasets/folds/my folds/cold start PPI/"

# Maps "fold<k>" -> (train frame with features, test frame with features).
train_test_dict = {}

for fold in range(1, 6):
    train_file = os.path.join(my_splits_path, f'train_fold{fold}.csv')
    test_file = os.path.join(my_splits_path, f'test_fold{fold}.csv')

    # Raw splits as stored on disk.
    train_df_current, test_df_current = pd.read_csv(train_file), pd.read_csv(test_file)
    print(f"shape of train in fold {fold} before adding features: {train_df_current.shape}")
    print(f"shape of test in fold {fold} before adding features: {test_df_current.shape}")

    # Id mapping + feature join; shapes are printed again so changes in row or
    # column count caused by the merge are visible.
    train_df_current, test_df_current = (
        add_features_to_dataframe(train_df_current),
        add_features_to_dataframe(test_df_current),
    )
    print(f"shape of train in fold {fold} after adding features: {train_df_current.shape}")
    print(f"shape of test in fold {fold} after adding features: {test_df_current.shape}")

    train_test_dict[f"fold{fold}"] = (train_df_current, test_df_current)

In [None]:
# Report, per fold, the rows that still contain NaN after the feature merge
# (for example proteins with no match in the feature table).
for fold, (train, test) in train_test_dict.items():
    print(f"{fold}")
    # Same order as before: the test split is reported first, then the train split.
    for split_df in (test, train):
        rows_with_nan = split_df[split_df.isna().any(axis=1)]
        if rows_with_nan.empty:
            print("No NaN values found in this fold.")
            continue

        # Columns that actually contain a NaN in the affected rows.
        nan_cols = rows_with_nan.columns[rows_with_nan.isna().any()].tolist()
        # The protein-id columns are shown for context only when they exist:
        # merge_datasets drops them, so selecting them unconditionally raised a
        # KeyError exactly when there was something to report.
        nan_cols += [
            id_col
            for id_col in ("uniprot_id1", "uniprot_id2")
            if id_col in rows_with_nan.columns and id_col not in nan_cols
        ]
        print(rows_with_nan[nan_cols])

In [None]:
# Display the feature-augmented training frame of fold 1 as the cell output.
train_test_dict["fold1"][0]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel, RobertaTokenizer

class CustomDataset(Dataset):
    """Torch dataset pairing tokenized SMILES with PPI feature vectors and labels.

    Args:
        encoded_smiles: mapping with "input_ids" and "attention_mask" entries,
            each indexable by sample position.
        ppi_features: array-like of per-sample features; stored as a float32 tensor.
        labels: array-like of per-sample labels; stored as a float32 tensor.

    The dataset length is the number of labels. Each item is a dict with the
    keys "input_ids", "attention_mask", "ppi_features" and "labels".
    """

    def __init__(self, encoded_smiles, ppi_features, labels):
        self.encoded_smiles = encoded_smiles
        self.ppi_features = torch.tensor(ppi_features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

        # Debugging output: the three sizes should describe the same number of samples.
        for message in (
            f"Encoded SMILES size: {len(self.encoded_smiles['input_ids'])}",
            f"PPI Features size: {self.ppi_features.shape}",
            f"Labels size: {self.labels.shape}",
        ):
            print(message)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Token-level fields come straight from the tokenizer output ...
        sample = {
            field: self.encoded_smiles[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        # ... followed by the sample's feature vector and target.
        sample["ppi_features"] = self.ppi_features[idx]
        sample["labels"] = self.labels[idx]
        return sample

class ChemBERTaWithPPI(nn.Module):
    """Binary classifier combining a RoBERTa SMILES encoder with PPI features.

    The encoder's token states are mean-pooled into one vector per molecule,
    concatenated with a linear projection of the PPI feature vector, and passed
    through three ReLU hidden layers and a single-logit output layer.

    Args:
        model_name: name or path handed to `RobertaModel.from_pretrained`.
        ppi_feature_size: length of the PPI feature vector of one sample.
        hidden_size1: width of the PPI projection and of the first hidden layer.
        hidden_size2: width of the second hidden layer.
        hidden_size3: width of the third hidden layer.
    """

    def __init__(self, model_name, ppi_feature_size, hidden_size1=1024, hidden_size2=512, hidden_size3 = 256):
        super(ChemBERTaWithPPI, self).__init__()
        self.chemberta = RobertaModel.from_pretrained(model_name)
        # Projects the PPI feature vector to hidden_size1 dimensions.
        self.ppi_fc = nn.Linear(ppi_feature_size, hidden_size1)
        # The first hidden layer sees [pooled encoder state | projected PPI features].
        # Its input width is derived from the encoder config instead of being
        # hard-coded: the former constant 1408 is a 384-wide encoder plus the
        # default hidden_size1 of 1024, and broke for any other combination.
        fused_size = self.chemberta.config.hidden_size + hidden_size1
        self.hidden_layer1 = nn.Linear(fused_size, hidden_size1)  # First hidden layer
        self.hidden_layer2 = nn.Linear(hidden_size1, hidden_size2)  # Second hidden layer
        self.hidden_layer3 = nn.Linear(hidden_size2, hidden_size3)  # Third hidden layer
        self.classifier = nn.Linear(hidden_size3, 1)  # Single logit from the third hidden layer

    def forward(self, input_ids, attention_mask, ppi_features):
        """Return one raw logit per sample, shape (batch, 1).

        Args:
            input_ids: token ids, shape (batch, seq_len).
            attention_mask: 1 for real tokens and 0 for padding, shape
                (batch, seq_len); may be None, in which case every position
                counts as a real token.
            ppi_features: PPI feature vectors, shape (batch, ppi_feature_size).
        """
        ppi_out = self.ppi_fc(ppi_features)

        bert_output = self.chemberta(input_ids=input_ids, attention_mask=attention_mask)
        token_states = bert_output[0]  # (batch, seq_len, encoder_hidden)

        # Mean-pool over real tokens only. Averaging over padding positions made
        # the representation of a molecule depend on how far its split happened
        # to be padded (train and validation are padded to different lengths).
        if attention_mask is None:
            pooled_tokens = token_states.mean(dim=1)
        else:
            token_mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
            # clamp avoids a division by zero for an all-padding row
            token_counts = token_mask.sum(dim=1).clamp(min=1.0)
            pooled_tokens = (token_states * token_mask).sum(dim=1) / token_counts

        # The PPI projection is constant along the sequence, so concatenating it
        # after pooling equals broadcasting it to every token and pooling afterwards.
        pooled_output = torch.cat((pooled_tokens, ppi_out), dim=-1)

        # Three fully connected layers with ReLU activations
        hidden_output1 = F.relu(self.hidden_layer1(pooled_output))
        hidden_output2 = F.relu(self.hidden_layer2(hidden_output1))
        hidden_output3 = F.relu(self.hidden_layer3(hidden_output2))

        logits = self.classifier(hidden_output3)
        return logits

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
print(device)

from tqdm import tqdm

# Loop-invariant setup: the tokenizer does not depend on the fold, so it is
# loaded once instead of once per fold.
model_name = "DeepChem/ChemBERTa-77M-MTR"
tokenizer = RobertaTokenizer.from_pretrained(model_name)


def _build_dataset(frame, tokenizer):
    """Tokenize the SMILES of `frame` and wrap them with its features and labels.

    The frame itself is not modified; features and labels are extracted as
    float32 arrays.
    """
    # assumes the first two columns are `smiles` and `label`, so everything
    # from column 2 onwards is a numeric feature — TODO confirm against the fold CSVs
    features = frame.iloc[:, 2:].to_numpy(dtype="float32")
    labels = frame["label"].to_numpy(dtype="float32")
    encoded = tokenizer(frame['smiles'].tolist(), truncation=True, padding=True, return_tensors="pt")
    return CustomDataset(encoded, features, labels)


def _validation_auc(model, dataloader, device):
    """Return the ROC AUC of sigmoid(model logits) against the labels in `dataloader`."""
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["ppi_features"].to(device),
            )
            predictions = torch.sigmoid(logits).squeeze(-1)
            all_preds.extend(predictions.cpu().numpy())
            # Labels are only compared on the host, so they never need to visit the device.
            all_labels.extend(batch["labels"].numpy())
    return roc_auc_score(all_labels, all_preds)


# NOTE(review): only folds 3-5 are trained although folds 1-5 are loaded above —
# confirm this is intentional (e.g. resuming an interrupted run).
for fold in range(3, 6):
    print("fold:", fold)
    # NOTE(review): the fold's *test* split doubles as the validation set for
    # early stopping, so the reported AUC is selected on the evaluation data.
    train_df, valid_df = train_test_dict[f"fold{fold}"]
    print(device)

    # Convert the splits into their respective Datasets and DataLoaders.
    # Each split is tokenized (and therefore padded) on its own.
    train_dataset = _build_dataset(train_df, tokenizer)
    valid_dataset = _build_dataset(valid_df, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

    # Check lengths of DataLoader and Dataset
    print(f"Length of train DataLoader: {len(train_dataloader.dataset)}")
    print(f"Length of valid DataLoader: {len(valid_dataloader.dataset)}")

    # Fresh model, optimizer and loss for every fold
    model = ChemBERTaWithPPI(model_name, ppi_feature_size=2560).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    criterion = nn.BCEWithLogitsLoss()

    num_epochs = 50
    patience = 11
    best_auc = 0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # Training loop
        model.train()
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=True)
        for batch in progress_bar:
            # Move batch data to the chosen device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            ppi_features = batch["ppi_features"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask, ppi_features)
            loss = criterion(logits.squeeze(-1), labels)
            loss.backward()
            optimizer.step()

            # Show the current batch loss next to the progress bar
            progress_bar.set_postfix({'loss': loss.item()})

        # Validation loop
        auc = _validation_auc(model, valid_dataloader, device)
        print(f"Epoch {epoch + 1}/{num_epochs} - Validation AUC: {auc:.4f}")

        # Early stopping: stop after `patience` consecutive epochs without a new best AUC
        if auc > best_auc:
            best_auc = auc
            epochs_without_improvement = 0
            # Optionally, save the best model
            # torch.save(model.state_dict(), 'best_model.pth')
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement == patience:
                print("Early stopping due to no improvement in validation AUC.")
                break
