In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): only pandas is pinned; the other packages resolve to whatever
# version is current at install time, so runs may not be reproducible.
!pip install pandas==2.1.0
!pip install datasets
!pip install transformers[torch]
# scikit-learn is imported below (`from sklearn import metrics`) but its install
# is commented out — presumably it is pre-installed in the runtime; TODO confirm.
#!pip install scikit-learn
!pip install torch_optimizer

In [None]:
import os
import copy
import collections
import numpy as np
import pandas as pd
from typing import Type, Tuple

import datasets
from sklearn import metrics

import torch
import torch_optimizer as optim
from torch.utils.data import Dataset, IterableDataset, DataLoader, WeightedRandomSampler

import transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Prepare Dataset

In [None]:
# Name of the screening dataset to load; the loading code below handles
# 'NSCLC', 'COVID' and 'CRPC'. It is also used to name the saved weights file.
dataset_name = 'CRPC'

def data_processor(df, n_neg: int = 30, n_pos: int = 10, exclude_train_from_test: bool = False):
    '''
    Build the few-shot training arrays and the evaluation arrays from a screening sheet.

    Parameters
    ----------
    df : DataFrame with the columns 'Title', 'Abstract' and 'Include/Exclude'
        (0 = exclude / irrelevant, 1 = include / relevant). It is not modified.
    n_neg : number of leading rows labelled 0 that go into the training set.
    n_pos : number of leading rows taken from every other label group.
    exclude_train_from_test : when True the rows selected for training are
        removed from the test split. The default (False) keeps the historical
        behaviour where the test split is the *whole* sheet.

    Returns
    -------
    (X_train, y_train, X_test, y_test) as 1-D numpy arrays; the X arrays hold
    the text "<title> <abstract>" and the y arrays hold the labels.
    '''
    # Work on a re-indexed copy: the caller's frame stays untouched and every
    # row gets a unique index (concatenated sheets carry duplicate indices).
    work = df.reset_index(drop=True)

    # Join title and abstract with a space so that the last word of the title
    # is not fused with the first word of the abstract.
    work['info'] = work['Title'].astype(str) + ' ' + work['Abstract'].astype(str)

    # Training selection: first `n_neg` rows of label 0, first `n_pos` rows of
    # every other label, groups ordered by label value.
    picked = [
        group.head(n_neg if label == 0 else n_pos)
        for label, group in work.groupby('Include/Exclude')
    ]
    train = pd.concat(picked) if picked else work.iloc[0:0]

    X_train = train['info'].to_numpy()
    y_train = train['Include/Exclude'].to_numpy()

    # NOTE(review): by default the test split contains the training rows as
    # well, so reported metrics are optimistic. Pass
    # exclude_train_from_test=True for a leakage-free evaluation.
    test = work.drop(index=train.index) if exclude_train_from_test else work
    X_test = test['info'].to_numpy()
    y_test = test['Include/Exclude'].to_numpy()

    return X_train, y_train, X_test, y_test

# Folder (on the mounted Google Drive) that holds the screening spreadsheets.
DATA_DIR = '/content/drive/MyDrive/Bio-med Roberta/dataset'

# Label spelling used by most sheets for the binary screening decision.
_BINARY_DECISION = {'Exclude': 0, 'Include': 1}


def _read_screening_sheet(filename, label_col, label_map):
    '''
    Read one screening spreadsheet and normalise it to the columns
    'Title', 'Abstract', 'Include/Exclude'.

    `label_map` is applied with DataFrame.replace (i.e. to every column, as
    before), rows containing missing values are dropped, and `label_col` is
    renamed to 'Include/Exclude'.
    NOTE(review): label values that are not keys of `label_map` are kept
    unchanged rather than dropped — verify against the sheets.
    '''
    return (
        pd.read_excel(f'{DATA_DIR}/{filename}', usecols=['Title', 'Abstract', label_col], header=0)
        .replace(label_map)
        .dropna()
        .rename(columns={label_col: 'Include/Exclude'})
    )


if dataset_name == 'NSCLC':
    # NSCLC is spread over four sheets that each encode the decision differently.
    df1 = _read_screening_sheet('01. NSCLC - Sent to CapeStart.xlsx', 'Include/Exclude', _BINARY_DECISION)
    df2 = _read_screening_sheet('09. NSCLC - Sent to CapeStart.xlsx', 'Decision', _BINARY_DECISION)
    df3 = _read_screening_sheet(
        '12. NSCLC - Sent to CapeStart.xlsx', 'First pass final decision',
        {'E1 - Review/editorial': 0, 'E3 - Study design': 0, 'E4 - Intervention': 0, 'E5 - Disease (non-NSCLC)': 0,
         'E6 - Population (non-RET+ NSCLC)': 0, 'E7 - Animal/in vitro': 0, 'I1 - Include clinical': 1, 'I2 - Include EE': 1,
         'I3 - Include HSUV': 1, 'I4 - Include cost': 1})
    df4 = _read_screening_sheet('A. NSCLC - Sent to CapeStart.xlsx', 'Accept or Reject Code', {'Reject': 0, 'Accept': 1})
    df_ = pd.concat([df1, df2, df3, df4], axis=0)

elif dataset_name == 'COVID':
    df_ = _read_screening_sheet('03. COVID - Sent to CapeStart.xlsx', 'Include/Exclude', _BINARY_DECISION)

elif dataset_name == 'CRPC':
    df_ = _read_screening_sheet('B. CRPC - Sent to CapeStart.xlsx', 'Decision', _BINARY_DECISION)

else:
    # Fail here with a clear message instead of a NameError on X_train later on.
    raise ValueError(f"Unknown dataset_name {dataset_name!r}; expected 'NSCLC', 'COVID' or 'CRPC'.")

X_train, y_train, X_test, y_test = data_processor(df_)

# Datasets and DataLoaders

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample in `data` with the label at the same position."""

    def __init__(self, data, labels):
        # Both containers are stored as given; no copying or conversion is done.
        self.data, self.labels = data, labels

    def __len__(self):
        # The dataset size is defined by the samples, not by the labels.
        return len(self.data)

    def __getitem__(self, index):
        # Return the (sample, label) pair untouched; any tensor conversion is
        # left to the consumer of the dataset.
        return self.data[index], self.labels[index]

# Wrap the arrays produced by the data-preparation cell as map-style datasets.
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

batch_size = 5  # mini-batch size shared by both loaders; adjust as needed
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the prediction
# order identical from one evaluation pass to the next.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model

In [None]:
# Device used for the model and its input tensors: the GPU when PyTorch can see
# one, otherwise the CPU.
device_str = "cuda" if torch.cuda.is_available() else "cpu"

def train(dl: DataLoader, epochs: int = 3, val_dataset: Dataset = None, recall_weight: int = 8) -> Tuple[collections.OrderedDict, transformers.PreTrainedTokenizer, dict]:
    '''
    Fine-tunes `allenai/biomed_roberta_base` as a 2-class classifier on the batches of `dl`.

    Parameters
    ----------
    dl : iterable of (texts, labels) batches (a DataLoader over text/label pairs).
    epochs : number of passes over `dl`.
    val_dataset : optional iterable of (texts, labels) batches. When given, the
        model is evaluated on it after every epoch and the weights with the
        best score are kept. It is handed to `make_preds` unchanged.
    recall_weight : weight of class-1 recall relative to class-1 precision in
        the model-selection score `recall_weight * recall[1] + precision[1]`.

    Returns
    -------
    (state_dict, tokenizer, results)
        state_dict : weights of the best-scoring epoch, or the final weights
            when no validation data was given (or no epoch produced a score).
        tokenizer : the tokenizer that was used for training.
        results : metrics dict of the best-scoring epoch (the output of
            `classification_eval` plus "Ad_hoc_score"); an empty dict when no
            validation was performed.

    The model and all tensors are placed on the module-level `device_str`.
    '''

    ''' Model and optimizer '''
    tokenizer = RobertaTokenizer.from_pretrained("allenai/biomed_roberta_base")
    model     = RobertaForSequenceClassification.from_pretrained("allenai/biomed_roberta_base",
                                                                 num_labels=2).to(device=device_str)

    optimizer = optim.Lamb(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0)

    best_val = -np.inf
    best_model_state = None   # stays None until a validation epoch beats best_val
    best_results = {}         # metrics that belong to best_model_state
    for epoch in range(epochs):
        print(f"on epoch {epoch}.")
        model.train()  # make_preds() switches to eval mode, so re-enable training each epoch
        running_losses = []

        for batch_num, (X, y) in enumerate(dl):
            optimizer.zero_grad()

            # Pad/truncate every text to exactly 128 tokens. This spells out what
            # the deprecated `pad_to_max_length=True` + `max_length` did implicitly.
            encoded = tokenizer(list(X), max_length=128,
                                add_special_tokens=True,
                                padding='max_length',
                                truncation=True,
                                return_tensors='pt')
            # as_tensor avoids the copy (and warning) of torch.tensor() on a
            # label batch that is already a tensor.
            batch_y_tensor = torch.as_tensor(y)
            model_outputs = model(encoded['input_ids'].to(device=device_str),
                                  attention_mask=encoded['attention_mask'].to(device=device_str),
                                  labels=batch_y_tensor.to(device=device_str))

            loss = model_outputs['loss']
            loss.backward()
            optimizer.step()

            # Store plain floats so the history does not hold on to device tensors.
            running_losses.append(loss.item())
            if batch_num % 10 == 0:
                recent = running_losses[-10:]
                avg_loss = sum(recent)/len(recent)
                print(f"avg loss for last 10 batches: {avg_loss}")

        if val_dataset is not None:
            preds, labels = make_preds(val_dataset, model, tokenizer, device=device_str)
            results = classification_eval(preds, labels, threshold=0.5)
            # composite, ad-hoc model-selection score
            score = recall_weight*results['recall'][1] + results['precision'][1]
            results["Ad_hoc_score"] = score

            if score > best_val:
                print("found new best parameter set; saving.")
                print("Classification Report for new best parameter")
                y_preds = np.where(np.array(preds) > 0.5, 1, 0)
                print(metrics.classification_report(labels, y_preds))
                # Deep copy: the live state_dict keeps changing in later epochs.
                best_model_state = copy.deepcopy(model.state_dict())
                best_results = results
                best_val = score

    # No validation data, zero epochs, or no epoch ever scored above -inf:
    # fall back to the weights the model currently has.
    if best_model_state is None:
        best_model_state = model.state_dict()

    return best_model_state, tokenizer, best_results

def make_preds(val_data: DataLoader, model: torch.nn.Module, tokenizer: "transformers.PreTrainedTokenizer", device: str="cuda", max_length: int = 512) -> Tuple[list, list]:
    '''
    Runs `model` over every (texts, labels) batch of `val_data` without gradients.

    Parameters
    ----------
    val_data : iterable of (texts, labels) batches; `labels` must offer `.tolist()`.
    model : sequence classifier whose output exposes `['logits']` with two
        columns (class 0, class 1). It is left in eval mode on return.
    tokenizer : callable tokenizer returning `input_ids` and `attention_mask` tensors.
    device : device the input tensors are moved to; the model must already be on it.
    max_length : every text is padded/truncated to this many tokens.

    Returns
    -------
    (preds, labels): the predicted probability of class 1 for each example and
    the matching gold labels, both as flat Python lists in iteration order.
    '''
    preds, labels = [], []
    model.eval()
    with torch.no_grad():
        for (X, y) in val_data:
            # Explicit padding/truncation replaces the deprecated
            # `pad_to_max_length=True`, which truncated only implicitly.
            encoded = tokenizer(list(X), max_length=max_length,
                                add_special_tokens=True,
                                padding='max_length',
                                truncation=True,
                                return_tensors='pt')
            model_outputs = model(encoded['input_ids'].to(device=device),
                                  attention_mask=encoded['attention_mask'].to(device=device))

            # The logits already live on the model's device; forcing `.cuda()`
            # here ignored `device` and failed on machines without a GPU.
            probs = torch.softmax(model_outputs['logits'], 1)[:, 1]
            preds.extend(probs.tolist())
            labels.extend(y.tolist())

    return (preds, labels)

# Per-class precision / recall / F1 at a probability threshold (optionally prints the full classification report).
def classification_eval(preds: list, labels: list, threshold: float = 0.5, final: bool = False) -> dict:
    '''
    Scores probabilistic predictions against gold labels.

    Parameters
    ----------
    preds : predicted probability of class 1 for each example.
    labels : gold labels (0 or 1), aligned with `preds`.
    threshold : a prediction counts as class 1 when its probability is
        strictly greater than this value.
    final : when truthy, also print sklearn's classification report.

    Returns
    -------
    dict with the keys "precision", "recall" and "f"; each value is an array
    with exactly two entries, index 0 for class 0 and index 1 for class 1.
    '''
    y_preds_binary = np.where(np.asarray(preds) > threshold, 1, 0)
    # Pin the label set: without it sklearn only reports the classes that
    # actually occur, so indexing [1] breaks when everything is class 0.
    # zero_division=0 reports a class with no predictions/examples as 0.
    p, r, f, _ = metrics.precision_recall_fscore_support(
        labels, y_preds_binary, labels=[0, 1], zero_division=0)
    if final:
        print(metrics.classification_report(labels, y_preds_binary, labels=[0, 1], zero_division=0))
    return {"precision": p, "recall": r, "f": f}

def get_weighted_sampler(dataset: Dataset) -> WeightedRandomSampler:
    '''
    Builds a sampler that draws positive and negative examples with equal total probability.

    `dataset.labels` must be array-like; entries greater than 0 count as
    positive, everything else as negative. Half of the sampling mass is spread
    evenly over the positives and half over the negatives. If one of the two
    classes is absent there is nothing to balance and all examples get the
    same weight.

    Returns a WeightedRandomSampler that draws `len(dataset.labels)` indices
    with replacement.

    Raises ValueError when the dataset has no examples.
    '''
    labels = np.asarray(dataset.labels)
    n = labels.shape[0]  # total number of examples
    if n == 0:
        raise ValueError("cannot build a weighted sampler for an empty dataset")

    # One mask drives both the count and the weight assignment, so the two can
    # never disagree (and a single positive needs no special handling).
    pos_mask = labels > 0
    n_pos = int(pos_mask.sum())
    n_neg = n - n_pos

    if n_pos == 0 or n_neg == 0:
        # Only one class present: uniform sampling instead of a division by zero.
        sample_weights = torch.full((n,), 1.0 / n, dtype=torch.float)
    else:
        # split half the mass over the negatives, half over the positives
        sample_weights = torch.full((n,), 0.5 / n_neg, dtype=torch.float)
        sample_weights[torch.from_numpy(pos_mask)] = 0.5 / n_pos

    return WeightedRandomSampler(
        weights=sample_weights,
        num_samples=n,
        replacement=True)

# Train a model and save the resulting weights to disk (Google Drive path).
def train_and_save(sr_dataset: Dataset, batch_size: int = 12,
                    epochs: int = 10, val_dataset: Dataset = None) -> "np.ndarray | None":
    '''
    Trains a classification model on the given review data and dumps the
    weights to disk. If `val_dataset` is provided, performance is evaluated on
    it each epoch and the best model is saved.

    Parameters
    ----------
    sr_dataset : passed to `train` unchanged as its batch source, so it must
        already yield (texts, labels) batches (e.g. a DataLoader).
    batch_size : currently unused. It only mattered for the weighted-sampler
        DataLoader, which is not built here; kept so existing calls still work.
    epochs : number of training epochs.
    val_dataset : optional validation batches, forwarded to `train`.

    Returns
    -------
    The per-class recall array from the metrics returned by `train`, or None
    when no validation metrics are available.

    Saving is best-effort: a failure is reported on stdout and does not raise,
    so the recall is still returned.
    '''
    # NOTE: get_weighted_sampler() could be used to build a class-balanced
    # DataLoader from a raw dataset; at present the data is used as passed in.
    model_state, tokenizer, results = train(sr_dataset, epochs=epochs, val_dataset=val_dataset)
    recall = results.get('recall') if results else None

    out_path = f'/content/drive/MyDrive/Bio-med Roberta/saved model/{dataset_name}'+'.pt'
    try:
        print(f"dumping model weights to {out_path}...")
        torch.save(model_state, out_path)
        print("Done.")
    except Exception as exc:
        # Keep the best-effort behaviour, but say why the save failed.
        print(f'Failed to save model weights to {out_path}: {exc}')
    finally:
        torch.cuda.empty_cache()
    return recall


In [None]:
# Fine-tune for 20 epochs on the training loader. The test loader is passed as
# the validation data, so it also drives the per-epoch model selection.
train_and_save(train_dataloader, batch_size=5, epochs=20, val_dataset=test_dataloader)

In [None]:
# Environment diagnostic: ask the NVIDIA driver which GPUs this runtime can see.
!nvidia-smi

In [2]:
# Environment diagnostic: print the version of the installed CUDA compiler (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [7]:
# Environment diagnostic: locate the cuDNN header to check that cuDNN is installed.
!whereis cudnn.h

cudnn.h: /usr/include/cudnn.h
