In [1]:
import torch
from torch.utils.data import TensorDataset, Dataset,DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer

In [2]:
# Pick the first CUDA GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
def preprocessing_for_bert(data,tokenizer_name,max_len):
    """Tokenize texts into fixed-length id/mask tensors for a pretrained BERT-style model.

    @param    data (np.array): Array (or any other iterable) of texts to be processed.
    @param    tokenizer_name (str): Name or path handed to
                  `AutoTokenizer.from_pretrained`.
    @param    max_len (int): Length every sequence is truncated/padded to.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model,
                  one row per text.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Materialise once: this lets generators through and allows an emptiness
    # check that does not trip over numpy's ambiguous truth value.
    texts = list(data)
    if len(texts) == 0:
        # Nothing to tokenize: hand back 2-D integer tensors (instead of the 1-D
        # float tensor `torch.tensor([])` would give) and skip loading the tokenizer.
        width = 0 if max_len is None else max_len
        empty = torch.empty((0, width), dtype=torch.long)
        return empty, empty.clone()

    # NOTE(review): do_lower_case=True is forwarded for every tokenizer name;
    # presumably only uncased checkpoints are used here - confirm for cased models.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True)

    # A single batched call replaces the per-sentence `encode_plus` loop. It will:
    #    (1) Tokenize each sentence
    #    (2) Add the special tokens (`[CLS]`/`[SEP]` for BERT) to the start and end
    #    (3) Truncate/Pad each sentence to `max_len`
    #    (4) Map tokens to their IDs
    #    (5) Create the attention masks
    # `padding='max_length'` is the supported spelling of the deprecated
    # `pad_to_max_length=True` flag.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )

    # Every row has the same length after padding/truncation, so the nested
    # lists convert directly into rectangular tensors.
    input_ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
    attention_masks = torch.tensor(encoded['attention_mask'], dtype=torch.long)

    return input_ids, attention_masks

In [4]:
def create_data_loader(tokenizer_name, batch_size,max_len, sentences,targets=None, predict_only=False):
    """Tokenize `sentences` and wrap the resulting tensors in a DataLoader.

    Samples are drawn in random order when targets are used and in their
    original order when `predict_only` is set.

    @param tokenizer_name: Name of tokenizer, usually the name of the model being used
    @param batch_size: Integer batch size of samples loading into model
    @param max_len: Integer maximum length of sentence allowed
    @param sentences: List of data samples X
    @param targets: List of target variables y; required unless `predict_only` is True
    @param predict_only: Boolean; when True no targets are attached to the dataset
    @return: DataLoader object that generates data for input into model. Batches are
             (input_ids, attention_masks, labels), or (input_ids, attention_masks)
             when `predict_only` is True.
    @raise ValueError: If `targets` is None while `predict_only` is False.
    """
    # Fail fast, before the (potentially slow) tokenization, with a clear message
    # rather than torch's opaque error from `torch.tensor(None)`.
    if not predict_only and targets is None:
        raise ValueError("`targets` must be provided unless predict_only=True")

    inputs, masks = preprocessing_for_bert(sentences,tokenizer_name,max_len)

    if predict_only:
        data = TensorDataset(inputs, masks)
        # Keep the input order so predictions line up with `sentences`.
        sampler = SequentialSampler(data)
    else:
        labels = torch.tensor(targets)
        data = TensorDataset(inputs, masks, labels)
        sampler = RandomSampler(data)

    return DataLoader(data, sampler=sampler, batch_size=batch_size)

In [5]:
def full_create_data_loader(tokenizer_name, batch_size,max_len, X_train,y_train, X_val,y_val):
    """Build one shuffled DataLoader over the training and validation splits combined.

    Each split is tokenized with `preprocessing_for_bert`, paired with its labels,
    and the two resulting datasets are concatenated before sampling.

    @param tokenizer_name: Name of tokenizer passed through to `preprocessing_for_bert`
    @param batch_size: Integer batch size of samples loading into model
    @param max_len: Integer maximum length passed through to `preprocessing_for_bert`
    @param X_train: Training samples
    @param y_train: Training targets
    @param X_val: Validation samples
    @param y_val: Validation targets
    @return: DataLoader drawing (input_ids, attention_masks, labels) batches in
             random order from the concatenated dataset
    """
    # First turn both splits into tensors (train before validation) ...
    tensor_groups = []
    for texts, targets in ((X_train, y_train), (X_val, y_val)):
        ids, masks = preprocessing_for_bert(texts,tokenizer_name,max_len)
        tensor_groups.append((ids, masks, torch.tensor(targets)))

    # ... then wrap each group in a dataset and glue the two together.
    merged = torch.utils.data.ConcatDataset(
        [TensorDataset(*group) for group in tensor_groups]
    )
    return DataLoader(merged, sampler=RandomSampler(merged), batch_size=batch_size)

In [1]:
# Export this notebook (dataset.ipynb) to a plain Python script with nbconvert;
# per the recorded output below, the result is written to dataset.py.
# NOTE(review): nbconvert presumably exports this cell too, as a
# `get_ipython().system(...)` call - confirm dataset.py still imports cleanly
# outside IPython, or run the export from a terminal instead.
!jupyter nbconvert --to script dataset.ipynb

[NbConvertApp] Converting notebook dataset.ipynb to script
[NbConvertApp] Writing 2647 bytes to dataset.py
