In [None]:
import pandas as pd
import transformers

In [None]:
import torch
import torch.nn as nn

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the model below are
# both loaded from this same name.
BERT_Model_str = "bert-base-uncased"

BERT_Tokenizer = transformers.BertTokenizer.from_pretrained(BERT_Model_str)
Bert_Model = transformers.BertModel.from_pretrained(BERT_Model_str)

In [None]:
# Fixed token length: every example is truncated / right-padded to this size
# by the Dataset class defined further down.
MAX_LEN = 30

# Batch-size setting (the DataLoader cell visible in this notebook does not read it).
BATCH_SIZE = 32

In [None]:
# Read the training CSV and keep only the two columns used by the rest of
# the notebook: the raw text and its label.
dataset = pd.read_csv('./dataset/train_folds.csv')[['text', 'target']]
dataset.head()

# Preparing input for BERT

#### First we will create a dataset object; indexing into it returns the inputs in the format required by the BERT model.
#### Then we will create a data loader to generate batches.

### Dataset Object

In [None]:
class Dataset(object):
    """Map-style dataset that converts (text, target) pairs into fixed-length BERT inputs.

    Indexing an instance returns a dict with the keys ``'input_ids'``,
    ``'token_type_ids'``, ``'attention_mask'`` and ``'target'``. The first
    three are ``torch.long`` tensors of length ``max_len`` (right-padded);
    ``'target'`` is a ``torch.float`` tensor built from the label.
    """

    def __init__(self, text, target, tokenizer=None, max_len=None):
        """Store the raw data and the tokenization settings.

        Args:
            text: Indexable collection of raw texts; each item is passed
                through ``str`` before tokenization.
            target: Indexable collection of labels; its length defines
                ``len(self)``.
            tokenizer: Object exposing ``encode_plus(text, max_length=...,
                truncation=...)``. Defaults to the notebook-level
                ``BERT_Tokenizer``.
            max_len: Length every example is truncated / padded to.
                Defaults to the notebook-level ``MAX_LEN``.
        """
        self.text    = text
        self.target  = target
        self.tknzr   = BERT_Tokenizer if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of examples (taken from ``target``)."""
        return len(self.target)

    def __getitem__(self, item_idx):
        """Tokenize one example and return its padded tensors as a dict."""
        # Collapse every run of whitespace into a single space. Joining with
        # '' instead would glue all the words together ("hello world" ->
        # "helloworld") before the tokenizer ever sees them.
        # (should be moved to a preprocess function later)
        text = ' '.join(str(self.text[item_idx]).split())

        tknzr_output = self.tknzr.encode_plus(
            text, max_length=self.max_len, truncation=True
        )

        input_ids      = list(tknzr_output['input_ids'])
        token_type_ids = list(tknzr_output['token_type_ids'])
        attention_mask = list(tknzr_output['attention_mask'])

        # Truncation caps the length at max_len, so this is the number of
        # positions still to be filled on the right.
        padding_length = self.max_len - len(input_ids)

        # Pad input_ids with the tokenizer's own pad id when it exposes one;
        # fall back to 0 otherwise.
        pad_token_id = getattr(self.tknzr, 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 0

        return {
            'input_ids': torch.tensor(
                input_ids + [pad_token_id] * padding_length, dtype=torch.long
            ),
            'token_type_ids': torch.tensor(
                token_type_ids + [0] * padding_length, dtype=torch.long
            ),
            # 0 marks padded positions in the attention mask.
            'attention_mask': torch.tensor(
                attention_mask + [0] * padding_length, dtype=torch.long
            ),
            'target': torch.tensor(self.target[item_idx], dtype=torch.float),
        }

In [None]:
# Hand the columns over via `.values` so that Dataset indexes them by
# position rather than by DataFrame index label.
tweet_clf_dataset = Dataset(
    text=dataset.text.values,
    target=dataset.target.values,
)

In [None]:
# Original dataset: the raw row with index label 1 (its text and target),
# shown for comparison with the tokenized version of the same example.
dataset.loc[1]

In [None]:
# Dataset object for BERT inputs: the item at position 1, returned as a dict
# of padded tensors (input_ids, token_type_ids, attention_mask, target).
tweet_clf_dataset[1]

### Data Loader for generating batches

In [None]:
# Data loader that groups dataset items into batches.
# The batch size is hard-coded to 2 here; the BATCH_SIZE constant is not used by this cell.
data_loader = torch.utils.data.DataLoader(
    dataset=tweet_clf_dataset,
    batch_size=2,
)

In [None]:
# Pull the first batch out of the loader to inspect what a batch looks like.
next(iter(data_loader))

In [1]:
# NOTE(review): scratch check that str(None) evaluates to the string 'None'.
# Possibly related to the str(...) cast applied to each text in
# Dataset.__getitem__ (confirm); looks like leftover exploration and is a
# candidate for removal.
str(None)

'None'