In [19]:
# Testing out the roberta model
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import numpy as np
import pandas as pd
from tqdm import tqdm
MODEL_NAME = 'xlm-roberta-base'

In [15]:
class RobertaDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes pickled text samples on access.

    The pickle at ``dataset_path`` is read with ``pd.read_pickle`` and must
    contain a ``tokens`` column (sequences of strings, joined with single
    spaces to form the text) plus a ``stance`` or ``category`` column whose
    values are convertible to ``torch.long``.

    Args:
        dataset_path: Path handed to ``pd.read_pickle``.
        tokenizer: Object providing ``encode`` and ``encode_plus`` with the
            keyword arguments used below.
        category: When true, targets are read from the ``category`` column;
            otherwise from ``stance``.

    Attributes:
        data: Series of joined texts, re-indexed to 0..n-1.
        targets: Series of targets, re-indexed to 0..n-1.
        max_len: Length every sample is padded/truncated to.
        tokenizer: The tokenizer passed to the constructor.
    """

    def __init__(self, dataset_path, tokenizer, category=False):
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        dataset = pd.read_pickle(dataset_path)
        target_column = 'category' if category else 'stance'

        # Drop whatever index the pickled frame carried: DataLoader asks for
        # positions 0..len-1, and a filtered/shuffled index would otherwise
        # raise KeyError (or silently return a different row).
        self.data = dataset['tokens'].apply(lambda x: ' '.join(x)).reset_index(drop=True)
        self.targets = dataset[target_column].reset_index(drop=True)
        self.tokenizer = tokenizer
        del dataset

        # Find the longest encoded sample so every item can share one length.
        max_len = 0
        for text in tqdm(self.data):
            input_ids = tokenizer.encode(text, add_special_tokens=True)
            max_len = max(max_len, len(input_ids))

        # Never pad beyond what the tokenizer says the model accepts; longer
        # samples are truncated in __getitem__ instead.
        limit = getattr(tokenizer, 'model_max_length', None)
        if isinstance(limit, int) and 0 < limit < max_len:
            max_len = limit
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, item):
        """Encode sample ``item`` (a 0-based position).

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
            length ``max_len`` and a scalar ``targets`` tensor (torch.long).
        """
        data = self.data.iloc[item]
        inputs = self.tokenizer.encode_plus(
            data,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            # Without this, a sample longer than max_len comes back longer
            # than its neighbours and cannot be stacked into a batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(self.targets.iloc[item], dtype=torch.long)
        }

In [17]:
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [20]:
# Training split: wrap the pickled samples in a dataset, then batch them.
train_dataset = RobertaDataset(
    dataset_path='output/train_1_original.pkl',
    tokenizer=XLMRobertaTokenizer.from_pretrained(MODEL_NAME),
)
# Batches of 64 samples, reshuffled by the loader on every pass.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)

100%|██████████| 6988/6988 [00:01<00:00, 4592.95it/s]


In [23]:
from transformers import BertPreTrainedModel, XLMRobertaConfig

In [24]:
# Load the checkpoint's configuration, overriding the number of labels.
# NOTE(review): num_labels=10 here, while the RobertaModel head defined in a
# later cell defaults to 3 outputs, and `config` is not referenced by any
# other cell shown — confirm which value is intended.
config = XLMRobertaConfig.from_pretrained(MODEL_NAME, num_labels=10)

In [None]:
# build the model

class RobertaModel(torch.nn.Module):
    """Pretrained XLM-RoBERTa encoder with a dropout + linear classifier head.

    Args:
        model_name: Checkpoint identifier passed to
            ``XLMRobertaModel.from_pretrained``.
        num_labels: Width of the output layer. Defaults to 3, the value that
            was previously hard-coded.
    """

    def __init__(self, model_name, num_labels=3):
        super().__init__()
        self.model = XLMRobertaModel.from_pretrained(model_name)
        self.drop = torch.nn.Dropout(p=0.3)
        self.out = torch.nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for a batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Output of the linear head applied to the encoder's second output
            (the pooled representation) after dropout.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: when the encoder returns a
        # dict-like ModelOutput, unpacking iterates its *keys* (strings) and
        # dropout then fails on a str. Integer indexing works for both the
        # tuple and the ModelOutput return styles.
        pooled = outputs[1]
        return self.out(self.drop(pooled))

In [9]:
# The assignment was left without a right-hand side (a syntax error that
# stops the whole cell). Load the tokenizer that matches the checkpoint, the
# same way the dataloader cell does.
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
# Bare encoder (no task head) loaded from the same checkpoint.
model = XLMRobertaModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Load the pickled train and dev splits.
# NOTE(review): unpickling can execute arbitrary code — only read these files
# if they come from a trusted source.
train_data = pd.read_pickle('output/train_1_original.pkl')
dev_data = pd.read_pickle('output/dev_1_original.pkl')

In [11]:
def xlm_roberta_encode(texts, tokenizer, max_length=512,):
    """Encode texts into fixed-width id / mask / type-id arrays.

    Each text is tokenized, truncated to ``max_length - 2`` tokens, wrapped in
    the begin/end special tokens and right-padded to ``max_length``.

    Args:
        texts: Sized iterable of strings (``len`` and iteration are used).
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
            Its ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id`` are
            used when present; otherwise the previously hard-coded ids
            0 / 2 / 1 are used.
        max_length: Width of every output row; must be at least 2 so the two
            special tokens fit.

    Returns:
        dict of int32 arrays shaped ``(len(texts), max_length)``:
        ``input_word_ids`` (token ids), ``input_mask`` (1 on real tokens,
        0 on padding) and ``input_type_ids`` (all zeros).

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """
    if max_length < 2:
        raise ValueError(
            'max_length must be at least 2 to hold the two special tokens, '
            'got {}'.format(max_length)
        )

    def _special_id(attribute, fallback):
        # Ask the tokenizer first; fall back to the original literal id.
        value = getattr(tokenizer, attribute, None)
        return fallback if value is None else value

    cls_id = _special_id('cls_token_id', 0)
    sep_id = _special_id('sep_token_id', 2)
    pad_id = _special_id('pad_token_id', 1)

    ct = len(texts)
    input_ids = np.full((ct, max_length), pad_id, dtype='int32')
    attention_mask = np.zeros((ct, max_length), dtype='int32')
    token_type_ids = np.zeros((ct, max_length), dtype='int32') # Not used in text classification

    for k, text in enumerate(texts):
        # Tokenize
        tok_text = tokenizer.tokenize(text)

        # Truncate and convert tokens to numerical IDs
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:(max_length-2)])

        # The slice above guarantees this never exceeds max_length.
        input_length = len(enc_text) + 2

        # Wrap the ids in the begin / end special tokens
        input_ids[k,:input_length] = np.asarray([cls_id] + enc_text + [sep_id], dtype='int32')

        # Set to 1s in the attention input
        attention_mask[k,:input_length] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [26]:
# `max_len` is not assigned in any visible cell, so a fresh
# Restart & Run All stopped here with a NameError. Reuse the padded length
# already measured on the training split by `train_dataset`.
# NOTE(review): that length was measured on the joined `tokens` column, not
# on `text` — confirm it is the intended width. Longer inputs are truncated
# by xlm_roberta_encode rather than raising.
max_len = train_dataset.max_len

x_train = xlm_roberta_encode(train_data['text'], tokenizer, max_length=max_len)
x_dev = xlm_roberta_encode(dev_data['text'], tokenizer, max_length=max_len)

# convert to tensors
x_train = {k: torch.tensor(v) for k, v in x_train.items()}
x_dev = {k: torch.tensor(v) for k, v in x_dev.items()}

In [27]:
# Class-index targets must be 64-bit integers for PyTorch classification
# losses, and RobertaDataset already emits torch.long targets; int32 labels
# here were inconsistent with that and are rejected by the loss.
y_train = train_data['stance'].to_numpy(dtype='int64')
y_dev = dev_data['stance'].to_numpy(dtype='int64')

# convert to tensors (dtype stated explicitly so it cannot drift)
y_train = torch.tensor(y_train, dtype=torch.long)
y_dev = torch.tensor(y_dev, dtype=torch.long)

In [None]:
def build_model(n_categories):
    """Build and compile a Keras classifier on top of the notebook's encoder.

    Relies on three notebook-level names: ``tf``, ``MAX_LEN`` and ``model``.
    NOTE(review): none of the visible cells imports TensorFlow as ``tf`` or
    defines ``MAX_LEN``, and the notebook-level ``model`` is created with
    ``XLMRobertaModel`` — confirm a Keras-compatible encoder and these names
    are provided before calling this.

    Args:
        n_categories: Number of output classes (width of the softmax layer).

    Returns:
        The compiled ``tf.keras.Model`` taking ``input_word_ids``,
        ``input_mask`` and ``input_type_ids``.
    """
    input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

    # Run the notebook-level encoder. The classifier built below is kept in a
    # differently named local: assigning to `model` inside this function made
    # the name local and turned this call into an UnboundLocalError.
    x = model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)
    # Huggingface transformers have multiple outputs, embeddings are the first one,
    # so let's slice out the first position
    x = x[0]

    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

    classifier = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
    classifier.compile(
        # `lr` is the deprecated spelling of this argument.
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    return classifier