In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 33.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 55.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

# Gathering Data

In [3]:
df1 = pd.read_csv("data.csv")
df2 = pd.read_csv("data2.csv")
df3 = pd.read_csv("data3.csv")

train = pd.concat([df1, df2])
train['class'] = train['class'].replace(['N', 'R', 'D'], [0, 1, 2])
train['class'] = pd.to_numeric(train['class'], errors='coerce')
train = train.dropna()

test = df3
test['class'] = test['class'].replace(['N', 'R', 'D'], [0, 1, 2])
test['class'] = pd.to_numeric(test['class'], errors='coerce')
test = test.dropna()
train.head()

Unnamed: 0,tweets,class
0,"Hopefully we aren't jumping the gun here, but ...",0
1,Unfortunately it doesn't matter whether gun ow...,0
2,"I support Marco Rubio, who believes the soluti...",1
3,Gun rights are individual liberties and are st...,1
4,If you still can not understand why parenting ...,1


# Training Datasets

In [4]:
sentences = train['tweets']
sentences = ["[CLS]" + str(sentence) + "[SEP]" for sentence in sentences]
sentences[:5]

["[CLS]Hopefully we aren't jumping the gun here, but things really need to improve if Elon really wants Twitter to have a future under his ownership.[SEP]",
 "[CLS]Unfortunately it doesn't matter whether gun owners support or don't support the NRA. They benefit from the firearm industry and will avoid transparency that allows responsible gun owners from having any actual facts to base their decisions of ownership and legislation upon.[SEP]",
 '[CLS]I support Marco Rubio, who believes the solution to gun violence is to identify potential perpetrators and remove their guns, not ban gun ownership [SEP]',
 '[CLS]Gun rights are individual liberties and are strictly protected by the Constitution, I support Marco Rubio, who believes the solution to gun violence is to identify potential perpetrators and remove their guns, not ban gun ownership[SEP]',
 '[CLS]If you still can not understand why parenting and gun ownership are major responsibilities, please stay childless and unarmed.   \n[SEP]']

In [5]:
labels = train['class'].values
labels

array([0, 0, 1, 1, 1, 1, 2, 0, 1, 1, 2, 0, 0, 2, 1, 1, 1, 1, 1, 2, 1, 0,
       0, 2, 2, 0, 1, 1, 0, 1, 2, 0, 2, 1, 1, 0, 0, 0, 2, 1, 2, 2, 0, 1,
       0, 2, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 1, 1,
       0, 1, 0, 0, 2, 1, 2, 0, 2, 2, 0, 1, 1, 0, 0, 1, 2, 1, 0, 2, 2, 2,
       0, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0,
       2, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2,
       2, 1, 1, 0, 1, 0, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1, 0, 1, 0, 1, 1, 2,
       2, 1, 1, 2, 1, 1, 1, 2, 0, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 0, 1,
       0, 1, 0, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 0, 0, 1,
       2, 0, 2, 1, 0, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0, 0, 2, 2, 0, 2,
       1, 2, 1, 0, 1, 2, 2, 0, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2,
       1, 1, 1, 2, 1, 2, 0, 1, 1, 1, 0, 2, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2,
       0, 1, 2, 1, 2, 1, 1, 1, 0, 1, 1, 2, 1, 1, 0, 1, 2, 1, 0, 1, 2, 1,
       1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 2, 1, 1,

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case = False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print(sentences[0])
print(tokenized_texts[0])

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

[CLS]Hopefully we aren't jumping the gun here, but things really need to improve if Elon really wants Twitter to have a future under his ownership.[SEP]
['[CLS]', 'Hope', '##fully', 'we', 'aren', "'", 't', 'jumping', 'the', 'gun', 'here', ',', 'but', 'things', 'really', 'need', 'to', 'improve', 'if', 'Elo', '##n', 'really', 'wants', 'Twitter', 'to', 'have', 'a', 'future', 'under', 'his', 'ownership', '.', '[SEP]']


In [7]:
#max length of sequence for input tokens (CAN EDIT FOR BETTER EXPERIMENT) (if we're changing we have to change it for testing dataset below..)
MAX_LEN = 64

#token into number index
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

#concat the sentences into max len and fill the empty with 0s
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')

input_ids[0]

array([  101, 22091, 42920, 11951, 99045,   112,   188, 99257, 10105,
       23103, 19353,   117, 10473, 24682, 30181, 17367, 10114, 33992,
       12277, 94036, 10115, 30181, 45769, 24309, 10114, 10529,   169,
       16711, 10571, 10226, 41354,   119,   102,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0])

In [8]:
#initialization of attention mask
attention_masks = []

#if attention mask is not padding = 1, if padding = 1
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [9]:
#getting train and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    labels, 
                                                                                    random_state=2000, 
                                                                                    test_size=0.1)

#train and validation sets for attention mask
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2000, 
                                                       test_size=0.1)

#change the train and validation sets into tensors
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)				

print(train_inputs[0])
print(train_labels[0])
print(train_masks[0])
print(validation_inputs[0])
print(validation_labels[0])
print(validation_masks[0])                                              

tensor([  101, 10117, 10609, 10859, 10393,   169, 23103, 18077,   119, 14490,
        15911, 10169, 68930, 10165, 23103, 41354, 28160, 12888, 13172, 15306,
        23103, 26342,   119, 12489, 10944,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
tensor(1)
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([   101,  10117,  14105,  10392,  19342,  10426,  72894,  22201,  10114,
           169,  11206, 106125,  86957,    119,  10117,  60527,  19231,  12606,
         10246,  52339,  58839,  101

In [10]:
#(CAN EDIT FOR BETTER EXPERIMENT) (have to change below for testing set as well)
batch_size = 16

#train the data using pytorch's DataLoader, mask, label as set of datas with batch size of batch_size
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Test dataset

In [11]:
sentences = test['tweets']
sentences[:5]

0    Gun violence cannot be abstracted from a broad...
1    No, the GOP House and Senate were too busy aut...
2    Some of grounding for us to start the conversa...
3    We shouldn’t be forced to live in USA when Can...
4    Many of us comment from afar. Distance gives p...
Name: tweets, dtype: object

In [12]:
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
sentences[:5]

['[CLS] Gun violence cannot be abstracted from a broader culture of violence and authoritarianism that calls for more gun ownership, more police, and more national security. [SEP]',
 '[CLS] No, the GOP House and Senate were too busy automatically blocking any progress, even if it benefitted their own [SEP]',
 '[CLS] Some of grounding for us to start the conversation on Kenyans to be armed and probably have a better regulation regime on gun ownership [SEP]',
 '[CLS] We shouldn’t be forced to live in USA when Canada is passing laws to stop gun sales now and possibly gun ownership later. [SEP]',
 "[CLS] Many of us comment from afar. Distance gives perspective. Uncontrolled gun ownership with few checks, seems crazy to us. We live in countries where mass shootings are rare. It appears illogical and immoral to us to combat mass shootings with 'thoughts and prayers' only. [SEP]"]

In [13]:
labels = test['class'].values
labels

array([2, 2, 1, 2, 2, 2, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 2, 1, 1, 0,
       2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 0, 2, 0, 0,
       2, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 0,
       0, 1, 1, 2, 0, 1, 2, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 2, 1, 1,
       1, 0, 0, 1, 0, 2, 1, 1, 1, 2, 1])

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


In [15]:
#max length of sequence for input tokens (CAN EDIT FOR BETTER EXPERIMENT) (change training dataset as well..)
MAX_LEN = 32

#token into number index
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

#concat the sentences into max len and fill the empty with 0s
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')

input_ids[0]

array([  101, 31328, 26342, 25259, 10347, 66770, 10336, 10188,   169,
       41848, 10165, 15162, 10108, 26342, 10111, 17034, 63378, 13397,
       10189, 31886, 10142, 10798, 23103, 41354,   117, 10798, 15034,
         117, 10111, 10798, 11844, 21849])

In [16]:
#initialization of attention mask
attention_masks = []

#if attention mask is not padding = 1, if padding = 1
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [17]:
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

print(test_inputs[0])
print(test_labels[0])
print(test_masks[0])

tensor([  101, 31328, 26342, 25259, 10347, 66770, 10336, 10188,   169, 41848,
        10165, 15162, 10108, 26342, 10111, 17034, 63378, 13397, 10189, 31886,
        10142, 10798, 23103, 41354,   117, 10798, 15034,   117, 10111, 10798,
        11844, 21849])
tensor(2)
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])


In [18]:
#(CAN EDIT FOR BETTER EXPERIMENT) (change in testing dataset as well..)
batch_size = 16

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Modeling

In [19]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [20]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)
model.cuda()

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [21]:
# Selecting optimizer for Model, we're using ADAM, could use SGD..  (CAN EDIT FOR BETTER EXPERIMENT) (we have to change it to other optimizer if we want to change)
optimizer = AdamW(model.parameters(),
                  lr = 2.1e-5, # learning rate
                  eps = 1e-8 # epsilon
                )

#(CAN EDIT FOR BETTER EXPERIMENT) (making it bigger might overfit, so make it lower)
epochs = 20

total_steps = len(train_dataloader) * epochs

# setting scheduler for different learning rate (CAN EDIT FOR BETTER EXPERIMENT) (not sure about this one, but we could change num_warmup_stemps)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



# Training the Model

In [22]:
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [23]:
torch.cuda.empty_cache()
# so we can check it over with this set seed value
seed_val = 30
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# reset gradient
model.zero_grad()

for epoch_i in range(0, epochs):
    
    print('Epoch: ', epoch_i)
    print('Training...')
    total_loss = 0

    model.train()
        
    for step, batch in enumerate(train_dataloader):

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        #forward 
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        #calculating loss 
        loss = outputs[0]

        #calculating total loss
        total_loss += loss.item()

        #back propogation to calculate gradients
        loss.backward()

        #gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        #updating the optimizer
        optimizer.step()

        #decreasing the learning rate using the scheduler
        scheduler.step()

        #reset the gradient
        model.zero_grad()

    #calculating average loss
    avg_train_loss = total_loss / len(train_dataloader)            

    print("  Average training loss: {0:.2f}".format(avg_train_loss))
        
    # ========================================
    #               Validation
    # ========================================

    print("Running Validation...")

    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():     
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))

print("")
print("Training complete!")

Epoch:  0
Training...
  Average training loss: 1.06
Running Validation...
  Accuracy: 0.47
Epoch:  1
Training...
  Average training loss: 0.97
Running Validation...
  Accuracy: 0.41
Epoch:  2
Training...
  Average training loss: 0.81
Running Validation...
  Accuracy: 0.56
Epoch:  3
Training...
  Average training loss: 0.58
Running Validation...
  Accuracy: 0.47
Epoch:  4
Training...
  Average training loss: 0.38
Running Validation...
  Accuracy: 0.38
Epoch:  5
Training...
  Average training loss: 0.24
Running Validation...
  Accuracy: 0.38
Epoch:  6
Training...
  Average training loss: 0.11
Running Validation...
  Accuracy: 0.44
Epoch:  7
Training...
  Average training loss: 0.06
Running Validation...
  Accuracy: 0.56
Epoch:  8
Training...
  Average training loss: 0.03
Running Validation...
  Accuracy: 0.34
Epoch:  9
Training...
  Average training loss: 0.03
Running Validation...
  Accuracy: 0.44
Epoch:  10
Training...
  Average training loss: 0.03
Running Validation...
  Accuracy: 0.4

# Evaluating Testing set

In [24]:
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for step, batch in enumerate(test_dataloader):

    batch = tuple(t.to(device) for t in batch)
    
    b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))


Accuracy: 0.49


In [27]:
def convert_input_data(sentences):

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    MAX_LEN = 128
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [29]:


test_sentence = ['Parents must have a gun to keep their children safe']
model.eval()
inputs, masks = convert_input_data(test_sentence)
b_input_ids = inputs.to(device)
b_input_mask = masks.to(device)

with torch.no_grad():
  outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        
logits = outputs[0]
logits = logits.detach().cpu().numpy()

print(logits)

if np.argmax(logits) == 1 :
    print("R")
elif np.argmax(logits) == 2 :
    print("D")
elif np.argmax(logits) == 0 :
    print("N")

[[-0.9173359 -3.1253111  3.7323787]]
D
