## Checking for GPU

In [None]:
!nvidia-smi

Tue Mar  7 13:58:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    28W /  70W |   8547MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU's canonical name is treated as "no GPU"
# and aborts the cell.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Choose the device used by the rest of the notebook: CUDA when PyTorch can
# see a GPU, otherwise fall back to the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Installations

In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install -U transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Reading the csv file, and merging the subcategories

In [None]:
import pandas as pd

In [None]:
data = pd.read_csv('/content/cleaned_data (1).csv')

In [None]:
# Single target column: the row-wise maximum of the six category columns.
data['toxicity'] = data[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].max(axis=1)

In [None]:
# Remove the six category columns (already folded into `toxicity`) together
# with the id, raw-text and saved-index columns.
data.drop(
    columns=[
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
        'id', 'comment_text', 'Unnamed: 0',
    ],
    inplace=True,
)

In [None]:
# NOTE(review): the 'toxic' column has already been dropped by the preceding
# drop cell, so this rename matches no column and is effectively a no-op
# (pandas ignores missing keys in `rename` by default). Later cells read
# `data.toxicity`, not 'lebel' — confirm whether this cell is still needed.
data.rename(columns={ 'toxic': 'lebel'}, inplace=True)

In [None]:
# 'Unnamed: 0' may already have been removed by the earlier drop cell;
# errors='ignore' keeps this cell from raising KeyError on Restart & Run All.
data.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [None]:
# Discard every row that contains at least one missing value.
data.dropna(axis='index', inplace=True)

In [None]:
data.head()

Unnamed: 0,cleaned_comment,toxicity
0,explanation why edits made username hardcore m...,0
1,he matches background colour i seemingly stuck...,0
2,hey man i really trying edit war it guy consta...,0
3,more i ca make real suggestions improvement i ...,0
4,you sir hero any chance remember page,0


## Importing dependencies and loading XLM-RoBERTa for tokenization

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW 
from transformers import XLMRobertaTokenizer, XLMRobertaModel

In [None]:
# Tokenizer for the pretrained 'xlm-roberta-base' checkpoint.
print("Loading XLMRobertaTokenizer ...")
tokenizer = XLMRobertaTokenizer.from_pretrained(
    "xlm-roberta-base",
    do_lower_case=True,
)

# Encoder loaded from the same checkpoint.
# NOTE(review): `model` is reassigned to the sequence-classification model in
# a later cell, so this instance appears unused — confirm it is needed.
print("Loading XLMRobertaModel ...")
model = XLMRobertaModel.from_pretrained("xlm-roberta-base")

Loading XLMRobertaTokenizer ...
Loading XLMRobertaModel ...


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Sentences and labels

In [None]:
# Raw arrays of the cleaned comment text and the matching toxicity targets.
sentences = data['cleaned_comment'].values
labels = data['toxicity'].values

## Tokenizing

In [None]:
# Tokenize all of the sentences and map the tokens to their vocabulary IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # encode_plus will:
    #   (1) Tokenize the sentence.
    #   (2) Add the tokenizer's special start/end tokens (IDs 0 and 2 in the
    #       printed example below).
    #   (3) Map tokens to their IDs.
    #   (4) Pad (ID 1) or truncate the sentence to max_length.
    #   (5) Create the attention mask that marks padding positions.
    encoded_dict = tokenizer.encode_plus(
        sent,                          # Sentence to encode.
        add_special_tokens=True,       # Add the start/end special tokens.
        max_length=128,                # Target length for every sentence.
        padding='max_length',          # Replaces deprecated pad_to_max_length=True.
        truncation=True,               # Explicit: silences the truncation warning.
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors='pt',           # Return pytorch tensors.
    )

    # Each entry is a tensor with a leading batch dimension of 1.
    input_ids.append(encoded_dict['input_ids'])

    # The attention mask simply differentiates padding from non-padding.
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence tensors along the batch dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('labels:', labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  explanation why edits made username hardcore metallica fan reverted they vandalisms closure gas i voted new york dolls fac and please remove template talk page since i retired
Token IDs: tensor([     0, 187136,  15400,  27211,      7,   7228,  38937,  11627,  24041,
         99665,    408,   1207,  39531,   3674,   1836,  19521,  10836,   4432,
             6, 170224,   9060,     17,  43374,     71,   3525,  70662,     92,
            54,  42458,   7808,    136,  22936,  87388, 110934,  22120,   9191,
         16792,     17,  90223,     71,      2,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
  

### Having a look

In [None]:
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('labels:', labels)

Original:  explanation why edits made username hardcore metallica fan reverted they vandalisms closure gas i voted new york dolls fac and please remove template talk page since i retired
Token IDs: tensor([     0, 187136,  15400,  27211,      7,   7228,  38937,  11627,  24041,
         99665,    408,   1207,  39531,   3674,   1836,  19521,  10836,   4432,
             6, 170224,   9060,     17,  43374,     71,   3525,  70662,     92,
            54,  42458,   7808,    136,  22936,  87388, 110934,  22120,   9191,
         16792,     17,  90223,     71,      2,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
  

## Train Test split

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create an 80-20 train-validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples. A dedicated, seeded
# generator makes the split identical on every run; without it the split
# depends on the global RNG state, which is only seeded in the later
# training cell.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

127,578 training samples
31,895 validation samples


## DataLoaders for training and validation

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Batch size shared by the training and validation loaders.
batch_size = 32

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# Validation batches are read in their stored order.
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)
     

## XLM-Roberta Sequence Classifier

In [None]:
from transformers import XLMRobertaForSequenceClassification, AdamW, BertConfig

# Load XLMRobertaForSequenceClassification from the pretrained
# 'xlm-roberta-base' checkpoint. The loader output below reports the
# classifier weights as newly initialized, i.e. they are learned during
# fine-tuning.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device selected at the top of the notebook (GPU when
# available, otherwise CPU) so it matches the `.to(device)` calls used for the
# batches in the training loop. `.to()` returns the model, so the cell still
# displays its architecture.
model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0): XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr

## Model view

In [None]:
# (name, tensor) pairs for every parameter of the model.
params = list(model.named_parameters())

print('The XLMRoberta model has {:} different named parameters.\n'.format(len(params)))

# Each section is a header line plus the slice of parameters listed under it.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for header, group in sections:
    print(header)
    for name, tensor in group:
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

The XLMRoberta model has 201 different named parameters.

==== Embedding Layer ====

roberta.embeddings.word_embeddings.weight               (250002, 768)
roberta.embeddings.position_embeddings.weight             (514, 768)
roberta.embeddings.token_type_embeddings.weight             (1, 768)
roberta.embeddings.LayerNorm.weight                           (768,)
roberta.embeddings.LayerNorm.bias                             (768,)

==== First Transformer ====

roberta.encoder.layer.0.attention.self.query.weight       (768, 768)
roberta.encoder.layer.0.attention.self.query.bias             (768,)
roberta.encoder.layer.0.attention.self.key.weight         (768, 768)
roberta.encoder.layer.0.attention.self.key.bias               (768,)
roberta.encoder.layer.0.attention.self.value.weight       (768, 768)
roberta.encoder.layer.0.attention.self.value.bias             (768,)
roberta.encoder.layer.0.attention.output.dense.weight     (768, 768)
roberta.encoder.layer.0.attention.output.dense.bias     

## Optimizer (AdamW -> Huggingface)

In [None]:
# AdamW here is the class imported from the huggingface `transformers`
# package (not the PyTorch one). The 'W' is believed to stand for
# "Weight Decay fix".
# NOTE(review): transformers' own AdamW is deprecated in favour of
# torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # learning rate used by this notebook
    eps=1e-8,  # Adam epsilon
)



## Epochs, training steps, and LR scheduler

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of full passes over the training set.
epochs = 3

# One optimizer step is taken per batch, so the schedule spans
# (number of batches) x (number of epochs) steps -- not the sample count.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
     

## Accuracy function

In [None]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max equals the label.

    `preds` holds one row of per-class scores per example; `labels` is an
    array of the matching class ids.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

## Format time

In [None]:
import time
import datetime

def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second and rendered with
    ``datetime.timedelta``'s string form.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## Imports of dependencies before training 

In [None]:
import time
import datetime
import random
import numpy as np
import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup


## Training

In [None]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every RNG in play (Python, NumPy, PyTorch CPU and CUDA) so the run is
# repeatable from this point on.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Statistics (losses, accuracy, timings) are collected in this list.
training_stats = []

# Wall-clock start of the whole run.
total_t0 = time.time()

# NOTE(review): this loop only trains. The validation cell further down is at
# top level, so it runs once after the final epoch rather than once per epoch.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # `train()` only switches the *mode* (e.g. dropout behaves differently
    # than in evaluation); it does not perform any training by itself.
    # (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors, each copied to `device`:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step; PyTorch
        # accumulates them otherwise.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Forward pass. Because labels are supplied, the first element of the
        # output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]
        total_train_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    # Report the per-epoch figures here: both variables are overwritten by the
    # next epoch, so without this only the final epoch's values survive.
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))


Training...
  Batch    40  of  3,987.    Elapsed: 0:00:28.
  Batch    80  of  3,987.    Elapsed: 0:00:55.
  Batch   120  of  3,987.    Elapsed: 0:01:23.
  Batch   160  of  3,987.    Elapsed: 0:01:50.
  Batch   200  of  3,987.    Elapsed: 0:02:18.
  Batch   240  of  3,987.    Elapsed: 0:02:45.
  Batch   280  of  3,987.    Elapsed: 0:03:12.
  Batch   320  of  3,987.    Elapsed: 0:03:40.
  Batch   360  of  3,987.    Elapsed: 0:04:07.
  Batch   400  of  3,987.    Elapsed: 0:04:35.
  Batch   440  of  3,987.    Elapsed: 0:05:02.
  Batch   480  of  3,987.    Elapsed: 0:05:30.
  Batch   520  of  3,987.    Elapsed: 0:05:57.
  Batch   560  of  3,987.    Elapsed: 0:06:25.
  Batch   600  of  3,987.    Elapsed: 0:06:52.
  Batch   640  of  3,987.    Elapsed: 0:07:20.
  Batch   680  of  3,987.    Elapsed: 0:07:47.
  Batch   720  of  3,987.    Elapsed: 0:08:15.
  Batch   760  of  3,987.    Elapsed: 0:08:42.
  Batch   800  of  3,987.    Elapsed: 0:09:10.
  Batch   840  of  3,987.    Elapsed: 0:09:37.


## Average training loss

In [None]:
avg_train_loss

0.07614396950366653

## Validation 

In [None]:
print("")
print("Running Validation...")

t0 = time.time()

# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

# Tracking variables. Correct predictions are counted per example so that a
# smaller final batch does not get the same weight as a full one.
total_eval_correct = 0.0
total_eval_examples = 0
total_eval_loss = 0

# Evaluate data for one pass over the validation set.
for batch in validation_dataloader:

    # `batch` contains three pytorch tensors, each copied to `device`:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

    # Accumulate the validation loss.
    loss = outputs[0]
    total_eval_loss += loss.item()
    logits = outputs[1]

    # Move logits and labels to CPU.
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Convert this batch's accuracy back into a count of correct predictions
    # and accumulate it together with the number of examples seen.
    batch_examples = len(label_ids)
    total_eval_correct += flat_accuracy(logits, label_ids) * batch_examples
    total_eval_examples += batch_examples

# Report the final accuracy for this validation run.
avg_val_accuracy = total_eval_correct / total_eval_examples
print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(validation_dataloader)

# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)

print("  Validation Loss: {0:.2f}".format(avg_val_loss))
print("  Validation took: {:}".format(validation_time))

# Record the statistics. `epoch_i`, `avg_train_loss` and `training_time` are
# whatever the training cell left behind, i.e. the values of its final epoch.
training_stats.append(
    {
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accur.': avg_val_accuracy,
        'Training Time': training_time,
        'Validation Time': validation_time
    }
)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
     


Running Validation...
  Accuracy: 0.96
  Validation Loss: 0.11
  Validation took: 0:03:12

Training complete!
Total training took 2:21:57 (h:mm:ss)


## Tabular display of times and losses (times are per epoch)

In [None]:
import pandas as pd

# Display floats with two decimal places. The fully qualified option name is
# used because the bare 'precision' pattern is ambiguous in newer pandas
# releases and raises OptionError.
pd.set_option('display.precision', 2)

# Build a DataFrame from the collected statistics, indexed by epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,0.08,0.11,0.96,0:45:39,0:03:12


## Saving and loading the fine-tuned model and tokenizer

### Saving

In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = '/content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save'

# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, so there is no gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer with
# `save_pretrained()`; they can then be reloaded using `from_pretrained()`.
# If the model is wrapped (it exposes a `.module` attribute, as
# distributed/parallel wrappers do), save the wrapped model instead.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


Saving model to /content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save


('/content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save/tokenizer_config.json',
 '/content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save/special_tokens_map.json',
 '/content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save/sentencepiece.bpe.model',
 '/content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save/added_tokens.json')

In [None]:
!pip install transformers

from transformers import XLMRobertaForSequenceClassification

output_dir = '/content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save'

print(output_dir)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
/content/drive/My Drive/sentiment_datasets/xlm-roberta_model_save


### Loading

In [None]:
from transformers import XLMRobertaTokenizer
import torch
# Restore the XLM-RoBERTa tokenizer and the fine-tuned classifier from the
# directory they were saved to.
print("Loading XLMRobertaTokenizer...")
tokenizer = XLMRobertaTokenizer.from_pretrained(output_dir)
model_loaded = XLMRobertaForSequenceClassification.from_pretrained(output_dir)

Loading XLMRobertaTokenizer...


In [None]:
# Let's check it for a given sentence

# Hindi
hindi_negative_sent = "तुम गांड मरवाओ , भोसड़ीवाले, तुम्हे काम करना नहीं आता , नाकारा, नपुंसक, बेकार, अनपढ़,गवर, "
hindi_negative_sentence="बाबड़ी चूड, तन्मय शर्मा, शर्म करले, वार्ना इतना मरूंगा न , गांड सुजा दूंगा , साले,काले,लौड़े, है में तेरा नौकर बैठा हु क्या "

# Marathi
marathi_positive="खूप चांगला आहे, "
marathi_negative="لझवाडिया, आई ची गांड, "

# Arabic
arabic_negative = "سأقتلك إذا وجدتك أيها الخنزير الفاسد لقد دمرت كل شيء"
arabic_positive="أنت جميلة ، أنت تلينني ، أنت تضعفني. أنت فتاة أحلامي."

# Gujarati
gujarati_negative ="હું તને ખતમ કરી નાખીસ, એટલો મારા કે તારું ડાચું બાર આવી જાશે, સમજી લેજે, લોઈ લોઈ થઇ જશે,દાટ થોડી નાખઆ લોડા ભોસમારીના મગજ નો ભોસ્ડો ના કરીશ સમજાઈ દેવ ચુ તને ઘોરીના "
gujarati_positive = "ચોદવા એવું, સુ કેઈ ડાર્લિંગ "

# Mix of english and marathi
engarati_negative = "You are so beautiful my princess, I love you so much. لझवाडिया, आई ची गांड,"


# Encode the sentence to classify. Swap `marathi_positive` for any of the
# sample sentences defined above to try a different input.
encoded_dict = tokenizer.encode_plus(
    marathi_positive,              # Sentence to classify.
    add_special_tokens=True,       # Add the tokenizer's start/end special tokens.
    max_length=64,                 # Target length for the encoded sentence.
    padding='max_length',          # Replaces deprecated pad_to_max_length=True.
    truncation=True,               # Explicitly truncate inputs longer than max_length.
    return_attention_mask=True,    # Construct attn. masks.
    return_tensors='pt',           # Return pytorch tensors.
)

# Token IDs and the attention mask (which differentiates padding from
# non-padding). Both are already tensors because of return_tensors='pt';
# `.long()` only guarantees the integer dtype.
input_id = encoded_dict['input_ids'].long()
attention_mask = encoded_dict['attention_mask'].long()



## Moving the model and inputs to the GPU if available, otherwise the CPU

In [None]:
# Use the GPU when PyTorch can see one, otherwise the CPU, and place the
# reloaded model and the encoded inputs on that same device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_loaded = model_loaded.to(device)
input_id = input_id.to(device)
attention_mask = attention_mask.to(device)

## Inference

In [None]:
# Forward pass without gradient tracking; no labels are passed, and the first
# element of the output is used as the logits.
with torch.no_grad():
    outputs = model_loaded(
        input_id,
        token_type_ids=None,
        attention_mask=attention_mask,
    )
logits = outputs[0]
def softmax(vector):
    """Return the softmax of `vector` computed along its last axis.

    Accepts anything `np.asarray` can convert (lists, NumPy arrays, CPU
    tensors that do not require grad). A 1-D input yields one distribution;
    a 2-D input is normalised row by row, so every row sums to 1.

    The per-row maximum is subtracted before exponentiating. This leaves the
    result unchanged mathematically but prevents overflow for large logits.
    """
    values = np.asarray(vector)
    shifted = values - values.max(axis=-1, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=-1, keepdims=True)

# Convert the logits to probabilities on the CPU.
prob = softmax(logits.to('cpu'))
index = logits.argmax()

# Column 0 is reported as "Non-Toxic", column 1 as "Toxic".
print("Non-Toxic" if prob[0][0] > prob[0][1] else "Toxic")

Non-Toxic
