### Custom parameters

In [9]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-05

In [1]:
#PyTorch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch

#transformer
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput

#dataset
from datasets import load_dataset
import pandas as pd
import numpy as np

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#utils
from sklearn import metrics, preprocessing
from tqdm.notebook import tqdm_notebook

## Load model and tokenizer

In [3]:
# create the tokenizer and the model

tokenizer = BertTokenizer.from_pretrained("pytorch/", do_lower_case=False, num_labels=13)
base_model = BertForSequenceClassification.from_pretrained("pytorch/", num_labels=13)
e = base_model.eval()

Some weights of the model checkpoint at pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pytorch/

In [33]:
def tokenize_message(text):
        t = tokenizer(text["text"], padding="max_length",add_special_tokens=True, truncation=True, max_length=MAX_LEN, return_token_type_ids=True)
        return t

## Load datasets

In [34]:
def transform_to_one_hot_encoding(encoder, label, number_of_labels):
    one_hot = np.zeros(number_of_labels, dtype=int)
    index = encoder.transform(label)
    one_hot[index] = 1
    return one_hot

In [38]:
#alloy_ds = load_dataset("csv", data_files={"train": "../wp/train_alloy_ds.csv", "test": "../wp/test_alloy_ds.csv"})
alloy_ds = load_dataset("csv", data_files="./alloy_dataset.csv")
tokenized_dataset = alloy_ds.map(tokenize_message, batched=True)
wp_ds = tokenized_dataset["train"].train_test_split(0.2)
#alloy_ds["label"] = alloy_ds["label"].map(transform_to_one_hot_encoding)

Using custom data configuration default-a61f73d22ac6b217
Reusing dataset csv (/home/juan/.cache/huggingface/datasets/csv/default-a61f73d22ac6b217/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/juan/.cache/huggingface/datasets/csv/default-a61f73d22ac6b217/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-7291e5c821fdc584.arrow


In [39]:
wp_ds["train"].__getitem__(0)

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [11]:
parsed_dataset = alloy_ds.copy()
parsed_dataset["label"] = alloy_ds["label"].apply(lambda x: transform_to_one_hot_encoding(label_enc, [x], len(label_enc.classes_)))
parsed_dataset["text"] = alloy_ds["text"].map(tokenize_message)
parsed_dataset

Unnamed: 0,label,text
0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[input_ids, token_type_ids, attention_mask]"
1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[input_ids, token_type_ids, attention_mask]"
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[input_ids, token_type_ids, attention_mask]"
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[input_ids, token_type_ids, attention_mask]"
4,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[input_ids, token_type_ids, attention_mask]"
...,...,...
70465,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[input_ids, token_type_ids, attention_mask]"
70466,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[input_ids, token_type_ids, attention_mask]"
70467,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[input_ids, token_type_ids, attention_mask]"
70468,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[input_ids, token_type_ids, attention_mask]"


## Tokenize dataset

## Custom Dataset class

### Create proper datasets

In [None]:
training_args = TrainingArguments("train-test")
trainer = Trainer(model=base_model, args=training_args, train_dataset=wp_ds["train"], eval_dataset=wp_ds["test"])

In [None]:
trainer.train()