In [2]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import numpy as np
import pandas as pd
import torch
import evaluate
import wandb
import random
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from google.colab import userdata # hf token import from secrets

In [7]:
wandb.init(mode='disabled')
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [5]:
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
transformers.set_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [9]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [11]:
def distilbert(text_column, repository_id):
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def preprocess_function(examples):
        return tokenizer(examples[text_column], truncation=True)

    # data
    columns = [text_column] + ['label']
    df_train = pd.read_csv("train.csv")
    df_train = df_train[columns]

    df_test = pd.read_csv("test.csv")
    df_test = df_test[columns]

    train_dataset = Dataset.from_pandas(df_train).train_test_split(test_size=0.15)
    test_dataset = Dataset.from_pandas(df_test)

    print(train_dataset)
    print(test_dataset)

    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    model = model.to(device)
    print(model)

    training_args = TrainingArguments(
        output_dir="my_awesome_model",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        eval_steps=20,
        logging_steps=20,
        weight_decay=0.01,
        eval_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        report_to="tensorboard",
        push_to_hub=True,
        hub_strategy="every_save",
        hub_model_id=repository_id,
        hub_token=userdata.get('hf'),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset['train'],
        eval_dataset=tokenized_train_dataset['test'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer.evaluate(eval_dataset=tokenized_test_dataset)

### distilbert for unmasked

In [12]:
print(distilbert(
    text_column='candidate',
    repository_id = "distilbert-base-uncased-0.0.1"
))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['candidate', 'label'],
        num_rows: 3455
    })
    test: Dataset({
        features: ['candidate', 'label'],
        num_rows: 610
    })
})
Dataset({
    features: ['candidate', 'label'],
    num_rows: 1718
})


Map:   0%|          | 0/3455 [00:00<?, ? examples/s]

Map:   0%|          | 0/610 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
20,0.6647,0.638682,0.62623
40,0.5899,0.573101,0.686885
60,0.5292,0.525568,0.711475
80,0.4857,0.522061,0.714754
100,0.4849,0.517077,0.727869
120,0.4495,0.495654,0.737705
140,0.3984,0.473347,0.75082
160,0.4153,0.489255,0.752459
180,0.4272,0.466593,0.744262
200,0.3779,0.47019,0.768852


{'eval_loss': 0.4886702001094818, 'eval_accuracy': 0.7770663562281723, 'eval_runtime': 1.8129, 'eval_samples_per_second': 947.67, 'eval_steps_per_second': 29.787, 'epoch': 3.0}


### distilbert for masked

In [13]:
print(distilbert(
    text_column='candidate masked',
    repository_id = "distilbert-base-uncased-0.0.1"
))

DatasetDict({
    train: Dataset({
        features: ['candidate masked', 'label'],
        num_rows: 3455
    })
    test: Dataset({
        features: ['candidate masked', 'label'],
        num_rows: 610
    })
})
Dataset({
    features: ['candidate masked', 'label'],
    num_rows: 1718
})


Map:   0%|          | 0/3455 [00:00<?, ? examples/s]

Map:   0%|          | 0/610 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
20,0.6609,0.628467,0.62623
40,0.5817,0.559814,0.704918
60,0.5505,0.536554,0.72623
80,0.5142,0.530576,0.721311
100,0.5113,0.529294,0.722951
120,0.4516,0.498706,0.739344
140,0.4097,0.478102,0.767213
160,0.4284,0.517635,0.711475
180,0.4486,0.48284,0.760656
200,0.393,0.480264,0.762295


{'eval_loss': 0.4758956730365753, 'eval_accuracy': 0.779976717112922, 'eval_runtime': 1.853, 'eval_samples_per_second': 927.127, 'eval_steps_per_second': 29.141, 'epoch': 3.0}
