In [2]:
import numpy as np
import pandas as pd
import torch
import evaluate
import wandb
import random
import transformers
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from google.colab import userdata # hf token import from secrets

In [3]:
wandb.init(mode='disabled')
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
transformers.set_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [7]:
def e5model(text_column, repository_id):
    tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def preprocess_function(examples):
        return tokenizer(examples[text_column], truncation=True)

    # data
    columns = [text_column] + ['label']
    df_train = pd.read_csv("train.csv")
    df_train = df_train[columns]

    df_test = pd.read_csv("test.csv")
    df_test = df_test[columns]

    train_dataset = Dataset.from_pandas(df_train).train_test_split(test_size=0.15)
    test_dataset = Dataset.from_pandas(df_test)

    print(train_dataset)
    print(test_dataset)

    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

    # load model
    model = AutoModelForSequenceClassification.from_pretrained(
        "intfloat/e5-base-v2", num_labels=2, id2label=id2label, label2id=label2id
    )

    model = model.to(device)
    print(model)

    training_args = TrainingArguments(
        output_dir="my_awesome_model",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        eval_steps=20,
        logging_steps=20,
        weight_decay=0.01,
        eval_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        report_to="tensorboard",
        push_to_hub=True,
        hub_strategy="every_save",
        hub_model_id=repository_id,
        hub_token=userdata.get('hf'),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset['train'],
        eval_dataset=tokenized_train_dataset['test'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer.evaluate(eval_dataset=tokenized_test_dataset)

### e5model for unmasked

In [8]:
print(e5model(
    text_column='candidate',
    repository_id = "e5-base-v2-0.0.1"
))

DatasetDict({
    train: Dataset({
        features: ['candidate', 'label'],
        num_rows: 3455
    })
    test: Dataset({
        features: ['candidate', 'label'],
        num_rows: 610
    })
})
Dataset({
    features: ['candidate', 'label'],
    num_rows: 1718
})


Map:   0%|          | 0/3455 [00:00<?, ? examples/s]

Map:   0%|          | 0/610 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
20,0.6573,0.637423,0.62623
40,0.5834,0.562361,0.7
60,0.5341,0.508242,0.742623
80,0.476,0.512596,0.747541
100,0.4617,0.524664,0.744262
120,0.4235,0.470603,0.777049
140,0.3451,0.435718,0.783607
160,0.3685,0.456608,0.77541
180,0.377,0.465543,0.772131
200,0.3222,0.425591,0.790164


{'eval_loss': 0.4631277918815613, 'eval_accuracy': 0.7980209545983702, 'eval_runtime': 3.547, 'eval_samples_per_second': 484.353, 'eval_steps_per_second': 15.224, 'epoch': 3.0}


### e5model for masked

In [9]:
print(e5model(
    text_column='candidate masked',
    repository_id = "e5-base-v2-0.0.1"
))

DatasetDict({
    train: Dataset({
        features: ['candidate masked', 'label'],
        num_rows: 3455
    })
    test: Dataset({
        features: ['candidate masked', 'label'],
        num_rows: 610
    })
})
Dataset({
    features: ['candidate masked', 'label'],
    num_rows: 1718
})


Map:   0%|          | 0/3455 [00:00<?, ? examples/s]

Map:   0%|          | 0/610 [00:00<?, ? examples/s]

Map:   0%|          | 0/1718 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
20,0.6598,0.643867,0.62623
40,0.6159,0.602318,0.659016
60,0.5689,0.545439,0.72623
80,0.5109,0.556734,0.721311
100,0.4836,0.524924,0.74918
120,0.4072,0.499174,0.765574
140,0.3861,0.474586,0.778689
160,0.4115,0.531264,0.739344
180,0.408,0.501785,0.757377
200,0.3355,0.502175,0.757377


{'eval_loss': 0.465643048286438, 'eval_accuracy': 0.7986030267753201, 'eval_runtime': 3.5713, 'eval_samples_per_second': 481.059, 'eval_steps_per_second': 15.121, 'epoch': 3.0}
