In [None]:
!pip3 install datasets argilla sentence-transformers

In [None]:
import os
import uuid

import argilla as rg
import pandas as pd
from datasets import Dataset, load_dataset
from numpy import load
from sentence_transformers import SentenceTransformer

In [None]:
# Load the translated Alpaca records from the local JSON file.
dataset = pd.read_json("./translated_german_alpaca.json")

# Attach a random UUID to every row, plus a metadata dict that records the
# translation model and the row's position in the loaded file.
dataset = dataset.assign(
    id=[str(uuid.uuid4()) for _ in range(len(dataset))],
    metadata=[
        {"translation_model": "facebook/wmt19-en-de", "original_id": row_position}
        for row_position in range(len(dataset))
    ],
)

# Convert to a Hugging Face Dataset and show one example row as a sanity check.
ds = Dataset.from_pandas(dataset)
ds[100]

In [None]:
sbert_model = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# device=None lets sentence-transformers choose the device itself (a GPU when one is
# available, otherwise CPU), so this cell also runs on machines without CUDA.
encoder = SentenceTransformer(sbert_model, device=None)

# Embed the three text fields batch-wise; each result is stored in its own
# `vector_<field>` column next to the original text.
ds = ds.map(
    lambda batch: {
        "vector_instruction": encoder.encode(batch["instruction"]),
        "vector_input": encoder.encode(batch["input"]),
        "vector_output": encoder.encode(batch["output"]),
    },
    batch_size=32,
    batched=True
)

In [None]:
def _collect_vectors(record):
    """Bundle the three per-field embeddings of one record under a single `vectors` key."""
    return {
        "vectors": {
            "instruction": record["vector_instruction"],
            "input": record["vector_input"],
            "output": record["vector_output"],
        }
    }


# Argilla's data model expects the embeddings as one dict per record, keyed by vector name.
ds = ds.map(_collect_vectors)

In [None]:
# Rename `instruction` to `_instruction`. The guard keeps the cell re-runnable: once the
# column has been renamed, a second run would otherwise fail because `instruction` no
# longer exists. `input` and `output` keep their names, so they need no mapping entry.
if "instruction" in ds.column_names:
    ds = ds.rename_columns({"instruction": "_instruction"})

# Build Argilla text-classification records whose inputs are the three text fields.
records = rg.DatasetForTextClassification.from_datasets(ds, inputs=["_instruction", "input", "output"])

In [None]:
# Labels annotators can assign to a translated record.
labels = [
    "BAD INSTRUCTION",
    "BAD INPUT",
    "BAD OUTPUT",
    "INAPPROPRIATE",
    "BIASED",
    "ALL GOOD",
]

# Text-classification settings carrying the label schema above.
settings = rg.TextClassificationSettings(
    label_schema=labels,
)

In [None]:
# Convert the Argilla records back to a Hugging Face dataset and upload it to the Hub.
hub_dataset = records.to_datasets()
hub_dataset.push_to_hub("LEL-A/translated_german_alpaca")

In [None]:
# Read the API key from the ARGILLA_API_KEY environment variable so that no credential
# is stored in the notebook; a missing variable fails immediately with a KeyError.
rg.init(
    api_key=os.environ["ARGILLA_API_KEY"],
    api_url="https://lel-a-german-alpaca-test.hf.space"
)

# Apply the label schema held in `settings` to the dataset before logging the records;
# without this call the settings object is never used.
rg.configure_dataset(name="translated_german_alpaca", settings=settings)

# Upload the records in batches of 100.
rg.log(records=records, name="translated_german_alpaca", batch_size=100)