In [2]:
!pip install giskard 



## Connect the external worker in daemon mode

In [3]:
!giskard worker start -d

2023-01-19 14:07:15,834 pid:3361 MainThread giskard.cli  INFO     Starting ML Worker client daemon
2023-01-19 14:07:15,834 pid:3361 MainThread giskard.cli  INFO     Python: /home/gitpod/.pyenv/versions/3.8.16/bin/python3 (3.8.16)
2023-01-19 14:07:15,834 pid:3361 MainThread giskard.cli  INFO     Giskard Home: /home/gitpod/giskard-home


In [4]:
!pip install torch



In [5]:
import torch
import pandas as pd
import torch.nn as nn

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
!pip3 install torch torchdata torchtext



In [7]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)


from torch.utils.data import DataLoader

2023-01-19 14:08:13,866 pid:3203 MainThread torchtext.utils INFO     File /home/gitpod/.cache/torch/text/xlmr.sentencepiece.bpe.model already exists.


In [8]:
from torchtext.datasets import SST2

batch_size = 16

train_datapipe = SST2(split="train")
dev_datapipe = SST2(split="dev")


# Transform the raw dataset using non-batched API (i.e apply transformation line by line)
def apply_transform(x):
    return text_transform(x[0]), x[1]


train_datapipe = train_datapipe.map(apply_transform)
train_datapipe = train_datapipe.batch(batch_size)
train_datapipe = train_datapipe.rows2columnar(["token_ids", "target"])
train_dataloader = DataLoader(train_datapipe, batch_size=None)

dev_datapipe = dev_datapipe.map(apply_transform)
dev_datapipe = dev_datapipe.batch(batch_size)
dev_datapipe = dev_datapipe.rows2columnar(["token_ids", "target"])
dev_dataloader = DataLoader(dev_datapipe, batch_size=None)

In [9]:
num_classes = 2
input_dim = 768

from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
model.to(DEVICE)

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(250002, 768, padding_idx=1)
      (layers): TransformerEncoder(
        (layers): ModuleList(
          (0): TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (linear1): Linear(in_features=768, out_features=3072, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=3072, out_features=768, bias=True)
            (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
          (1): TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj)

In [10]:
import torchtext.functional as F
from torch.optim import AdamW

learning_rate = 1e-5
optim = AdamW(model.parameters(), lr=learning_rate)
criteria = nn.CrossEntropyLoss()


def train_step(input, target):
    output = model(input)
    loss = criteria(output, target)
    optim.zero_grad()
    loss.backward()
    optim.step()


def eval_step(input, target):
    output = model(input)
    loss = criteria(output, target).item()
    return float(loss), (output.argmax(1) == target).type(torch.float).sum().item()


def evaluate():
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    counter = 0
    with torch.no_grad():
        for batch in dev_dataloader:
            input = F.to_tensor(batch["token_ids"], padding_value=padding_idx).to(DEVICE)
            target = torch.tensor(batch["target"]).to(DEVICE)
            loss, predictions = eval_step(input, target)
            total_loss += loss
            correct_predictions += predictions
            total_predictions += len(target)
            counter += 1

    return total_loss / counter, correct_predictions / total_predictions

In [11]:
num_epochs = 1
max_entries = 5
import time
for e in range(num_epochs):
    for ibatch, batch in enumerate(train_dataloader):
        start = time.time()
        input = F.to_tensor(batch["token_ids"], padding_value=padding_idx).to(DEVICE)
        target = torch.tensor(batch["target"]).to(DEVICE)
        train_step(input, target)
        ibatch+=1
        print("ibatch=",ibatch, "time =", time.time() - start)
        if ibatch==max_entries:
            break

    loss, accuracy = evaluate()
    print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy))


ibatch= 1 time = 10.883080959320068
ibatch= 2 time = 7.999751091003418
ibatch= 3 time = 8.39368462562561
ibatch= 4 time = 9.914302110671997
ibatch= 5 time = 14.818337440490723


  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch = [0], loss = [0.6986752705140548], accuracy = [0.4908256880733945]


In [12]:
# Define the prediction function
def predict(dataframe):
    input = F.to_tensor(dataframe["token_ids"], padding_value=padding_idx).to(DEVICE)
    model.eval()
    with torch.no_grad():
        output = model(input)
    return output.argmax(1).tolist()

In [14]:
from giskard import GiskardClient

url = "http://localhost:19000"
token = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsInRva2VuX3R5cGUiOiJBUEkiLCJhdXRoIjoiUk9MRV9BRE1JTiIsImV4cCI6MTY4MTkxMjM1NX0.QqIYJfoxIwrXHwVBfzcLhM4mtvLOL9CKSmhvzSzzO30"
client = GiskardClient(url, token)

text = client.create_project("tuned_text_classification", "Text_Classification", "Project to classify finetuned text")

text.upload_model(
    prediction_function=predict,
    model_type='classification',
    validate_df=dev_datapipe.to_pandas(),
    target='target',
    feature_names=["token_ids"],
    dataset_name='pytorch_model'
)

AttributeError: 'Rows2ColumnarIterDataPipe' object has no attribute 'to_pandas