![giskard_logo.png](https://raw.githubusercontent.com/Giskard-AI/giskard/main/readme/Logo_full_darkgreen.png)

## Installing giskard and other libraries

In [None]:
!pip install giskard torch torchdata torchtext tqdm

## Connect the external worker in daemon mode

In [None]:
# Start the Giskard worker from the shell; `-d` is the daemon-mode flag referred to in the heading above.
!giskard worker start -d

In [None]:
import torch
import pandas as pd
import torch.nn as nn

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

# Token ids used for sequence start, padding and sequence end, plus the maximum sequence length.
bos_idx = 0
padding_idx = 1
eos_idx = 2
max_seq_len = 256

# Locations of the XLM-R vocabulary and SentencePiece model.
xlmr_vocab_path = "https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = "https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

xlmr_vocab = load_state_dict_from_url(xlmr_vocab_path)

# Raw text -> sub-word pieces -> vocabulary ids -> truncated -> wrapped in BOS/EOS.
text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(xlmr_vocab),
    T.Truncate(max_seq_len - 2),  # keep two slots free for the BOS and EOS tokens added below
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
)


In [None]:
from torchtext.datasets import SST2
from torch.utils.data import DataLoader

# Number of examples grouped into each batch by the datapipes.
batch_size = 16

# Raw SST-2 datapipes for the "train" and "dev" splits.
train_datapipe, dev_datapipe = (SST2(split=name) for name in ("train", "dev"))


# Transform the raw dataset using the non-batched API (i.e. apply the transformation row by row)
def apply_transform(x):
    """Return ``(text_transform(x[0]), x[1])`` for a single raw row ``x``."""
    transformed = text_transform(x[0])
    return transformed, x[1]


# Materialise the raw rows as DataFrames before the datapipes are re-bound to their transformed versions.
dev_dataframe = pd.DataFrame(dev_datapipe, columns=["text", "label"])
train_dataframe = pd.DataFrame(train_datapipe, columns=["text", "label"])

# Training pipeline: transform each row, group rows into batches, then turn every batch into a
# dict with "token_ids" and "target" entries.
train_datapipe = (
    train_datapipe.map(apply_transform)
    .batch(batch_size)
    .rows2columnar(["token_ids", "target"])
)
# batch_size=None turns off the DataLoader's own batching; the datapipe already yields batches.
train_dataloader = DataLoader(train_datapipe, batch_size=None)

# Development pipeline: same steps, with shuffling explicitly disabled in the DataLoader.
dev_datapipe = (
    dev_datapipe.map(apply_transform)
    .batch(batch_size)
    .rows2columnar(["token_ids", "target"])
)
dev_dataloader = DataLoader(dev_datapipe, batch_size=None, shuffle=False)


In [None]:
# Size of the classification head: two output classes, fed with 768-dimensional features.
num_classes = 2
input_dim = 768

from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

classifier_head = RobertaClassificationHead(
    num_classes=num_classes, input_dim=input_dim
)
# `Module.to` returns the module itself, so binding its result keeps the cell from echoing the
# model repr without needing a dummy `print()` as the last statement.
model = XLMR_BASE_ENCODER.get_model(head=classifier_head).to(DEVICE)


In [None]:
import torchtext.functional as F
from torch.optim import AdamW
from tqdm import tqdm

# Optimisation set-up: cross-entropy loss, and AdamW over all model parameters.
learning_rate = 1e-5
criteria = nn.CrossEntropyLoss()
optim = AdamW(params=model.parameters(), lr=learning_rate)


def train_step(input, target):
    """Run one optimisation step on a single batch.

    Args:
        input: batch passed to ``model``.
        target: expected classes, passed to ``criteria`` together with the model output.

    The model is put into training mode first: ``evaluate()`` switches it to eval mode, and
    without switching back every later step would run with train-only layers disabled.
    """
    model.train()
    output = model(input)
    loss = criteria(output, target)
    optim.zero_grad()
    loss.backward()
    optim.step()


def eval_step(input, target):
    """Score one batch.

    Returns:
        A ``(loss, correct)`` pair: the batch loss as a Python float and the number of rows
        whose arg-max prediction equals ``target`` (also a float).
    """
    logits = model(input)
    batch_loss = criteria(logits, target).item()
    hits = (logits.argmax(1) == target).type(torch.float).sum().item()
    return float(batch_loss), hits


def evaluate():
    """Evaluate ``model`` on every batch of ``dev_dataloader``.

    The model is switched to eval mode and gradients are disabled for the whole pass.

    Returns:
        ``(mean_loss, accuracy)``: the average of the per-batch losses, and the number of
        correct predictions divided by the number of targets seen.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0
    n_batches = 0
    with torch.no_grad():
        for batch in dev_dataloader:
            # Pad the token-id lists into one tensor before moving it to the device.
            padded = F.to_tensor(batch["token_ids"], padding_value=padding_idx)
            labels = torch.tensor(batch["target"])
            batch_loss, batch_hits = eval_step(padded.to(DEVICE), labels.to(DEVICE))
            loss_sum += batch_loss
            hits += batch_hits
            seen += len(labels)
            n_batches += 1

    return loss_sum / n_batches, hits / seen


## Fine-tuning the model
⚠️ Fine-tuning the model takes a long time — you can skip this step.

In [None]:
import time

num_epochs = 1

# This can take a very long time!
for e in range(num_epochs):
    # evaluate() leaves the model in eval mode, so switch back to training mode at the start
    # of every epoch; otherwise epochs after the first would train with train-only layers off.
    model.train()
    for batch in tqdm(train_dataloader):
        # Pad the token-id lists into one tensor and move inputs and labels to the device.
        input = F.to_tensor(batch["token_ids"], padding_value=padding_idx).to(DEVICE)
        target = torch.tensor(batch["target"]).to(DEVICE)
        train_step(input, target)

    # Report dev-set loss and accuracy once per epoch.
    loss, accuracy = evaluate()
    print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy))


## Define the pipeline

In [None]:
def preprocess_input(data: pd.DataFrame):
    """Turn the ``text`` column of ``data`` into a padded tensor of token ids.

    The texts are passed as one list through ``text_transform`` and the result is padded
    with ``padding_idx`` by ``F.to_tensor``.
    """
    texts = data.text.tolist()
    token_ids = text_transform(texts)
    return F.to_tensor(token_ids, padding_value=padding_idx)


def postprocess_output(model_output: torch.Tensor):
    """Apply a softmax along dimension 1 of ``model_output``."""
    return torch.softmax(model_output, dim=1)


# Let’s test the pipeline: it should output pairs of probabilities.
# preprocess_input does not move its result to DEVICE, so do it here; otherwise the call fails
# when the model lives on the GPU. Eval mode and no_grad keep the check free of train-only
# behaviour and of autograd bookkeeping.
model.eval()
with torch.no_grad():
    sample_probabilities = postprocess_output(
        model(preprocess_input(dev_dataframe.head()).to(DEVICE))
    )
sample_probabilities


In [None]:
import os

import giskard

url = "http://localhost:9000"  # if Giskard is installed locally (for installation, see: https://docs.giskard.ai/start/guides/installation)
# Read the token from the GISKARD_API_TOKEN environment variable when it is set (a convention of
# this notebook only), so the secret does not have to be saved in the notebook itself.
# Otherwise replace the placeholder below: find your token in the Settings tab of your app.
token = os.environ.get("GISKARD_API_TOKEN", "API Access Token")

client = giskard.GiskardClient(url, token)

try:
    project = client.create_project(
        "tuned_text_classification",
        "Text Classification",
        "Project to classify text with finetuned models",
    )
except Exception:
    # Creation failed (presumably because the project already exists), so fetch it instead.
    # `Exception` rather than a bare `except` so KeyboardInterrupt/SystemExit still propagate.
    project = client.get_project("tuned_text_classification")


In [None]:
# Replace the numeric labels with the class names declared for the Giskard model.
label_names = {0: "Negative", 1: "Positive"}
dev_dataframe["label"] = dev_dataframe["label"].replace(label_names)

# Wrap the dev set as a Giskard dataset with "label" as the categorical target column.
dataset = giskard.Dataset(
    dev_dataframe,
    name="SST2 Data",
    target="label",
    cat_columns=["label"],
)

dataset.upload(client, project_key=project.project_key)

In [None]:
# Wrap the network together with its pre- and post-processing functions for Giskard.
giskard_model = giskard.PyTorchModel(
    model,
    name="SST2-XLMR_BASE_ENCODER",
    model_type="classification",
    feature_names=["text"],
    classification_labels=["Negative", "Positive"],
    data_preprocessing_function=preprocess_input,
    model_postprocessing_function=postprocess_output,
    # this is to let Giskard know that the output of our preprocessing function should be passed directly to the model
    iterate_dataset=False,
)

# Validate the upload against only the rows returned by `head()`.
validation_sample = dataset.slice(lambda x: x.head())
giskard_model.upload(
    client,
    project_key=project.project_key,
    validate_ds=validation_sample,
)
