In [1]:
from datasets import load_dataset

dataset = load_dataset("dair-ai/emotion")

No config specified, defaulting to: emotion/split
Found cached dataset emotion (C:/Users/Matheus/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
train_ds = dataset['train'].to_pandas().head(16)
test_ds = dataset['test'].to_pandas().head(16)
val_ds = dataset['validation'].to_pandas().head(16)

In [63]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("sagui-nlp/debertinha-ptbr-xsmall")
# model = AutoModelForSequenceClassification.from_pretrained("sagui-nlp/debertinha-ptbr-xsmall", num_labels=train_ds['label'].nunique())

In [65]:
tokenizer('olá')['input_ids']

[50269, 11, 64, 73, 50266]

In [4]:
# create a class to encode the text

import torch

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = tokenizer.batch_encode_plus(data['text'].tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        self.data['labels'] = data['label']
    
    def __len__(self):
        return len(self.data['input_ids'])
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.data.items()}

train_dataset = TextDataset(train_ds)
val_dataset = TextDataset(val_ds)
test_dataset = TextDataset(test_ds)

In [5]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

### MLFlow Setup

In [11]:
import mlflow
from mlflow import MlflowClient

# start the mlflow server
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# mlflow server --host 127.0.0.1 --port 8080
client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

# create a new experiment

experiment_tags = {
    "project_name": "emotion-classification-debertinha",
    "team": "nlp",
    "mlflow.note.content": "emotion classification with debertinha",
}

experiment_id = client.create_experiment(name="emotion-classification", tags=experiment_tags)

experiment = mlflow.set_experiment("emotion-classification")

run_name = "emotion-classification-debertinha-run"

artifact_path = "emotion-classification-debertinha"

# create a new run
# run = client.create_run(experiment_id)

In [15]:
epochs = 5
lr = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

criteria = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
model = model.to(device)

train_loss = []
val_loss = []

for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criteria(outputs.logits, labels)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    train_loss.append(epoch_loss / len(train_loader))

    model.eval()
    with torch.no_grad():
        eval_loss = 0
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criteria(outputs.logits, labels)
            eval_loss += loss.item()

        val_loss.append(eval_loss / len(val_loader))

    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item():.4f}")


Epoch 1/5 - Loss: 1.7073
Epoch 2/5 - Loss: 1.6896
Epoch 3/5 - Loss: 1.6700
Epoch 4/5 - Loss: 1.6454
Epoch 5/5 - Loss: 1.6252


In [29]:
import os

with mlflow.start_run(run_name=run_name, experiment_id=experiment_id) as run:
    mlflow.log_params({
        "epochs": epochs,
        "lr": lr,
        "batch_size": 8,
        "device": device.type
    })

    for index, loss in enumerate(train_loss):
        mlflow.log_metric("train_loss", loss, step=index)
        mlflow.log_metric("val_loss", val_loss[index], step=index)

    train_ds.to_csv("data/train.csv", index=False)
    val_ds.to_csv("data/val.csv", index=False)
    test_ds.to_csv("data/test.csv", index=False)

    mlflow.log_artifact("data/train.csv", artifact_path + "/data")
    mlflow.log_artifact("data/val.csv", artifact_path + "/data")
    mlflow.log_artifact("data/test.csv", artifact_path + "/data")

    #save tokenizer
    tokenizer.save_pretrained(artifact_path + "/tokenizer")

    for file in os.listdir(artifact_path + "/tokenizer"):
        mlflow.log_artifact(artifact_path + "/tokenizer/" + file, artifact_path + "/tokenizer")

    # delete the tokenizer folder
    os.system(f"rm -rf {artifact_path}/tokenizer")

    #mlflow.pytorch.save_model(model, artifact_path)
    
    mlflow.pytorch.log_model(model, artifact_path + "/model")



In [18]:
artifact_path

'emotion-classification-debertinha'

### Predict with model

In [53]:
transformer_model = {"model": model, "tokenizer": tokenizer}
task = "text-classification"

with mlflow.start_run(run_name=run_name, experiment_id=experiment_id) as run:
    model_info = mlflow.transformers.log_model(
        transformers_model=transformer_model,
        artifact_path="text_classifier",
        task=task,
    )




In [54]:
logged_model = 'runs:/bf5f21c3e1764f8d86fe6f6126633d70/text_classifier'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(pd.DataFrame({"text": ["Eu estou muito feliz", "Estou triste", "Estou com raiva"]}))

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2024/02/24 22:10:57 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Unnamed: 0,label,score
0,LABEL_2,0.196948
1,LABEL_0,0.199091
2,LABEL_2,0.193724


In [58]:
loaded_pipeline = mlflow.transformers.load_model(
    model_info.model_uri, return_type="pipeline"
)

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2024/02/24 22:13:10 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
2024/02/24 22:13:44 INFO mlflow.transformers: 'runs:/bf5f21c3e1764f8d86fe6f6126633d70/text_classifier' resolved as 'mlflow-artifacts:/423481557471868563/bf5f21c3e1764f8d86fe6f6126633d70/artifacts/text_classifier'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



In [70]:
tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512,'return_tensors':'pt'}

In [73]:
type(loaded_pipeline)

transformers.pipelines.text_classification.TextClassificationPipeline

In [90]:
kwargs = {"truncation": True, "padding": "max_length", "max_length": 1}
result = loaded_pipeline(['olá', 'ok'], **kwargs)
result

[{'label': 'LABEL_0', 'score': 0.19527024030685425},
 {'label': 'LABEL_0', 'score': 0.1908813863992691}]

In [94]:
# compare directly inference

_data = tokenizer.batch_encode_plus(['olá', 'ok'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
outputs = model(**_data)

#print probability of each class
outputs.logits.softmax(dim=1)

tensor([[0.1953, 0.1893, 0.1502, 0.1853, 0.1468, 0.1331],
        [0.1909, 0.1878, 0.1550, 0.1841, 0.1489, 0.1333]],
       grad_fn=<SoftmaxBackward0>)