In [None]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from sklearn.svm import SVC
import numpy as np
from datasets import load_metric

In [None]:
# Notebook configuration: edit these two locations for your environment.
MODEL_NAME = "model_path/"  # checkpoint location handed to from_pretrained() for tokenizer and model
src_path = "dataset_path/"  # prefix glued directly in front of the CSV file names, so keep the trailing slash


In [None]:
# The CSV paths are formed by prefixing src_path to the file names
# (no separator is inserted, so src_path must already end with one).
train_file = f"{src_path}train.csv"
test_file = f"{src_path}test.csv"

# Load both CSV files in one call; each file is registered under its split name.
dataset = load_dataset('csv', data_files=dict(train=train_file, test=test_file))

In [None]:
# Encoder and tokenizer are both restored from the same checkpoint location
# so that the vocabulary matches the model weights.
model = BertModel.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def get_embeddings(sentences, batch_size=32, max_length=512):
    """Encode sentences with the notebook's global tokenizer/model and return their [CLS] vectors.

    Sentences are processed in mini-batches and padded only up to the longest
    sequence of each batch, instead of padding every single sentence to
    ``max_length`` and running one forward pass per sentence.

    Args:
        sentences: Iterable of texts to embed (materialized into a list first).
        batch_size: Number of sentences per forward pass; must be >= 1.
        max_length: Truncation limit in tokens handed to the tokenizer.
            Change according to the model in use.

    Returns:
        A list with one 1-D numpy array per input sentence, in input order,
        taken from position 0 of ``last_hidden_state`` (the [CLS] token).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so that any iterable can be sliced into batches.
    sentences = list(sentences)

    embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # padding=True pads to the longest sequence in this batch only.
        # NOTE(review): padded positions are expected to be ignored through the
        # attention mask the tokenizer returns — confirm for the model in use.
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of every sequence is the [CLS] token; iterating the
        # (batch, hidden) array yields one 1-D vector per sentence.
        cls_vectors = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.extend(cls_vectors)
    return embeddings

# Embed the 'text' column of each split, train first and then test.
train_embeddings, test_embeddings = [
    get_embeddings(dataset[split]['text']) for split in ('train', 'test')
]

In [None]:
# Train an RBF-kernel support vector classifier on the training embeddings.
# fit() hands back the estimator itself, so creation and training are chained.
svm = SVC(kernel='rbf').fit(train_embeddings, dataset['train']['label'])

# Predict a label for every test embedding.
predictions = svm.predict(test_embeddings)

In [None]:
# Load the evaluation metrics (one metric object per name, in this order).
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm that the
# installed version still provides it.
accuracy, f1, recall, precision = [
    load_metric(metric_name)
    for metric_name in ("accuracy", "f1", "recall", "precision")
]

def compute_metrics_svm(predictions, labels):
    """Score predictions against reference labels with the notebook's global metrics.

    Args:
        predictions: Predicted labels, forwarded as ``predictions=``.
        labels: Ground-truth labels, forwarded as ``references=``.

    Returns:
        A dict with the keys "accuracy", "f1", "recall" and "precision"
        (in that order); each value is whatever the matching metric's
        ``compute()`` call returns. F1, recall and precision are requested
        with ``average='macro'``.
    """
    # Accuracy takes no averaging argument, so it is computed on its own.
    results = {
        "accuracy": accuracy.compute(predictions=predictions, references=labels),
    }
    # The remaining metrics share the exact same macro-averaged call.
    for key, metric in (("f1", f1), ("recall", recall), ("precision", precision)):
        results[key] = metric.compute(
            predictions=predictions, references=labels, average='macro'
        )
    return results

In [None]:
# Score the SVM predictions against the labels of the test split and show the result.
metrics = compute_metrics_svm(
    predictions=predictions,
    labels=dataset['test']['label'],
)

print(metrics)