In [2]:
import pandas as pd
from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
import numpy as np


# Load the labelled e-mail CSV and normalise the two columns used below:
# both are coerced to str, with missing text replaced by "" and missing
# labels replaced by the "General Inquiry" class.
df = pd.read_csv("enron_emails_labeled.csv")
for column_name, fill_value in (("text", ""), ("label", "General Inquiry")):
    df[column_name] = df[column_name].fillna(fill_value).astype(str)

# Sorting the class names gives a deterministic name order for ClassLabel.
unique_labels = sorted(df["label"].unique())
label_feature = ClassLabel(names=unique_labels)

# Build the Hugging Face dataset and cast the string label column to the
# ClassLabel feature defined above.
ds = Dataset.from_pandas(df).cast_column("label", label_feature)


# Tokenizer loaded from the same "roberta-base" checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch of examples.

    Sequences are truncated by the tokenizer but deliberately left unpadded.
    Padding inside a batched ``map`` would stretch every example to the
    longest sequence of its whole map chunk, inflating the stored dataset and
    the per-step compute. Because the tokenizer is handed to ``Trainer``, its
    default collator pads each training/evaluation batch dynamically instead.

    Args:
        batch: Mapping with a ``"text"`` entry holding a list of strings
            (what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(batch["text"], truncation=True)


tokenized_ds = ds.map(tokenize, batched=True)

# Hold out 10% of the examples for validation. The fixed seed makes the
# shuffle/split reproducible, so validation metrics and the "best" checkpoint
# are comparable between runs instead of depending on a fresh random split.
train_val_split = tokenized_ds.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

# Number of distinct classes; sizes the classification head.
num_labels = len(unique_labels)


# Record the class names in the model config. Without these mappings the
# saved model only knows placeholder names ("LABEL_0", "LABEL_1", ...), so
# anything loading the exported classifier could not report real labels.
# The ids follow the order of `unique_labels`, which is also the order used
# to build the ClassLabel feature for the dataset.
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Training length and per-device batch sizes.
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # After training, reload the checkpoint that scored best on "accuracy".
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Empty list: no reporting integrations are requested.
    report_to=[],
)

def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``. The
            predicted class is the argmax of ``predictions`` along axis 1
            (presumably per-class logits of shape (n_samples, n_classes) --
            confirm against the model output).

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` (F1 averaged with
        ``average='weighted'``).
    """
    gold_ids = pred.label_ids
    predicted_ids = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(gold_ids, predicted_ids),
        "f1": f1_score(gold_ids, predicted_ids, average='weighted'),
    }

# Wire the model, training configuration, data splits, tokenizer and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the fine-tuning loop.
trainer.train()

# Write the resulting model and the tokenizer into the same directory.
export_dir = "./email_classifier_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

RuntimeError: Only a single TORCH_LIBRARY can be used to register the namespace prims; please put all of your definitions in a single TORCH_LIBRARY block.  If you were trying to specify implementations, consider using TORCH_LIBRARY_IMPL (which can be duplicated).  If you really intended to define operators for a single namespace in a distributed way, you can use TORCH_LIBRARY_FRAGMENT to explicitly indicate this.  Previous registration of TORCH_LIBRARY was registered at /dev/null:228; latest registration was registered at /dev/null:228