In [2]:
from datasets import load_dataset

# Download the e-mail text-classification dataset from the Hugging Face Hub.
# No split is requested, so every published split comes back in one DatasetDict.
dataset = load_dataset(path="hossein20s/enrun-emails-text-classification")
# Last expression of the cell: shows the splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 14459
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 3104
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 3099
    })
})

In [3]:
# Materialise the train split as a pandas DataFrame for quick inspection.
df = dataset["train"].to_pandas()
# Distinct values found in the label column (displayed as the cell output).
df.loc[:, "label"].unique()

array([0, 1], dtype=int64)

In [4]:
# Keep only the first 500 rows of the frame, then display the result.
df = df.head(n=500)
df

Unnamed: 0,label,text
0,0,Thanks !! I am so ready for a good homegrown t...
1,1,Gail Livingston Mills Attorney at Law Direct :...
2,0,"It would be nice , but not all mobile devices ..."
3,1,"Anthony ( "" Tony "" ) Harlow Attorney at Law Di..."
4,0,Iâ ?? ve attached a PDF of a working draft of ...
...,...,...
495,1,"Sindy Rousseau District Manager , Northern CA ..."
496,0,"Vic , Neil asked me to relay the message to yo..."
497,1,"Paul W. Smail , Esq. . Associate Attorney Pers..."
498,0,Updated spreadsheet attached and updated numbe...


In [5]:
# Carve a 20% hold-out set from the Hub's train split. The resulting
# DatasetDict contains only "train" and "test"; the other splits that were
# downloaded are no longer reachable through `dataset` after this rebinding.
# A fixed seed keeps the shuffle identical across kernel restarts so that the
# evaluation numbers are comparable between runs.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 11567
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 2892
    })
})

In [6]:
# Hub identifier of the pretrained checkpoint; both the tokenizer and the
# sequence-classification model in this notebook are loaded from it.
checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [8]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of ``sample`` with the notebook's tokenizer.

    Truncation is enabled so over-long inputs are cut down by the tokenizer;
    no padding is requested here.

    Args:
        sample: Mapping that exposes the raw text under the ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    raw_text = sample["text"]
    encoded = tokenizer(raw_text, truncation=True)
    return encoded

In [9]:
# Apply `tokenize` to every split; batched=True hands it batches of rows
# instead of one row at a time.
tokenized_dataset = dataset.map(function=tokenize, batched=True)

Map:   0%|          | 0/11567 [00:00<?, ? examples/s]

Map:   0%|          | 0/2892 [00:00<?, ? examples/s]

In [10]:
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="trainer",  # checkpoints and logs are written here
    eval_strategy="epoch",  # evaluate at the end of every epoch
    # Save on the same schedule as evaluation: load_best_model_at_end requires
    # the save and eval strategies to match.
    save_strategy="epoch",
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    # metric_for_best_model only picks a checkpoint when best-model loading is
    # enabled, so turn it on; "f1" is the key returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    num_train_epochs=1,
    # 1e-3 is far too aggressive for fine-tuning a pretrained transformer and
    # tends to destroy the pretrained weights; use a conventional small rate.
    learning_rate=2e-5,
)




In [11]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head sized for two labels,
# matching the two label values present in the dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
    num_labels=2,
)

In [12]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; the tokenize step applies no padding,
# so padding is left to this object when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer)

In [13]:
import evaluate
import numpy as np

# Load the F1 metric implementation from the `evaluate` library.
metric = evaluate.load(path="f1")

In [14]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Two-element iterable that unpacks into the model's raw
            scores and the reference labels.

    Returns:
        The result of ``metric.compute`` called with ``average="weighted"``.
    """
    raw_scores, gold_labels = eval_pred
    # The index of the largest score along the last axis is the predicted class.
    predicted_labels = np.argmax(raw_scores, axis=-1)
    f1_arguments = {
        "predictions": predicted_labels,
        "references": gold_labels,
        "average": "weighted",
    }
    return metric.compute(**f1_arguments)

In [15]:
from transformers import Trainer

# Wire the model, hyper-parameters, tokenized splits, batching and metric
# computation into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [16]:
import torch

In [17]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
# (The former stand-alone `torch.cuda.is_available()` call was a dead
# statement: not the last expression of the cell, so never displayed, and its
# result was discarded.)
device = "cuda" if torch.cuda.is_available() else "cpu"

In [18]:
# Display the device string that was selected.
device

'cpu'

In [19]:
# Move the model to the selected device. As the last expression of the cell,
# the returned module is displayed, which prints the full architecture.
# NOTE(review): the Trainer was already constructed around this model and may
# manage device placement itself — confirm this explicit move is needed.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Start fine-tuning with the configured Trainer; the value returned by
# train() is the cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss
