In [19]:
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset, load_from_disk
#import datasets
from transformers import DataCollatorWithPadding

from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)

### Model parameters
# we will use with Distil-BERT
language_model_name = "distilbert-base-uncased"

### Training Argurments

# this GPU should be enough for this task to handle 32 samples per batch
batch_size = 32

# optim
learning_rate = 1e-4
# weight_decay = 0.001 # we could use e.g. 0.01 in case of very low and very high amount of data for regularization

# For now I'm going to use 0.01
weight_decay = 0.01

# training
epochs = 1
device = "cuda" if torch.cuda.is_available() else "cpu"


set_seed(42)


### Dataset preparation

In [20]:
dataset = load_from_disk('../datasets')

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['item', 'name', 'description', 'type', 'category', 'subcategory', 'label'],
        num_rows: 6251
    })
    validation: Dataset({
        features: ['item', 'name', 'description', 'type', 'category', 'subcategory', 'label'],
        num_rows: 300
    })
})

In [22]:
dataset = dataset.remove_columns(['item', 'name', 'type', 'category', 'subcategory'])

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['description', 'label'],
        num_rows: 6251
    })
    validation: Dataset({
        features: ['description', 'label'],
        num_rows: 300
    })
})

In [39]:
print(f"Description: {dataset['train']['description'][1]}")
print(f"Classification: {dataset['train']['label'][1]}")

Description: American dance-punk band from California
Classification: cultural representative


In [32]:
labels = {'cultural agnostic': 0, 'cultural representative': 1, 'cultural exclusive': 2}

In [33]:
def map_labels(data):
    data['label'] = labels[data['label']]
    return data

train_dataset = dataset['train'].map(map_labels)
val_dataset = dataset['validation'].map(map_labels)

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [36]:
print(train_dataset)

Dataset({
    features: ['description', 'label'],
    num_rows: 6251
})


In [40]:
print(f"Description: {train_dataset['description'][1]}")
print(f"Classification: {train_dataset['label'][1]}")

Description: American dance-punk band from California
Classification: 1


### Metrics
To evaluate the model we use:
- Accuracy metric
- F1 score

In [61]:
import evaluate

def compute_metrics(eval_pred):
    load_accuracy = evaluate.load('accuracy')
    load_f1 = evaluate.load('f1')

    logits, labels = eval_pred
    preidctions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=preidctions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=preidctions, references=labels, average='macro')['f1']

    return {"accuracy": accuracy, "f1": f1}

### The Model

In [44]:
model = AutoModelForSequenceClassification.from_pretrained(language_model_name,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, 
                                                                   output_hidden_states=False,
                                                                   num_labels=3) # number of the classes
# Load the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples["description"], padding=True, truncation=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# Tokenize the dataset
tokenized_training_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

In [47]:
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [46]:
training_args = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate,                  # learning rate
    report_to="none"
    #logging_dir="sentiment_analysis_logs"         # use it later to get the training curves
)

In [63]:
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_training_dataset,
   eval_dataset=tokenized_val_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [50]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=196, training_loss=0.7890559605189732, metrics={'train_runtime': 1416.0081, 'train_samples_per_second': 4.415, 'train_steps_per_second': 0.138, 'total_flos': 103508559520128.0, 'train_loss': 0.7890559605189732, 'epoch': 1.0})

In [64]:
trainer.evaluate()

{'eval_loss': 0.670707643032074,
 'eval_model_preparation_time': 0.0027,
 'eval_accuracy': 0.7,
 'eval_f1': 0.6778136685196451,
 'eval_runtime': 8.1633,
 'eval_samples_per_second': 36.75,
 'eval_steps_per_second': 4.655}