In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [None]:
! pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [None]:
! pip install datasets transformers[sentencepiece]

Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
import evaluate
from datasets import Dataset, Features, Value, ClassLabel

In [None]:
def load_data(file_path):
    """Read a CSV and return only its ``text`` and ``labels`` columns.

    Surrounding whitespace is stripped from every column header before the
    two columns are selected, so a header such as ``" labels "`` still matches.

    Args:
        file_path: Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame holding the ``text`` and ``labels`` columns, in that order.
    """
    wanted_columns = ['text', 'labels']
    frame = pd.read_csv(file_path)
    frame = frame.rename(columns=lambda header: header.strip())
    return frame[wanted_columns]

In [None]:
# Training split as a DataFrame (text/labels columns only); the path is
# resolved relative to the current working directory.
final_training_set = load_data(file_path='final_training_set_fully_balanced_shuffled.csv')

In [None]:
# Validation split as a DataFrame (text/labels columns only); the path is
# resolved relative to the current working directory.
final_validation_set = load_data(file_path='final_validation_set.csv')

In [None]:
# Display names of the three target classes, handed to ClassLabel when the
# datasets are built. Presumably list position == integer id stored in the
# `labels` column; verify against the CSVs.
class_names = [
    'Non-anti-LGBT+ content',
    'Homophobia',
    'Transphobia',
]

In [None]:
def _to_labelled_dataset(frame):
    """Convert a text/labels DataFrame into a Dataset with a typed schema.

    A fresh ``Features`` object is built on every call: ``text`` is a string
    column and ``labels`` is a 3-way ``ClassLabel`` named by ``class_names``.
    """
    schema = Features({
        'text': Value('string'),
        'labels': ClassLabel(num_classes=3, names=class_names),
    })
    return Dataset.from_pandas(frame, features=schema)


# NOTE(review): both names are rebound from DataFrame to Dataset here, so this
# cell cannot be re-run without first re-running the cells that load the CSVs.
final_training_set = _to_labelled_dataset(final_training_set)
final_validation_set = _to_labelled_dataset(final_validation_set)

In [None]:
# Sanity check: show the training dataset's feature names and row count.
print(final_training_set)

Dataset({
    features: ['text', 'labels'],
    num_rows: 120501
})


In [None]:
# Sanity check: show the validation dataset's feature names and row count.
print(final_validation_set)

Dataset({
    features: ['text', 'labels'],
    num_rows: 9964
})


In [None]:
checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Upper bound on tokens kept per example; longer texts are truncated to this.
MAX_SEQ_LENGTH = 128


def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to MAX_SEQ_LENGTH tokens.

    No padding is applied here on purpose: the DataCollatorWithPadding created
    below pads each batch to the length of its own longest sequence. Padding
    every example to the maximum length up front would make the model process
    padding tokens for every short text.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is the list of strings to tokenize.

    Returns:
        The tokenizer's encoding for the batch (un-padded, truncated).
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


tokenized_datasets_train = final_training_set.map(tokenize_function, batched=True)
tokenized_datasets_validation = final_validation_set.map(tokenize_function, batched=True)
# Pads each batch dynamically at collation time.
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/120501 [00:00<?, ? examples/s]

Map:   0%|          | 0/9964 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Store the id <-> name mapping in the model config so that the saved model
# reports the real class names instead of the generic LABEL_0 / LABEL_1 / LABEL_2.
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

# The classification head size is derived from class_names rather than hard-coded.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# NOTE(review): defaults-only configuration. `training_args` is reassigned by
# the fully specified TrainingArguments in the following cell before it is used.
training_args = TrainingArguments(output_dir="test-trainer")

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch (they must use the same strategy for load_best_model_at_end to work).
training_args = TrainingArguments(
    "test-trainer",
    save_total_limit=20,
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    # metric_for_best_model is only honoured when load_best_model_at_end=True.
    # Without it the weights left in memory after training are simply those of
    # the last step, even if validation loss has been rising for many epochs.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower validation loss is better
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Accuracy alone can look good while a minority class is handled poorly, so
    the unweighted mean of the per-class F1 scores is reported alongside it.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is an array whose
            last axis indexes the classes, or a tuple whose first element is
            that array; ``labels`` holds the integer class id of each example.

    Returns:
        dict with ``"accuracy"`` and ``"macro_f1"`` as plain Python floats.
    """
    logits, labels = eval_pred
    # If the predictions arrive as a tuple of outputs, the logits are first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    # Every class considered occurs in the labels or the predictions, so the
    # F1 denominator (2*TP + FP + FN) is always positive.
    per_class_f1 = []
    for class_id in np.union1d(labels, predictions):
        predicted = predictions == class_id
        actual = labels == class_id
        true_pos = np.sum(predicted & actual)
        false_pos = np.sum(predicted & ~actual)
        false_neg = np.sum(~predicted & actual)
        per_class_f1.append(2 * true_pos / (2 * true_pos + false_pos + false_neg))
    macro_f1 = float(np.mean(per_class_f1))

    return {"accuracy": accuracy, "macro_f1": macro_f1}

In [None]:
from transformers import Trainer

# Bundle the model, hyper-parameters, both data splits, the batch collator and
# the metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_validation,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the settings held in `training_args`.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1547,0.359044,0.88599
2,0.082,0.478257,0.890707
3,0.0494,0.497375,0.905861
4,0.0338,0.605533,0.905961
5,0.0268,0.700022,0.903854
6,0.0205,0.638885,0.909675
7,0.0074,0.811135,0.906062
8,0.0073,0.74508,0.909574
9,0.0112,0.823636,0.901345
10,0.0077,0.824136,0.911782


KeyboardInterrupt: 

In [None]:
# Write the trainer's current model to the "final_model" directory.
# NOTE(review): if training was interrupted, these are the weights at the
# point of interruption, not necessarily the best checkpoint.
trainer.save_model(output_dir="final_model")