<a href="https://colab.research.google.com/github/MatthiasRemta/NLP_Project/blob/main/Transformer_classification_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from datasets import Dataset
import torch
import accelerate
import pandas as pd
import regex as re
import string
import numpy as np
import evaluate

In [None]:
pip install transformers

In [None]:
pip install datasets

In [None]:
pip install evaluate

In [2]:
!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/plots_genres_reduced_to_60.pkl

--2023-10-20 14:46:47--  https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/plots_genres_reduced_to_60.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75219388 (72M) [application/octet-stream]
Saving to: ‘plots_genres_reduced_to_60.pkl’


2023-10-20 14:46:48 (181 MB/s) - ‘plots_genres_reduced_to_60.pkl’ saved [75219388/75219388]



The following notebook is based on https://huggingface.co/docs/transformers/tasks/sequence_classification.

In [3]:
# load the data
df_raw = pd.read_pickle('plots_genres_reduced_to_60.pkl')

In [4]:
# Specify mappings (id -> label) and (label -> id)
genres =[]
for row in df_raw['genre']:
  for genre in row:
    genres.append(genre)

unique_genres = []

for item in genres:
    if item not in unique_genres:
        unique_genres.append(item)

label2id = dict([(tuple[1], tuple[0]) for tuple in enumerate(unique_genres)])
id2label = dict([(label2id[key], key) for key in label2id])

In [5]:
# look at the mappings
print(label2id)
print(id2label)

{'drama': 0, 'comedydrama': 1, 'cult': 2, 'horror': 3, 'actionadventure': 4, 'fantasy': 5, 'comedy': 6, 'action': 7, 'silent': 8, 'indie': 9, 'blackandwhite': 10, 'war': 11, 'thriller': 12, 'science fiction': 13, 'noir': 14, 'crime': 15, 'chinese': 16, 'western': 17, 'satire': 18, 'mystery': 19, 'adaptation': 20, 'period piece': 21, 'romance': 22, 'biography': 23, 'adventure': 24, 'history': 25, 'parody': 26, 'bmovie': 27, 'creature': 28, 'political': 29, 'martial arts': 30, 'sports': 31, 'psychological thriller': 32, 'suspense': 33, 'slasher': 34, 'japanese': 35, 'animation': 36, 'lgbt': 37, 'coming of age': 38, 'teen': 39, 'childrens': 40, 'short': 41, 'slapstick': 42, 'childrensfamily': 43, 'family': 44, 'documentary': 45, 'music': 46, 'supernatural': 47, 'musical': 48, 'biopic feature': 49, 'biographical': 50, 'ensemble': 51, 'art': 52, 'bollywood': 53, 'television': 54, 'melodrama': 55, 'spy': 56}
{0: 'drama', 1: 'comedydrama', 2: 'cult', 3: 'horror', 4: 'actionadventure', 5: 'fan

In [7]:
# encode the labels as vector
def labels_to_binary(labels, unique_labels):
    binary_vector = np.zeros(len(unique_labels))
    for label in labels:
        binary_vector[unique_labels[label]] = 1
    return binary_vector


labels = []
for ele in df_raw['genre']:
    labels.append(labels_to_binary(ele, label2id))

df_raw['labels'] = labels

In [8]:
# convert to dataset
df_complete = Dataset.from_pandas(df_raw)

# Neuer Abschnitt

In [9]:
# define tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [10]:
# function for tokenization
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=tokenizer.model_max_length)

In [11]:
# preprocess the plot summaries
df_tokenized = df_complete.map(preprocess_function)

Map:   0%|          | 0/41549 [00:00<?, ? examples/s]

In [12]:
# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
# define metrics
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [14]:
# Create train/test split
df_tokenized = df_tokenized.train_test_split(test_size=0.2)

In [15]:
# define model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    problem_type='multi_label_classification'
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m163.8/258.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [16]:
# check whether cuda is available
torch.cuda.is_available()

True

In [None]:
# ToDo: Specify parameters correctly
# Train/Test split?
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_tokenized["train"],
    eval_dataset=df_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1139,0.115182,0.48185,0.679092,0.113959
