<a href="https://colab.research.google.com/github/MatthiasRemta/NLP_Project/blob/main/Transformer_classification_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from datasets import Dataset
import torch
import accelerate
import pandas as pd
import regex as re
import string
import numpy as np
import evaluate

In [6]:
#pip install transformers

In [7]:
#pip install datasets

In [8]:
#pip install evaluate

In [9]:
#!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/plots_genres_reduced_to_60.pkl

The following notebook is based on https://huggingface.co/docs/transformers/tasks/sequence_classification.

In [10]:
# load the data
#df_raw = pd.read_pickle('plots_genres_reduced_to_60.pkl')
# Forward slashes keep the relative path valid on Linux/macOS/Colab as well as
# Windows; the previous backslash form relied on invalid escape sequences
# ('\M', '\p') and only resolved on Windows.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
df_raw = pd.read_pickle('Data/MovieSummaries/plots_genres_balanced.pkl')

In [11]:
# Specify mappings (id -> label) and (label -> id)
# Flatten the per-movie genre lists into one list of genre names.
genres = [genre for row in df_raw['genre'] for genre in row]

# dict.fromkeys de-duplicates in linear time while keeping first-occurrence
# order, so every genre receives the same id as with a manual membership loop.
unique_genres = list(dict.fromkeys(genres))

# label -> id follows the order of first appearance; id -> label is its inverse.
label2id = {genre: idx for idx, genre in enumerate(unique_genres)}
id2label = {idx: genre for genre, idx in label2id.items()}

In [12]:
# Show both lookup tables: label -> id first, then id -> label.
for mapping in (label2id, id2label):
    print(mapping)

{'cult': 0, 'horror': 1, 'short': 2, 'biography': 3, 'documentary': 4, 'thriller': 5, 'melodrama': 6, 'adventure': 7, 'supernatural': 8, 'mystery': 9, 'drama': 10, 'suspense': 11, 'action': 12, 'indie': 13, 'childrensfamily': 14, 'fantasy': 15, 'war': 16, 'childrens': 17, 'psychological thriller': 18, 'crime': 19, 'romance': 20, 'family': 21, 'actionadventure': 22, 'chinese': 23, 'comedy': 24, 'animation': 25, 'period piece': 26, 'biopic feature': 27, 'biographical': 28, 'silent': 29, 'comedydrama': 30, 'adaptation': 31, 'science fiction': 32, 'martial arts': 33, 'lgbt': 34, 'japanese': 35, 'teen': 36, 'creature': 37, 'slasher': 38, 'political': 39, 'television': 40, 'musical': 41, 'history': 42, 'coming of age': 43, 'noir': 44, 'sports': 45, 'bmovie': 46, 'blackandwhite': 47, 'spy': 48, 'music': 49, 'slapstick': 50, 'bollywood': 51, 'art': 52, 'western': 53, 'ensemble': 54, 'satire': 55, 'parody': 56}
{0: 'cult', 1: 'horror', 2: 'short', 3: 'biography', 4: 'documentary', 5: 'thriller'

In [13]:
# encode the labels as vector
def labels_to_binary(labels, unique_labels):
    """Build a multi-hot vector for one sample.

    Args:
        labels: iterable of label names attached to the sample.
        unique_labels: mapping from label name to its vector position.

    Returns:
        A float NumPy array of length ``len(unique_labels)`` holding 1 at the
        position of every given label and 0 everywhere else.

    Raises:
        KeyError: if a label is missing from ``unique_labels``.
    """
    positions = [unique_labels[name] for name in labels]
    multi_hot = np.zeros(len(unique_labels))
    if positions:
        multi_hot[positions] = 1
    return multi_hot


# One multi-hot vector per row of df_raw, in row order.
labels = [
    labels_to_binary(movie_genres, label2id)
    for movie_genres in df_raw['genre']
]

# Store the encoded targets next to the raw genre lists.
df_raw['labels'] = labels

In [14]:
# Convert the pandas DataFrame into a `datasets.Dataset` so it can be
# tokenized with `.map` and split into train/test further down.
df_complete = Dataset.from_pandas(df_raw)

# Tokenization, model and training

In [15]:
# define tokenizer
# Load the tokenizer belonging to the "distilbert-base-uncased" checkpoint,
# the same checkpoint name that is used for the model below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Register '[PAD]' as the padding token. The recorded cell output is 0 -
# presumably meaning no new token had to be added (TODO confirm).
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

0

In [16]:
# function for tokenization
def preprocess_function(examples):
    """Tokenize the ``'text'`` field of one example (or one batch of examples).

    Inputs longer than the tokenizer's maximum length are truncated. No
    padding is applied here: the ``DataCollatorWithPadding`` handed to the
    ``Trainer`` pads each batch to the length of its longest member, which
    avoids storing and processing every summary at the full maximum length.

    Args:
        examples: a row (or batch of rows) exposing a ``'text'`` entry.

    Returns:
        The tokenizer output for ``examples['text']``.
    """
    return tokenizer(examples['text'], truncation=True)

In [17]:
# preprocess the plot summaries
# batched=True passes many rows per call, so the tokenizer processes whole
# lists of texts at once; the resulting columns are the same as when mapping
# row by row, only produced faster.
df_tokenized = df_complete.map(preprocess_function, batched=True)

Map:   0%|          | 0/19494 [00:00<?, ? examples/s]

In [18]:
# Create data collator
# Built from the tokenizer so batches are padded with its padding settings;
# it is handed to the Trainer via `data_collator=` further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
# define metrics
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro-averaged ROC AUC and accuracy.

    Args:
        predictions: raw model logits, expected shape (batch_size, num_labels).
        labels: binary indicator targets with the same shape as ``predictions``.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``.
    """
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    # and move the result to NumPy so the remaining steps are plain array ops
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).detach().cpu().numpy()
    # next, use threshold to turn them into 0/1 predictions
    y_pred = (probs >= threshold).astype(float)
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: it needs the continuous probabilities.
    # Feeding it thresholded 0/1 predictions collapses the curve to one point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    """Adapt an ``EvalPrediction`` to ``multi_label_metrics``.

    When ``p.predictions`` is a tuple only its first element is scored;
    otherwise ``p.predictions`` is used as is. ``p.label_ids`` supplies the
    reference labels.
    """
    raw_output = p.predictions
    if isinstance(raw_output, tuple):
        raw_output = raw_output[0]
    return multi_label_metrics(predictions=raw_output, labels=p.label_ids)

In [20]:
# Create train/test split
# A fixed seed makes the 80/20 split identical on every run, so evaluation
# numbers stay comparable across kernel restarts.
df_tokenized = df_tokenized.train_test_split(test_size=0.2, seed=42)

In [21]:
# define model
# Load the "distilbert-base-uncased" checkpoint with a sequence-classification
# head sized to the number of genres. The id2label/label2id mappings are passed
# along with the model, and problem_type marks the task as multi-label.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    problem_type='multi_label_classification'
    )

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
#pip install accelerate -U

In [23]:
# check whether cuda is available
# (the recorded output of this cell is True)
torch.cuda.is_available()

True

In [37]:
# Show the installed transformers and accelerate versions for reference.
transformers.__version__, accelerate.__version__

('4.32.1', '0.23.0')

In [41]:
# TODO: Specify parameters correctly
# TODO: Train/Test split?
# NOTE(review): the recorded output of this cell is an ImportError asking for
# accelerate>=0.20.1, although the version cell above reports accelerate
# 0.23.0 - presumably the kernel was not restarted after installing; confirm.

# Training configuration: checkpoints are written to "model", evaluation and
# saving are both set to "epoch", and the best checkpoint is reloaded at the end.
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Wire together the model, the train/test splits, the tokenizer, the padding
# collator and the multi-label metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_tokenized["train"],
    eval_dataset=df_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start fine-tuning.
trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`