<a href="https://colab.research.google.com/github/MatthiasRemta/NLP_Project/blob/main/Transformer_classification_v6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# for Google-Colab

# packages
!pip install transformers &> /dev/null
!pip install datasets &> /dev/null
!pip install evaluate &> /dev/null
!pip install accelerate -U &> /dev/null
!pip install huggingface_hub &> /dev/null

In [2]:
# if you want to upload models to huggingface
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# datasets
!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/train_plots_genres_reduced_to_60.pkl &> /dev/null
!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/test_plots_genres_reduced_to_60.pkl &> /dev/null

!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/train_plots_genres_balanced.pkl &> /dev/null
!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/test_plots_genres_balanced.pkl &> /dev/null

In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EvalPrediction
from transformers import TextClassificationPipeline
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report
from datasets import Dataset
import torch
import accelerate
import pandas as pd
import regex as re
import string
import numpy as np
import evaluate
import pickle
import os

The following notebook is based on https://huggingface.co/docs/transformers/tasks/sequence_classification.

In [33]:
# load the data
df_train_raw = pd.read_pickle('train_plots_genres_reduced_to_60.pkl')
df_test_raw = pd.read_pickle('test_plots_genres_reduced_to_60.pkl')
#df_train_raw = pd.read_pickle('train_plots_genres_balanced.pkl')
#df_test_raw = pd.read_pickle('test_plots_genres_balanced.pkl')

In [34]:
# Specify mappings (id -> label) and (label -> id)
genres =[]
for row in df_train_raw['genre']:
  for genre in row:
    genres.append(genre)

unique_genres = []

for item in genres:
    if item not in unique_genres:
        unique_genres.append(item)

label2id = dict([(tuple[1], tuple[0]) for tuple in enumerate(unique_genres)])
id2label = dict([(label2id[key], key) for key in label2id])

In [35]:
# look at the mappings
print(label2id)
print(id2label)

{'romance': 0, 'drama': 1, 'blackandwhite': 2, 'biopic feature': 3, 'thriller': 4, 'war': 5, 'comedy': 6, 'coming of age': 7, 'comedydrama': 8, 'indie': 9, 'silent': 10, 'lgbt': 11, 'crime': 12, 'horror': 13, 'actionadventure': 14, 'action': 15, 'adventure': 16, 'martial arts': 17, 'fantasy': 18, 'western': 19, 'period piece': 20, 'adaptation': 21, 'science fiction': 22, 'parody': 23, 'satire': 24, 'biography': 25, 'short': 26, 'family': 27, 'musical': 28, 'animation': 29, 'television': 30, 'bollywood': 31, 'bmovie': 32, 'mystery': 33, 'chinese': 34, 'documentary': 35, 'childrensfamily': 36, 'childrens': 37, 'teen': 38, 'psychological thriller': 39, 'noir': 40, 'japanese': 41, 'suspense': 42, 'supernatural': 43, 'spy': 44, 'cult': 45, 'slasher': 46, 'melodrama': 47, 'art': 48, 'political': 49, 'history': 50, 'biographical': 51, 'sports': 52, 'creature': 53, 'music': 54, 'ensemble': 55, 'slapstick': 56}
{0: 'romance', 1: 'drama', 2: 'blackandwhite', 3: 'biopic feature', 4: 'thriller', 5

In [36]:
# encode the labels as vector
def labels_to_binary(labels, unique_labels):
    binary_vector = np.zeros(len(unique_labels))
    for label in labels:
        binary_vector[unique_labels[label]] = 1
    return binary_vector


labels = []
for ele in df_train_raw['genre']:
    labels.append(labels_to_binary(ele, label2id))

df_train_raw['labels'] = labels

labels = []
for ele in df_test_raw['genre']:
    labels.append(labels_to_binary(ele, label2id))

df_test_raw['labels'] = labels

In [37]:
# convert to dataset
df_train = Dataset.from_pandas(df_train_raw)
df_test = Dataset.from_pandas(df_test_raw)

In [10]:
# define tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased",
                                          truncation=True,
                                          padding='max_length',
                                          max_length=512)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

0

In [None]:
# function for tokenization
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=tokenizer.model_max_length)

In [None]:
# preprocess the plot summaries
df_train_tokenized = df_train.map(preprocess_function)

Map:   0%|          | 0/33239 [00:00<?, ? examples/s]

In [None]:
# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# define metrics
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [None]:
# Create train/validation split
df_train_tokenized = df_train_tokenized.train_test_split(test_size=0.2)

In [None]:
# define model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    problem_type='multi_label_classification'
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# alternatively, load checkpoint from previous finetuning
model = AutoModelForSequenceClassification.from_pretrained("matthiasr/genre_pred_model_reduced")

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# check whether cuda is available
print(torch.cuda.is_available())

True


In [None]:
# finetune model
training_args = TrainingArguments(
    output_dir="genre_pred_model_reduced_6_epochs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_train_tokenized["train"],
    eval_dataset=df_train_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1063,0.100637,0.559297,0.717157,0.15012
2,0.0957,0.098809,0.57928,0.731812,0.15012


TrainOutput(global_step=3324, training_loss=0.10140883391801488, metrics={'train_runtime': 2679.6884, 'train_samples_per_second': 19.846, 'train_steps_per_second': 1.24, 'total_flos': 7051791145347072.0, 'train_loss': 0.10140883391801488, 'epoch': 2.0})

In [11]:
def predict_genres(text, tokenizer, model, id2label, threshold=0.5):
  # Tokenize the text and get model predictions
  inputs = tokenizer(text, truncation=True, padding='max_length', max_length=tokenizer.model_max_length, return_tensors="pt")
  outputs = model(**inputs)

  # Get the predicted logits (scores) for each label
  logits = outputs.logits
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(logits)

  # Apply threshold to determine the labels
  predicted_labels = (probs > threshold).tolist()[0]

  # convert ids to actual labels
  indices = [i for i, x in enumerate(predicted_labels) if x]
  genres = [id2label[x] for x in indices]
  return genres


In [14]:
# put model into eval mode
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# pipeline for Inference
# this takes quite some time
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=None)

tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
pred = pipe(df_test_raw['text'].to_list(), **tokenizer_kwargs)

with open('transformer_reduced_scores.pkl', 'wb') as f:
    pred = pickle.dump(pred, f)

In [38]:
# load saved scores, faster than predicting each time
!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/transformer_balanced_scores.pkl &> /dev/null
!wget https://raw.githubusercontent.com/MatthiasRemta/NLP_Project/main/Data/MovieSummaries/transformer_reduced_scores.pkl &> /dev/null

with open('transformer_reduced_scores.pkl', 'rb') as f:
    pred = pickle.load(f)

The model outputs a list of lists. Each of this inner lists contains one dictionary per class (label). Before we can calculate performance metrics, we need to untangle this datastructure. Also, the models just returns probabilities for each class, which need to be converted into binary.

In [44]:
threshold = 0.5

pred_list = []
for movie in pred:
  score = [0.0 for i in np.arange(0, len(label2id))]
  for label in movie:
    if label['score'] > threshold:
      idx = label2id[label['label']]
      score[idx] = 1.0
  pred_list.append(score)

In [45]:
print(classification_report(y_true=df_test['labels'], y_pred=pred_list))

              precision    recall  f1-score   support

           0       0.61      0.38      0.47      1272
           1       0.71      0.76      0.74      3863
           2       0.61      0.34      0.43       757
           3       0.00      0.00      0.00        59
           4       0.64      0.48      0.55      1308
           5       0.60      0.40      0.48      1297
           6       0.75      0.56      0.64      2549
           7       0.00      0.00      0.00       143
           8       0.00      0.00      0.00       262
           9       0.46      0.06      0.10       729
          10       0.61      0.18      0.28       248
          11       0.72      0.08      0.14       162
          12       0.67      0.50      0.57      1015
          13       0.78      0.74      0.76       827
          14       0.52      0.31      0.39       712
          15       0.65      0.45      0.53      1209
          16       0.69      0.36      0.47       729
          17       0.61    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
#write classification report to csv
report = classification_report(y_true=df_test['labels'], y_pred=pred_list, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv('classification_report_reduced_Transformer.csv', index=True, sep=';')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Alternative faster prediction

In [None]:
!pip install tqdm &> /dev/null

In [None]:
from tqdm import tqdm

model = model.to('cuda')

def predict_genres_with_probs(texts, tokenizer, model, id2label, threshold=0.5):
    with torch.no_grad():  # Speed up by not tracking gradients
        inputs = tokenizer(texts, truncation=True, padding='max_length', max_length=tokenizer.model_max_length, return_tensors="pt").to('cuda')
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.nn.Sigmoid()(logits)
        predicted_labels = (probs > threshold).cpu().tolist()
        probs_list = probs.cpu().tolist()

    results = []
    for labels, prob_values in zip(predicted_labels, probs_list):
        indices = [i for i, x in enumerate(labels) if x]
        genres = [id2label[x] for x in indices]
        genre_probs = [prob_values[x] for x in indices]
        results.append({"genres": genres, "probs": genre_probs})
        
    return results

# predict
batch_size = 32
predictions = []
for i in tqdm(range(0, len(df_test_raw['text']), batch_size), desc="Predicting", unit="batch"):
    batch_texts = df_test_raw['text'][i: i + batch_size].to_list()
    batch_predictions = predict_genres_with_probs(batch_texts, tokenizer, model, id2label)
    predictions.extend(batch_predictions)


In [None]:
# Convert the list of dictionaries to a DataFrame and then to a dataset
df_predictions = pd.DataFrame(predictions)


labels = []
for ele in df_predictions['genres']:
    labels.append(labels_to_binary(ele, label2id))

df_predictions['labels'] = labels

ds_predictions = Dataset.from_pandas(df_predictions)

#print classification_report

print(classification_report(y_true=df_test['labels'], y_pred=ds_predictions["labels"], target_names=unique_genres))

In [None]:
report = classification_report(y_true=df_test['labels'], y_pred=ds_predictions["labels"], target_names=unique_genres, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report.to_csv('classification_report_reduced_Transformer_2.csv', index=True, sep=';')