In [None]:
import torch
from torch import nn
import transformers
print(transformers.__version__)
print(torch.__version__)

In [None]:
from transformers import BertTokenizerFast
from model import BertForMultiHeadConfig, BertForMultiHeadModel

config = BertForMultiHeadConfig.from_pretrained('bert-base-german-cased', num_labels=2, num_multi_labels=10)
print(config.num_labels, config.num_multi_labels)
model = BertForMultiHeadModel.from_pretrained('bert-base-german-cased', config=config)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
inputs = tokenizer(["Hallo, das ist nicht so cool!"]*3, return_tensors='pt', padding=True, truncation=True)

In [None]:
classification_labels = torch.randint(10, size=(3,))

In [None]:
multilabel_classification_labels = torch.empty(3, 10).random_(2)

In [None]:
regression_target = torch.rand(3)

In [None]:
opt = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [None]:
list(model.classification_head.parameters())

In [None]:
for _ in range(20):
    opt.zero_grad()
    outputs = model(labels=classification_labels, multi_labels=multilabel_classification_labels, regression_targets=regression_target, **inputs)
    outputs.loss.backward()
    opt.step()

In [None]:
list(model.classification_head.parameters())[0]

In [None]:
model.save_pretrained('test')

In [None]:
model = BertForMultiHeadModel.from_pretrained('test', use_cache=False)

In [None]:
list(model.classification_head.parameters())

In [None]:
#from datasets import load_dataset
# genres field is a problem?!
#load_dataset('json', data_files={'train': '../data/imdb_train.json', 'val': '../data/imdb_val.json', 'test':'../data/imdb_test.json'})

In [1]:
import pandas as pd

df_train = pd.read_json('../data/imdb_train.json')
df_val = pd.read_json('../data/imdb_val.json')
df_test = pd.read_json('../data/imdb_test.json')

df_train.shape, df_val.shape, df_test.shape

((14832, 11), (781, 11), (2342, 11))

In [2]:
from datasets import Dataset, DatasetDict


dataset = DatasetDict({
    'train': Dataset.from_pandas(df_train, split='train'),
    'val': Dataset.from_pandas(df_val, split='train'),
    'test': Dataset.from_pandas(df_test, split='test')
})

In [3]:
def make_classlabel_encoder(column_name, class_labels):
    def encoder(entry):
        return {column_name: class_labels.str2int(entry[column_name])}
    return encoder

In [4]:
from datasets import ClassLabel
uniq_title_type_labels = list(set(dataset['train']['titleType']))
title_type_labels = ClassLabel(num_classes=len(uniq_title_type_labels), names=uniq_title_type_labels)

title_type_encoder = make_classlabel_encoder('titleType', title_type_labels)
dataset = dataset.map(title_type_encoder, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [5]:
updated_features = dataset['train'].features.copy()
updated_features['titleType'] = title_type_labels

dataset.cast(updated_features)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['IMDB_ID', 'averageRating', 'duration', 'genre', 'isAdult', 'numVotes', 'primaryTitle', 'runtimeMinutes', 'text', 'titleType', 'year'],
        num_rows: 14832
    })
    val: Dataset({
        features: ['IMDB_ID', 'averageRating', 'duration', 'genre', 'isAdult', 'numVotes', 'primaryTitle', 'runtimeMinutes', 'text', 'titleType', 'year'],
        num_rows: 781
    })
    test: Dataset({
        features: ['IMDB_ID', 'averageRating', 'duration', 'genre', 'isAdult', 'numVotes', 'primaryTitle', 'runtimeMinutes', 'text', 'titleType', 'year'],
        num_rows: 2342
    })
})

In [6]:
import numpy as np

def make_multilabel_encoder(column_name, class_labels):
    n_labels = len(genre_labels.names)
    def encoder(entry):
        binarized = np.zeros(n_labels, dtype='int8')
        label_idc = [class_labels.str2int(label) for label in entry[column_name]]
        binarized[label_idc] = 1
        return {column_name: binarized}
    return encoder

In [8]:
uniq_genres = set()
[uniq_genres.update(entry) for entry in dataset['train']['genre']]
uniq_genres = list(uniq_genres)

genre_labels = ClassLabel(num_classes=len(uniq_genres), names=uniq_genres)

genre_labels_encoder = make_multilabel_encoder('genre', genre_labels)
dataset = dataset.map(genre_labels_encoder)

  0%|          | 0/14832 [00:00<?, ?ex/s]

  0%|          | 0/781 [00:00<?, ?ex/s]

  0%|          | 0/2342 [00:00<?, ?ex/s]

In [11]:
updated_features = dataset['train'].features.copy()
updated_features['genre'] = genre_labels

dataset.cast(updated_features)

  0%|          | 0/2 [00:00<?, ?ba/s]

ArrowNotImplementedError: Unsupported cast from list<item: int64> to int64 using function cast_int64

In [12]:
setattr(dataset, '_genre_info', genre_labels)

In [13]:
dataset._genre_info

ClassLabel(num_classes=25, names=['Crime', 'Mystery', 'Short', 'Sport', 'Comedy', 'History', 'Music', 'Horror', 'Sci-Fi', ' Fantasy', 'War', 'Romance', 'Western', 'Family', 'Fantasy', 'Animation', 'Documentary', 'Action', 'Musical', 'Adventure', 'Talk-Show', 'Thriller', 'News', 'Drama', 'Biography'], names_file=None, id=None)

In [15]:
dataset.save_to_disk('../data/imdb_huggingface')

In [17]:
from datasets import load_from_disk
dataset = load_from_disk('../data/imdb_huggingface')

In [20]:
dataset._genre_info

AttributeError: 'DatasetDict' object has no attribute '_genre_info'

In [23]:
from datasets import DatasetInfo
DatasetInfo()

DatasetInfo(description='', citation='', homepage='', license='', features=None, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [24]:
from datasets import load_dataset

dataset = load_dataset("daily_dialog")

Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset daily_dialog/default (download: 4.27 MiB, generated: 8.23 MiB, post-processed: Unknown size, total: 12.50 MiB) to /mnt/data/users/keller/.cache/daily_dialog/default/1.0.0/c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c...


Downloading:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset daily_dialog downloaded and prepared to /mnt/data/users/keller/.cache/daily_dialog/default/1.0.0/c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c. Subsequent calls will reuse this data.
