In [1]:
import os
import glob

import spacy
import pandas as pd
from datasets import Dataset, DatasetDict
from datasets import load_from_disk
from transformers import pipeline, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer

from transformers import TrainingArguments, Trainer
import evaluate


import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score
from transformers import EvalPrediction
import torch


LABELS = ('fairness_and_equality', 'security_and_defense', 'crime_and_punishment', 'morality',
          'policy_prescription_and_evaluation', 'capacity_and_resources', 'economic', 'cultural_identity',
          'health_and_safety', 'quality_of_life', 'legality_constitutionality_and_jurisprudence',
          'political', 'public_opinion', 'external_regulation_and_reputation')

mlb = MultiLabelBinarizer()
mlb.fit([LABELS])

In [2]:
len(mlb.classes_)

14

# Join Datasets from different languages

In [9]:
df_paths = glob.glob(os.path.join('..', '..', 'data', 'preprocessed', '*train_and_dev.csv'))

In [10]:
for i, df_path_i in enumerate(df_paths):
    df_i = pd.read_csv(df_path_i, index_col='id')
    df_i['language'] = os.path.basename(df_path_i).split('_')[1]

    if i == 0:
        df = df_i
    else:
        df = pd.concat([df, df_i])

In [11]:
df

Unnamed: 0_level_0,frames,raw_text,title,content,title_and_5_sentences,title_and_10_sentences,title_and_first_paragraph,title_and_first_sentence_each_paragraph,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
24151,"Legality_Constitutionality_and_jurisprudence,H...",Байдену напомнили о его отношению к абортам в ...,Байдену напомнили о его отношению к абортам в ...,CNN напомнил о трансформации отношения Байдена...,Байдену напомнили о его отношению к абортам в ...,Байдену напомнили о его отношению к абортам в ...,Байдену напомнили о его отношению к абортам в ...,CNN напомнил о трансформации отношения Байдена...,ru
24150,"Legality_Constitutionality_and_jurisprudence,H...",В США после отмены права на аборт произошел на...,В США после отмены права на аборт произошел на...,В США после отмены права на аборт произошел на...,В США после отмены права на аборт произошел на...,В США после отмены права на аборт произошел на...,В США после отмены права на аборт произошел на...,В США после отмены права на аборт произошел на...,ru
24153,"Legality_Constitutionality_and_jurisprudence,F...","Байден пообещал американским мужчинам, что они...","Байден пообещал американским мужчинам, что они...",Решение Верховного суда об абортах ужасно. Не ...,"Байден пообещал американским мужчинам, что они...","Байден пообещал американским мужчинам, что они...","Байден пообещал американским мужчинам, что они...",Решение Верховного суда об абортах ужасно. «Эт...,ru
24152,"Legality_Constitutionality_and_jurisprudence,P...",Моё тело – моё дело: американки протестуют из-...,Моё тело – моё дело: американки протестуют из-...,Что случилось?\n\nВерховный суд США отменил ко...,Моё тело – моё дело: американки протестуют из-...,Моё тело – моё дело: американки протестуют из-...,Моё тело – моё дело: американки протестуют из-...,Что случилось? Верховный суд США отменил конст...,ru
24147,"Legality_Constitutionality_and_jurisprudence,E...",В США начались погромы из-за запрета абортов: ...,В США начались погромы из-за запрета абортов: ...,Почему скандальный закон приняли именно сейчас...,В США начались погромы из-за запрета абортов: ...,В США начались погромы из-за запрета абортов: ...,В США начались погромы из-за запрета абортов: ...,Почему скандальный закон приняли именно сейчас...,ru
...,...,...,...,...,...,...,...,...,...
25143,"Political,Security_and_defense","Kłopoty z dotarciem, czyli Objazdowy Cyrk Pana...","Kłopoty z dotarciem, czyli Objazdowy Cyrk Pana...","Każdy, kto miał kiedyś nowy samochód wie, że p...","Kłopoty z dotarciem, czyli Objazdowy Cyrk Pana...","Kłopoty z dotarciem, czyli Objazdowy Cyrk Pana...","Kłopoty z dotarciem, czyli Objazdowy Cyrk Pana...","Każdy, kto miał kiedyś nowy samochód wie, że p...",po
2528,"Policy_prescription_and_evaluation,Economic,Ex...",Rosja może uniknąć bankructwa. Zachodnia machi...,Rosja może uniknąć bankructwa. Zachodnia machi...,"Nie jest pewne, czy obecne sankcje wystarczą d...",Rosja może uniknąć bankructwa. Zachodnia machi...,Rosja może uniknąć bankructwa. Zachodnia machi...,Rosja może uniknąć bankructwa. Zachodnia machi...,"Nie jest pewne, czy obecne sankcje wystarczą d...",po
2530,"Policy_prescription_and_evaluation,Economic,Ex...",Embargo na gaz i ropę z Rosji? Jednoznaczne st...,Embargo na gaz i ropę z Rosji? Jednoznaczne st...,"Marine Le Pen, rywalka Emmanuela Macrona w wyś...",Embargo na gaz i ropę z Rosji? Jednoznaczne st...,Embargo na gaz i ropę z Rosji? Jednoznaczne st...,Embargo na gaz i ropę z Rosji? Jednoznaczne st...,"Marine Le Pen, rywalka Emmanuela Macrona w wyś...",po
2527,"Policy_prescription_and_evaluation,Economic,Qu...",Korwin-Mikke: Nakładanie na Rosję sankcji jest...,Korwin-Mikke: Nakładanie na Rosję sankcji jest...,"Prędzej Rosja sobie poradzi bez złota, niż my ...",Korwin-Mikke: Nakładanie na Rosję sankcji jest...,Korwin-Mikke: Nakładanie na Rosję sankcji jest...,Korwin-Mikke: Nakładanie na Rosję sankcji jest...,"Prędzej Rosja sobie poradzi bez złota, niż my ...",po


### Encode labels

In [12]:
labels_npy = mlb.transform(df.frames.str.lower().str.split(',')).astype(float)
df['label'] = [list(labels_npy[i, :]) for i in range(labels_npy.shape[0])]

### Do iterative stratification to create a train and holdout set, stratifying per and Create Dataset object

In [13]:
splits = 3
mskf = MultilabelStratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

In [7]:
splits = 3
mskf = MultilabelStratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

dataset = DatasetDict({})

train_dfs = {f'fold_{fold_i}': [] for fold_i in range(1, splits + 1)}
test_dfs = {f'fold_{fold_i}': [] for fold_i in range(1, splits + 1)}

for language, df_ in df.groupby('language'):

    X = df_[[col for col in df.columns if col not in ['label', 'frames']]]
    y = df_[[col for col in df.columns if col in ['label', 'frames']]]

    for fold_i, (train_index, test_index) in enumerate(mskf.split(X, y), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train_df_i = X_train.join(y_train)
        test_df_i = X_test.join(y_test)

        dataset[f'train_fold_{fold_i}_{language}'] = Dataset.from_pandas(train_df_i)
        dataset[f'test_fold_{fold_i}_{language}'] = Dataset.from_pandas(test_df_i)

        train_dfs[f'fold_{fold_i}'].append(train_df_i)
        test_dfs[f'fold_{fold_i}'].append(test_df_i)

for fold_i in range(1, splits + 1):
    dataset[f'train_fold_{fold_i}'] = Dataset.from_pandas(pd.concat(train_dfs[f'fold_{fold_i}']))
    dataset[f'test_fold_{fold_i}'] = Dataset.from_pandas(pd.concat(test_dfs[f'fold_{fold_i}']))


In [8]:
dataset

DatasetDict({
    train_fold_1_en: Dataset({
        features: ['raw_text', 'title', 'content', 'title_and_5_sentences', 'title_and_10_sentences', 'title_and_first_paragraph', 'title_and_first_sentence_each_paragraph', 'language', 'frames', 'label', 'id'],
        num_rows: 342
    })
    test_fold_1_en: Dataset({
        features: ['raw_text', 'title', 'content', 'title_and_5_sentences', 'title_and_10_sentences', 'title_and_first_paragraph', 'title_and_first_sentence_each_paragraph', 'language', 'frames', 'label', 'id'],
        num_rows: 172
    })
    train_fold_2_en: Dataset({
        features: ['raw_text', 'title', 'content', 'title_and_5_sentences', 'title_and_10_sentences', 'title_and_first_paragraph', 'title_and_first_sentence_each_paragraph', 'language', 'frames', 'label', 'id'],
        num_rows: 343
    })
    test_fold_2_en: Dataset({
        features: ['raw_text', 'title', 'content', 'title_and_5_sentences', 'title_and_10_sentences', 'title_and_first_paragraph', 'title_and

#### Save it

In [9]:
dataset.save_to_disk(os.path.join('..', '..', 'data', 'preprocessed','multilingual_train_test_ds.hf'))

Saving the dataset (0/1 shards):   0%|          | 0/342 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/172 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/343 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/171 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/343 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/171 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/140 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/71 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/117 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/59 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/117 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/59 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/118 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/58 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/202 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/202 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/202 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/129 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/65 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/129 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/65 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/130 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/64 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/128 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/127 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/64 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/127 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/64 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1059 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/530 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1059 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/530 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1060 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/529 [00:00<?, ? examples/s]

# Break dataset into chunks of 512 - tokens

In [14]:
import spacy
from math import ceil

LANGUAGES = ('en', 'it', 'fr', 'po', 'ru', 'ge')

SPACY_MODELS = {
    'en': {'small': 'en_core_web_sm', 'large': 'en_core_web_trf'},
    'it': {'small': 'it_core_news_sm', 'large': 'it_core_news_lg'},
    'fr': {'small': 'fr_core_news_sm', 'large': 'fr_dep_news_trf'},
    'po': {'small': 'pl_core_news_sm', 'large': 'pl_core_news_lg'},
    'ru': {'small': 'ru_core_news_sm', 'large': 'ru_core_news_lg'},
    'ge': {'small': 'de_core_news_sm', 'large': 'de_dep_news_trf'}
}


In [None]:
splits = 3
max_token_length = 350
word_overlap = 50
min_token_length = 30
UNITS_OF_ANALYSES = ('title', 'title_and_first_paragraph', 'title_and_5_sentences', 'title_and_10_sentences',
                     'title_and_first_sentence_each_paragraph', 'raw_text')

for u_analysis in ['raw_text']:#UNITS_OF_ANALYSES:
    print(u_analysis)
    dataset = DatasetDict({})

    train_dfs = {f'fold_{fold_i}': [] for fold_i in range(1, splits + 1)}
    test_dfs = {f'fold_{fold_i}': [] for fold_i in range(1, splits + 1)}

    for language, df_ in df.groupby('language'):
        print('\t' + language)

        nlp = spacy.load(SPACY_MODELS[language]['small'])

        X = df_[[col for col in df.columns if col not in ['label', 'frames']]]
        y = df_[[col for col in df.columns if col in ['label', 'frames']]]

        for fold_i, (train_index, test_index) in enumerate(mskf.split(X, y), start=1):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            train_df_i = X_train.join(y_train)
            test_df_i = X_test.join(y_test)

            # Subset unit of analysis and generate chunks
            train_df_i[f'{u_analysis}_chunked'] = [[doc[max(0, i * (max_token_length - word_overlap)): min(max((i + 1) * (max_token_length) - i * word_overlap, max_token_length), len(doc))].text for i in range(ceil(len(doc)/max_token_length)) ] for doc in  nlp.pipe(train_df_i[u_analysis], n_process=2)]
            test_df_i[f'{u_analysis}_chunked'] = [[doc[max(0, i * (max_token_length - word_overlap)): min(max((i + 1)* (max_token_length) - i * word_overlap, max_token_length), len(doc))].text for i in range(ceil(len(doc)/max_token_length)) ] for doc in  nlp.pipe(test_df_i[u_analysis], n_process=2)]

            train_df_i = train_df_i[[u_analysis, f'{u_analysis}_chunked', 'label', 'language']].explode(f'{u_analysis}_chunked')
            test_df_i = test_df_i[[u_analysis, f'{u_analysis}_chunked', 'label', 'language']].explode(f'{u_analysis}_chunked')

            # Filter out chunks of small length
            train_df_i['len_unit'] = [len(doc) for doc in nlp.pipe(train_df_i[u_analysis], n_process=2)]
            test_df_i['len_unit'] = [len(doc) for doc in nlp.pipe(test_df_i[u_analysis], n_process=2)]
            train_df_i['len_chunk'] = [len(doc) for doc in nlp.pipe(train_df_i[f'{u_analysis}_chunked'], n_process=2)]
            test_df_i['len_chunk'] = [len(doc) for doc in nlp.pipe(test_df_i[f'{u_analysis}_chunked'], n_process=2)]

            train_df_i = train_df_i[(train_df_i.len_unit == train_df_i.len_chunk) | (train_df_i.len_chunk > max_token_length)]
            test_df_i = test_df_i[(test_df_i.len_unit == test_df_i.len_chunk) | (test_df_i.len_chunk > max_token_length)]

            dataset[f'train_fold_{fold_i}_{language}'] = Dataset.from_pandas(train_df_i[[f'{u_analysis}_chunked', 'label']])
            dataset[f'test_fold_{fold_i}_{language}'] = Dataset.from_pandas(test_df_i[[f'{u_analysis}_chunked', 'label']])

            train_dfs[f'fold_{fold_i}'].append(train_df_i)
            test_dfs[f'fold_{fold_i}'].append(test_df_i)

    for fold_i in range(1, splits + 1):
        dataset[f'train_fold_{fold_i}'] = Dataset.from_pandas(pd.concat(train_dfs[f'fold_{fold_i}']))
        dataset[f'test_fold_{fold_i}'] = Dataset.from_pandas(pd.concat(test_dfs[f'fold_{fold_i}']))

    dataset.save_to_disk(os.path.join('..', '..', 'data', 'preprocessed',
                                      f'multilingual_train_test_{u_analysis}_max_words_length_{max_token_length}_min_words_length_{min_token_length}_chunk_word_overlap_{word_overlap}.hf'))

raw_text
	en
	fr
	ge


# Test a model

### Load datset

In [15]:
id2label= {idx:label for idx, label in enumerate(mlb.classes_)}
label2id = {label:idx for idx, label in enumerate(mlb.classes_)}

In [43]:
dataset = load_from_disk(os.path.join('..', '..', 'data', 'preprocessed','multilingual_train_test_ds.hf'))

In [8]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

In [9]:
def preprocess_data(examples, unit_of_analysis):
  # take a batch of texts
  text = examples[unit_of_analysis]

  # encode them
  #encoding = tokenizer(text, truncation=True)
  encoding = tokenizer(text, truncation=True, padding="max_length", max_length=512)
  #

  # Add their respective labels
  encoding["labels"] = examples['label']

  return encoding

In [10]:
encoded_dataset = dataset.map(lambda ex: preprocess_data(ex, 'raw_text'), batched=True, remove_columns=dataset['train'].column_names)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    #"distilbert-base-multilingual-cased",
    #'bert-base-multilingual-cased',
    problem_type="multi_label_classification",
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.out

In [None]:
batch_size = 4
metric_name = "f1"


args = TrainingArguments(
    f"xlmRoberta-finetuned-sem_eval-task-3-subtask-2",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    fp16=True
    #push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [25]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average = 'micro')

    # return as dictionary
    metrics = {
        'f1': f1_micro_average,
        'precision': precision,
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }

    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 1059
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1325
  Number of trainable parameters = 559904782


OutOfMemoryError: ignored

# Evaluate the best model on each dataset

In [16]:
best_model_path = os.path.join('..', '..', 'best_xlm_model', 'checkpoint-2120')

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    best_model_path,
    #"distilbert-base-multilingual-cased",
    #'bert-base-multilingual-cased',
    problem_type="multi_label_classification",
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id)

In [18]:
dataset = load_from_disk(os.path.join('..', '..', 'data', 'preprocessed','multilingual_train_test_ds.hf'))

In [19]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

In [20]:
def preprocess_data(examples, unit_of_analysis):
  # take a batch of texts
  text = examples[unit_of_analysis]

  # encode them
  #encoding = tokenizer(text, truncation=True)
  encoding = tokenizer(text, truncation=True, padding="max_length", max_length=512)
  #

  # Add their respective labels
  encoding["labels"] = examples['label']

  return encoding

In [21]:
encoded_dataset = dataset.map(lambda ex: preprocess_data(ex, 'raw_text'), batched=True, remove_columns=dataset['train'].column_names)

Loading cached processed dataset at /home/juanbermeo/SynologyDriveRA/Framing/data/preprocessed/multilingual_train_test_ds.hf/train/cache-eb3640d20e1b4562.arrow
Loading cached processed dataset at /home/juanbermeo/SynologyDriveRA/Framing/data/preprocessed/multilingual_train_test_ds.hf/test/cache-8bf71e5f6545175b.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/juanbermeo/SynologyDriveRA/Framing/data/preprocessed/multilingual_train_test_ds.hf/test_it/cache-d37e1f05fc654958.arrow
Loading cached processed dataset at /home/juanbermeo/SynologyDriveRA/Framing/data/preprocessed/multilingual_train_test_ds.hf/test_fr/cache-2a212ab60bdeedf8.arrow
Loading cached processed dataset at /home/juanbermeo/SynologyDriveRA/Framing/data/preprocessed/multilingual_train_test_ds.hf/test_po/cache-60a64546d63efa61.arrow
Loading cached processed dataset at /home/juanbermeo/SynologyDriveRA/Framing/data/preprocessed/multilingual_train_test_ds.hf/test_ru/cache-0f7d373aaa3f8d01.arrow
Loading cached processed dataset at /home/juanbermeo/SynologyDriveRA/Framing/data/preprocessed/multilingual_train_test_ds.hf/test_ge/cache-e62841b556e90b3c.arrow


In [26]:
batch_size = 1
metric_name = 'f1'
args = TrainingArguments(
    f"xlmRoberta-finetuned-sem_eval-task-3-subtask-2",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    fp16=True
    #push_to_hub=True,
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test_ge"],
    tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [27]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 59
  Batch size = 1
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



KeyboardInterrupt



# Zero Shot classification

In [None]:
from numba import cuda
device = cuda.get_current_device()
device.reset()

In [None]:
classifier = pipeline(model='xlm-roberta-large', task='zero-shot-classification')

In [None]:
classifier = pipeline(model="joeddav/xlm-roberta-large-xnli", task="zero-shot-classification")

In [None]:
classifier = transformers.pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")