In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec

import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument

from sklearn.model_selection import KFold

from sklearn.base import clone as sklearn_clone

from datasets import Dataset, DatasetDict
import pandas as pd
import torch

from transformers import AutoTokenizer

from transformers import BertAdapterModel, BertConfig

from transformers import PfeifferConfig

from transformers import TrainingArguments, AdapterTrainer, EvalPrediction
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

torch.cuda.is_available()

True

In [2]:
# https://stackoverflow.com/questions/58497442/best-training-methods-for-binary-text-classification-using-doc2vec-gensim

## Load data

In [3]:
# Per-sample topic table; the first CSV column is used as the index.
df_topics = pd.read_csv(
    '../datasets/ready2use/topics.csv',
    index_col=0,
)
df_topics.shape

(6541, 1)

In [4]:
# Read the ';'-separated, header-less corpus; the first column becomes the index.
df = pd.read_csv(
    '../datasets/ready2use/text_celan_pl_dataset.csv',
    sep=';',
    header=None,
    index_col=0,
    names=['sentence', 'labels'],
)

# Flatten line breaks inside each sentence and make the labels integers.
df = df.assign(
    sentence=df['sentence'].map(lambda text: text.replace('\n', ' ').strip()),
    labels=df['labels'].astype(int),
)

# Shuffle all rows with a fixed seed.
df = df.sample(frac=1, random_state=111)

# Keep only the samples that also appear in the topic table.
df = df.loc[df.index.isin(df_topics.index)]
df.shape

(6541, 2)

In [5]:
# Tokenizer of the 'allegro/herbert-large-cased' checkpoint; encode_batch uses it to tokenize sentences.
tok = AutoTokenizer.from_pretrained('allegro/herbert-large-cased')

In [6]:
# Distinct label values present in the corpus, in ascending order.
labels = sorted(set(df['labels']))
num_labels = len(labels)

# Two-way mapping between label values and contiguous class ids 0..num_labels-1.
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = dict(enumerate(labels))

def encode_batch(batch):
    """Tokenize the 'sentence' entries of a batch with the module-level tokenizer `tok`.

    The batch's 'labels' entry is replaced in place by a NumPy array. Every
    sentence is truncated / padded to exactly 64 tokens. Returns the
    tokenizer output for the batch.
    """
    batch['labels'] = np.array(batch['labels'])
    encoded = tok(
        text=batch['sentence'],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    return encoded

### Kfold

In [7]:
# Topic-wise cross-validation: the unique topics are cut into 10 groups and each
# group in turn is held out, so no topic is shared between a fold's train and
# test side.
#   cv_fold   -> [train, test] pairs of index labels of df_topics (sample ids)
#   cv_fold_i -> the same pairs expressed as 0-based row positions
cv_fold = []
cv_fold_i = []

# np.array_split yields the same equal-sized groups as reshape(10, -1) when the
# number of topics is a multiple of 10, and still works (groups differing by at
# most one topic) when it is not, where reshape would raise.
for i in np.array_split(df_topics['topic'].unique(), 10):
    # Membership mask of the held-out topics, computed once per fold.
    in_test = np.isin(df_topics["topic"], i)

    train_cv = df_topics.index[~in_test].values
    test_cv = df_topics.index[in_test].values

    # Row positions of the same samples.
    train_cv_i = np.flatnonzero(~in_test)
    test_cv_i = np.flatnonzero(in_test)

    cv_fold.append([train_cv, test_cv])
    cv_fold_i.append([train_cv_i, test_cv_i])

In [8]:
# Plain shuffled 10-fold split over all samples (topics are ignored here).
# random_state pins the shuffle so the folds, and therefore the reported
# scores, are the same on every run; 111 is the seed value already used when
# shuffling df.
kf = KFold(n_splits=10, shuffle=True, random_state=111)

#   cv_Kfold   -> [train, test] pairs of index labels of df_topics (sample ids)
#   cv_Kfold_i -> the same pairs expressed as 0-based row positions
cv_Kfold = []
cv_Kfold_i = []

for train_index, test_index in kf.split(df_topics):
    train_cv = df_topics.index[train_index].values
    test_cv = df_topics.index[test_index].values

    # KFold.split already yields row positions, so they are stored as-is.
    train_cv_i = train_index
    test_cv_i = test_index

    cv_Kfold.append([train_cv, test_cv])
    cv_Kfold_i.append([train_cv_i, test_cv_i])

## Experiments

In [9]:
# Encoder configuration for the HerBERT checkpoint, sized to this task's label set.
config = BertConfig.from_pretrained('allegro/herbert-large-cased', num_labels=num_labels)

# Adapter configuration created with its default arguments.
adapter_config = PfeifferConfig()

# Training hyper-parameters shared by every cross-validation fold.
training_args = TrainingArguments(
    # output location (checkpoints are overwritten between runs)
    output_dir="./training_output",
    overwrite_output_dir=True,

    # ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,

    # optimisation
    num_train_epochs=10,
    learning_rate=1e-4,
    warmup_steps=600,
    weight_decay=0.05,
    max_grad_norm=1.0,
    label_smoothing_factor=0.0,
    gradient_accumulation_steps=1,
    fp16=True,

    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,

    # Logging / evaluation options that are currently disabled:
    #     logging_strategy="no"
    #     logging_steps=200, logging_first_step=False, logging_dir='./tb_logs',
    #     evaluation_strategy='steps', report_to='tensorboard', save_steps=200
)

def compute_accuracy_f1score(p: EvalPrediction):
    """Compute accuracy and F1 for one evaluation pass.

    The predicted class id is the arg-max over axis 1 of ``p.predictions``.
    Predicted and reference ids are both mapped back to label values through
    the module-level ``id2label`` before scoring.

    Returns a dict with the keys ``"accuracy"`` and ``"f1score"``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    y_pred = [id2label[class_id] for class_id in predicted_ids]
    y_true = [id2label[class_id] for class_id in p.label_ids]

    metrics = {}
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["f1score"] = f1_score(y_true, y_pred)
    return metrics

In [10]:
# Use the topic-based folds (cv_fold) for the experiment that follows.
cv = cv_fold

In [None]:
# Per-fold scores collected over the folds currently selected in `cv`.
acc, f1 = [], []

# 1-based fold counter, used only to give each fold's adapter its own name.
j = 1

for train_cv, test_cv in cv:
    # Split the corpus by sample id into this fold's train / validation parts.
    ds = DatasetDict({
        'train': Dataset.from_pandas(df.loc[df.index.isin(train_cv)]),
        'valid': Dataset.from_pandas(df.loc[df.index.isin(test_cv)]),
    })

    # Tokenize in batches as large as the training split, then expose only the
    # model inputs and labels as torch tensors.
    ds = ds.map(encode_batch, batched=True, batch_size=ds['train'].num_rows)
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # A fresh pretrained encoder is loaded for every fold.
    model = BertAdapterModel.from_pretrained('allegro/herbert-large-cased', config=config)
    model = model.to('cuda')

    adapter_name = f'fake_news_{j}'

    # Attach a new adapter and a matching classification head, then select the
    # adapter for training.
    model.add_adapter(adapter_name, config=adapter_config)
    model.add_classification_head(
        adapter_name,
        num_labels=num_labels,
        id2label=id2label,
    )
    model.train_adapter(adapter_name)

    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["valid"],
        compute_metrics=compute_accuracy_f1score,
    )
    trainer.train()

    # Score the trained adapter on the held-out part of the fold.
    eval_out = trainer.evaluate()
    acc.append(eval_out['eval_accuracy'])
    f1.append(eval_out['eval_f1score'])

    j += 1

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at allegro/herbert-large-cased were not used when initializing BertAdapterModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 

Step,Training Loss
500,0.6638
1000,0.6029
1500,0.5605
2000,0.5335
2500,0.5169
3000,0.4935
3500,0.4593
4000,0.4342
4500,0.4053
5000,0.3708


Saving model checkpoint to ./training_output/checkpoint-500
Configuration saved in ./training_output/checkpoint-500/fake_news_1/adapter_config.json
Module weights saved in ./training_output/checkpoint-500/fake_news_1/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-500/fake_news_1/head_config.json
Module weights saved in ./training_output/checkpoint-500/fake_news_1/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/fake_news_1/head_config.json
Module weights saved in ./training_output/checkpoint-500/fake_news_1/pytorch_model_head.bin
Saving model checkpoint to ./training_output/checkpoint-1000
Configuration saved in ./training_output/checkpoint-1000/fake_news_1/adapter_config.json
Module weights saved in ./training_output/checkpoint-1000/fake_news_1/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-1000/fake_news_1/head_config.json
Module weights saved in ./training_output/checkpoint-1000/fake_news_1/pytorch_model

Module weights saved in ./training_output/checkpoint-7000/fake_news_1/pytorch_model_head.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 595
  Batch size = 32


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading weights file https://huggingface.co/allegro/herbert-large-cased/resolve/main/pytorch_model.bin from cache at /home/marek/.cache/huggingface/transformers/ca58839b8e4b1222703e13158ffeb3a5a7330260cbc39513f74710674d70268b.ad71128a5739887a02bfa6de2fa8768f86e02cd13d0c308873f4cdba254e4c7c


In [None]:
# Convert the per-fold score lists to arrays so mean / std are available.
acc = np.array(acc)
f1 = np.array(f1)

# "mean+-std" over the folds, three decimals each.
acc_summary = f'{acc.mean():.3f}+-{acc.std():.3f}'
f1_summary = f'{f1.mean():.3f}+-{f1.std():.3f}'

print(
    'adapters',
    f'Accuracy {acc_summary}',
    f'F1 Score {f1_summary}',
    f' {acc_summary} | {f1_summary}'
)

In [None]:
# Switch to the shuffled K-fold splits (cv_Kfold) for the experiment that follows.
cv = cv_Kfold

In [None]:
# Per-fold scores collected over the folds currently selected in `cv`.
acc = []
f1 = []

# j is the 1-based fold number, used to name this fold's adapter.
for j, (train_cv, test_cv) in enumerate(cv, start=1):
    # Split the corpus by sample id into this fold's train / validation parts.
    ds = DatasetDict()
    ds['train'] = Dataset.from_pandas(df[df.index.isin(train_cv)])
    ds['valid'] = Dataset.from_pandas(df[df.index.isin(test_cv)])

    # Tokenize in batches as large as the training split, then expose only the
    # model inputs and labels as torch tensors.
    ds = ds.map(encode_batch, batched=True, batch_size=len(ds['train']))
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # A fresh pretrained encoder is loaded for every fold.
    model = BertAdapterModel.from_pretrained('allegro/herbert-large-cased', config=config).to('cuda')

    # Define the adapter name inside this cell. Previously the loop used
    # whatever value an earlier cell had left in `adapter_name`, so it reused
    # one stale name for every fold and raised NameError when run without
    # that earlier cell.
    adapter_name = 'fake_news_' + str(j)

    # Attach a new adapter and a matching classification head, then select the
    # adapter for training.
    model.add_adapter(adapter_name, config=adapter_config)
    model.add_classification_head(
        adapter_name,
        id2label=id2label,
        num_labels=num_labels
      )

    model.train_adapter(adapter_name)

    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["valid"],
        compute_metrics=compute_accuracy_f1score,
    )

    trainer.train()

    # Score the trained adapter on the held-out part of the fold.
    eval_out = trainer.evaluate()

    acc.append(eval_out['eval_accuracy'])
    f1.append(eval_out['eval_f1score'])

In [None]:
# Convert the per-fold score lists to arrays so mean / std are available.
acc = np.array(acc)
f1 = np.array(f1)

# "mean+-std" over the folds, three decimals each.
acc_summary = f'{acc.mean():.3f}+-{acc.std():.3f}'
f1_summary = f'{f1.mean():.3f}+-{f1.std():.3f}'

print(
    'adapters',
    f'Accuracy {acc_summary}',
    f'F1 Score {f1_summary}',
    f' {acc_summary} | {f1_summary}'
)