Short notebook that builds a Hugging Face text-classification pipeline from a fine-tuned BERT model and saves it to a folder.

It also saves the `bert-base-cased` tokenizer to local storage.


In [1]:
from datetime import datetime
from pathlib import Path
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the bert-base-cased tokenizer by name and write its files to a local
# folder. Nothing else is done with the tokenizer in this cell.
checkpoint_name = "bert-base-cased"
local_tokenizer_dir = "./tokenizer_bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Left as the last bare expression so the cell displays the written file paths.
tokenizer.save_pretrained(local_tokenizer_dir)

('./tokenizer_bert-base-cased/tokenizer_config.json',
 './tokenizer_bert-base-cased/special_tokens_map.json',
 './tokenizer_bert-base-cased/vocab.txt',
 './tokenizer_bert-base-cased/added_tokens.json',
 './tokenizer_bert-base-cased/tokenizer.json')

---

Pipeline creation

In [4]:
# Settings that select which fine-tuning run to package. DATASET_SIZE is
# rendered as '<size>k' in the run name, DATASET_IS_BALANCED picks the
# 'bal' / 'imbal' tag, and the date below is appended to the run name.
DATASET_SIZE = 240
DATASET_IS_BALANCED = True
training_args_datetime = datetime(year=2023, month=12, day=18)

# Undated run name, e.g. 'bert-finetune_240k_bal'; it doubles as the name of
# the folder that holds the run's artefacts.
balance_tag = 'bal' if DATASET_IS_BALANCED else 'imbal'
training_name = f'bert-finetune_{DATASET_SIZE}k_{balance_tag}'

# Absolute path of the run folder, resolved against the current working
# directory. A missing folder is only reported, not treated as an error.
training_storing_folder = Path(training_name).resolve()
if not training_storing_folder.exists():
    print('Folder does not exist.')

# From here on training_name carries the date suffix
# (e.g. 'bert-finetune_240k_bal_2023-12-18'); the folder path computed above
# keeps the undated name.
training_name = f'{training_name}_{training_args_datetime:%Y-%m-%d}'

In [5]:
from transformers import pipeline

# The fine-tuned model is read from, and the exported pipeline written to,
# sub-folders of the run folder named after the dated run name.
model_dir = training_storing_folder / f'{training_name}_model'
pipeline_dir = training_storing_folder / f'{training_name}_pipeline'

# Fail early with a readable message. from_pretrained accepts either a local
# directory or a Hub model id, so a missing folder would otherwise surface as
# an error about the model identifier rather than about the folder.
if not model_dir.is_dir():
    raise FileNotFoundError(f'Fine-tuned model folder not found: {model_dir}')

# Build a text-classification pipeline from the fine-tuned weights and the
# bert-base-cased tokenizer (loaded by name, independent of any other cell).
my_pipeline = pipeline(
    'text-classification',
    model=AutoModelForSequenceClassification.from_pretrained(str(model_dir)),
    tokenizer=AutoTokenizer.from_pretrained("bert-base-cased"),
)

# Write the model and tokenizer together as one reloadable pipeline folder.
my_pipeline.save_pretrained(str(pipeline_dir))

print('PIPELINE SAVED')
print('\n\n')

PIPELINE SAVED



