In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = BertForSequenceClassification.from_pretrained("distilbert-base-uncased").cuda()

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.layer.10.attention.output.LayerNorm.weight', 'encoder.layer.7.output.dense.bias', 'encoder.layer.2.attention.self.query.weight', 'encoder.layer.2.attention.output.LayerNorm.weight', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.6.output.dense.weight', 'encoder.layer.5.attention.self.key.weight', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.7.attention.self.value.bias', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.4.attention.output.dense.bias', 'encoder.layer.3.attention.output.dense.bias', 'encoder.layer.8

In [7]:
dataset = load_dataset("json", data_files='train.jsonl')
dataset.with_format("torch")
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'claim', 'label'],
        num_rows: 132448
    })
    test: Dataset({
        features: ['document', 'claim', 'label'],
        num_rows: 33113
    })
})

In [4]:
dataset['train'][:5]

{'document': ['In jurisdictions using a common law system, the doctrine of stare decisis applies, whereby the principles applied by the supreme court in its decisions are binding upon all lower courts; this is intended to apply a uniform interpretation and implementation of the law. In civil law jurisdictions the doctrine of stare decisis is not generally considered to apply, so the decisions of the supreme court are not necessarily binding beyond the immediate case before it; however, in practice the decisions of the supreme court usually provide a very strong precedent, or jurisprudence constante, for both itself and all lower courts.',
  "The cantons have a permanent constitutional status and, in comparison with the situation in other countries, a high degree of independence. Under the Federal Constitution, all 26 cantons are equal in status. Each canton has its own constitution, and its own parliament, government and courts. However, there are considerable differences between the i

In [5]:
dataset['train'].features

{'document': Value(dtype='string', id=None),
 'claim': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None)}

In [6]:
dataset['train'][0]

{'document': 'In jurisdictions using a common law system, the doctrine of stare decisis applies, whereby the principles applied by the supreme court in its decisions are binding upon all lower courts; this is intended to apply a uniform interpretation and implementation of the law. In civil law jurisdictions the doctrine of stare decisis is not generally considered to apply, so the decisions of the supreme court are not necessarily binding beyond the immediate case before it; however, in practice the decisions of the supreme court usually provide a very strong precedent, or jurisprudence constante, for both itself and all lower courts.',
 'claim': 'What does stare decisis say about decisions made by lower courts in regard the supreme court?',
 'label': 'NEI'}

In [7]:
# max_length = 0
# for sen in dataset['train']['document']:
#     length = len(tokenizer.tokenize(sen))
#     max_length = max(length, max_length)
# max_length

Token indices sequence length is longer than the specified maximum sequence length for this model (700 > 512). Running this sequence through the model will result in indexing errors


853

In [8]:
def create_input_sentence(document, claim):
    return f"Given claim-document pair where claim: \"{claim}\", document: \"{document}\". Classify the claim to which class it belongs. If the claim contains information about the document, its label will be SUPPORTED, otherwise, its label will be REFUTED. In case the information of the claim cannot be verified based on the given document, its label will be NEI"

In [9]:
print(create_input_sentence(dataset['train'][100]['document'], dataset['train'][100]['claim']))

Given claim-document pair where claim: "When did Patterson join the National Cash Register Company?", document: "Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called on Flint and, in 1914, was offered CTR. Watson joined CTR as General Manager then, 11 months later, was made President when court cases relating to his time at NCR were resolved. Having learned Patterson's pioneering business practices, Watson proceeded to put the stamp of NCR onto CTR's companies. He implemented sales conventions, "generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker". His favorite slogan, "THINK", became a mantra for each company's employees. During Watson's first four years, revenues more than doubled to $9 million and the company's operations expanded to Europe, South America, Asia and Australia. "Watson had never l

In [10]:
def preprocess_function(examples):
    inputs = tokenizer.encode_plus(
        create_input_sentence(examples["claim"], examples["document"]),
        truncation=True,
        padding="max_length",
        return_tensors='pt'
    )
    label = tokenizer.encode_plus(
        examples["label"],
        truncation=True,
        padding="max_length",
        return_tensors='pt'
    )

    examples["input_ids"] = inputs['input_ids'][0]
    examples["attention_mask"] = inputs['attention_mask'][0]

    examples['labels'] = label['input_ids'][0]
    
    return examples

In [11]:
print(preprocess_function(dataset['train'][100]))

{'document': 'Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called on Flint and, in 1914, was offered CTR. Watson joined CTR as General Manager then, 11 months later, was made President when court cases relating to his time at NCR were resolved. Having learned Patterson\'s pioneering business practices, Watson proceeded to put the stamp of NCR onto CTR\'s companies. He implemented sales conventions, "generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker". His favorite slogan, "THINK", became a mantra for each company\'s employees. During Watson\'s first four years, revenues more than doubled to $9 million and the company\'s operations expanded to Europe, South America, Asia and Australia. "Watson had never liked the clumsy hyphenated title of the CTR" and chose to replace it with the more expansive t

In [12]:
train_dataset = dataset["train"].map(preprocess_function, remove_columns=dataset["train"].column_names)
test_dataset = dataset["test"].map(preprocess_function, remove_columns=dataset["test"].column_names)

Map:   0%|          | 0/132448 [00:00<?, ? examples/s]

Map: 100%|██████████| 132448/132448 [04:35<00:00, 481.47 examples/s]
Map: 100%|██████████| 33113/33113 [01:02<00:00, 525.90 examples/s]


In [13]:
# from transformers import DefaultDataCollator

# data_collator = DefaultDataCollator()

In [15]:
training_args = TrainingArguments(
    output_dir="dsc_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,
)

trainer.train()

d:\Code\temp\temp py\HuggingFace\dsc_model is already a clone of https://huggingface.co/NgThVinh/dsc_model. Make sure you pull the latest changes with `repo.git_pull()`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 6.00 GiB total capacity; 12.07 GiB already allocated; 0 bytes free; 12.10 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="dsc_model")
QA_input = {
    'claim': 'Why is model conversion important?',
    'document': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
question_answerer(QA_input)