In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 20 13:25:49 2022

@author: maxpr
"""
import json
from pathlib import Path

from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import DataLoader
from transformers import AdamW


def read_data(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers


In [2]:
train_contexts, train_questions, train_answers = read_data('privacyqa/policy_train_squad.json')
val_contexts, val_questions, val_answers = read_data('privacyqa/policy_test_squad.json')

print(f"There are {len(train_questions)} questions in the training dataset")
print(f"There are {len(val_questions)} questions in the validation dataset")

print(train_contexts[1])
print(train_questions[1])
print(train_answers[1])

There are 6775 questions in the training dataset
There are 1987 questions in the validation dataset
We encourage you to review this privacy policy periodically. We collect, store and use your data in order to operate our business and to provide our products and services to you. We may use information we collect about you to: Deliver, improve, debug, and maintain our products and services, including future products and services. Study and personalize user experiences. Perform analysis about your use of, or interest in, our or others products, services, or content, including mobile analytics. Develop, display, and track content and advertising tailored to your interests on our service and other sites, including providing our advertisements to you when you visit other sites. Fulfill legal requirements. Conduct business analysis and research, and marketing campaigns. Ensure better security and fraud protection. Perform functions or services described to you at the time of collection. We ma

In [None]:
#### Get end index of answers

def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

print(f"Added end index to training and validation datasets")


In [5]:

#### Get tokenizer

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

print(f"Tokenized training and validataion datasets")

#### Get end index of answers

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        print(answers[i])
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

print(f'Created token positions for training and validation datasets')


Tokenized training and validataion datasets
{'text': 'In normal browsing mode, the Cake browser may store your browsing history (URLs of pages that you visit, your IP address, a cache of text, images and other resources from those pages) on your local device, in an anonymous fashion that is not tied to your identity.', 'answer_start': 1060}


KeyError: 'answer_end'

In [5]:
# Create Dataset Classes

class PolicyQADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        return {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings.input_ids)


train_dataset = PolicyQADataset(train_encodings)
val_dataset = PolicyQADataset(val_encodings)

print(f'Created training and validation datasets')


Created training and validation datasets


In [6]:
bert_model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

print("Starting training")

training_args = TrainingArguments(
    output_dir='./final_training_results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./final_training_logs',            # directory for storing logs
    logging_steps=10,
    
)

trainer = Trainer(
    model=bert_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
    
)

# device = torch.device('cuda:0')
# print(device)

trainer.train()        



Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

Starting training


***** Running training *****
  Num examples = 26861
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 16790
  Number of trainable parameters = 66364418


Step,Training Loss
10,6.1044
20,6.0676
30,5.9998
40,5.928
50,5.7903
60,5.555
70,5.1818
80,4.7867
90,4.4398
100,4.367


Saving model checkpoint to ./final_training_results\checkpoint-500
Configuration saved in ./final_training_results\checkpoint-500\config.json
Model weights saved in ./final_training_results\checkpoint-500\pytorch_model.bin


KeyboardInterrupt: 

In [12]:
from pathlib import Path
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
from transformers import pipeline

# Replace this with your own checkpoint
# Replace with your custom model of choice
model_checkpoint ='final_training_results\checkpoint-273500'
# model = DistilBertForQuestionAnswering.from_pretrained(Path('final_training_results\checkpoint-273500'))
# tokenizer = AutoTokenizer.from_pretrained('final_training_results\checkpoint-273500\tokenizer.json')

question_answerer = pipeline(task='question-answering',  # replace with whatever task you have
                             model=model_checkpoint,)

# model_checkpoint = "huggingface-course/bert-finetuned-squad"
# question_answerer = pipeline("question-answering", model=model_checkpoint)

context = """
We may partner with third party advertising companies to better provide advertisements about goods and services that may be of interest to you. These third parties may use cookies alone or in conjunction with web beacons or other tracking technologies to collect information about you when you use the Sites. They may collect information about your online activities over time and across different websites and other online services. They may use this information to provide you with interest-based advertising or other targeted content. These advertising partners do not have access to or use your name, address, e-mail address, telephone number or other personally identifiable information from us, without your consent. They may, however, use persistent identifiers to anonymously track your Internet usage across other websites in their networks beyond these Sites. While we restrict their further use of such information, such third parties may, with sufficient data from other sources, be able to personally identify you, unknown to us.
"""
question = "What are the ways third parties obtain our information?"
question_answerer(question=question, context=context)

{'score': 0.9966630339622498,
 'start': 173,
 'end': 252,
 'answer': 'cookies alone or in conjunction with web beacons or other tracking technologies'}

In [20]:
from pathlib import Path
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel
# Replace this with your own checkpoint
# Replace with your custom model of choice
model_checkpoint ='nlpaueb/legal-bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
question_answerer = pipeline(task='question-answering',  # replace with whatever task you have
                             model=model_checkpoint,)

# model_checkpoint = "huggingface-course/bert-finetuned-squad"
# question_answerer = pipeline("question-answering", model=model_checkpoint)

context = """
We may partner with third party advertising companies to better provide advertisements about goods and services that may be of interest to you. These third parties may use cookies alone or in conjunction with web beacons or other tracking technologies to collect information about you when you use the Sites. They may collect information about your online activities over time and across different websites and other online services. They may use this information to provide you with interest-based advertising or other targeted content. These advertising partners do not have access to or use your name, address, e-mail address, telephone number or other personally identifiable information from us, without your consent. They may, however, use persistent identifiers to anonymously track your Internet usage across other websites in their networks beyond these Sites. While we restrict their further use of such information, such third parties may, with sufficient data from other sources, be able to personally identify you, unknown to us.
"""
question = "What are the ways third parties obtain our information?"
question_answerer(question=question, context=context)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertF

{'score': 5.261138721834868e-05,
 'start': 648,
 'end': 700,
 'answer': 'or other personally identifiable information from us'}