# Przetwarzanie języka naturalnego – lab9
## Mateusz Kocot

In [1]:
import json
from dataclasses import dataclass
import random
from pprint import pprint

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets

import pandas as pd

In [2]:
QUESTIONS_PATH = '../../simple-legal-questions-pl/questions.jl'
RELEVANT_PATH = '../../simple-legal-questions-pl/relevant.jl'
PASSAGES_PATH ='../../simple-legal-questions-pl/passages.jl'
ANSWERS_PATH = '../../simple-legal-questions-pl/answers.jl'

TEST_RANGE = (1345, 1366) # left: inclusive, right: exclusive

# Load and preprocess data

In [3]:
def load_jl(path, dict_key=None, keys_to_delete=[]):
    with open(path, encoding='utf-8') as file:
        lines = [json.loads(line) for line in file]

    if dict_key is not None:
        lines_dict = {}
        for line in lines:
            key = line[dict_key]
            del line[dict_key]
            lines_dict[key] = line
        lines = lines_dict
        
    if keys_to_delete:
        lines_it = lines.values() if type(lines) == dict else lines
        for line in lines_it:
            for key in keys_to_delete:
                del line[key]
        
    return lines
    
questions_jl = load_jl(QUESTIONS_PATH, dict_key='_id')
relevant_jl = load_jl(RELEVANT_PATH, dict_key='question-id', keys_to_delete=['score'])
passages_jl = load_jl(PASSAGES_PATH, dict_key='_id')
answers_jl = load_jl(ANSWERS_PATH, dict_key='question-id', keys_to_delete=['score'])

In [4]:
@dataclass
class Question:
    id_: str
    question_text: str
    passage: str
    answer: str
    title: str

In [5]:
questions_dataset = {}
for id_, question_dict in questions_jl.items():
    # do not process if there is no answer or the answer is empty
    if id_ not in answers_jl or 'answer' not in answers_jl[id_] or not answers_jl[id_]['answer']:
        continue

    question_text = question_dict['text']
    passage_id = relevant_jl[id_]['passage-id']
    passage = passages_jl[passage_id]['text']
    answer = answers_jl[id_]['answer']
    title = passages_jl[passage_id]['title']
    
    question = Question(id_, question_text, passage, answer, title)
    questions_dataset[id_] = question
    
len(questions_dataset)

264

# Task 3

In [6]:
questions_test = [questions_dataset[str(id_)] for id_ in range(*TEST_RANGE)]
questions_test_set = {q.question_text for q in questions_test}

len(questions_test), len(questions_test_set)

(21, 19)

In [7]:
questions_pretrain = {}
for id_, question in questions_dataset.items():
    if question.question_text not in questions_test_set:
        questions_pretrain[id_] = question
        
len(questions_pretrain)

242

# Task 4

In [8]:
random.seed(421)
questions_val = list(questions_pretrain.values())
random.shuffle(questions_val)

l = int(0.2 * len(questions_pretrain))
questions_val, _ = questions_val[:l], questions_val[l:]

questions_val_set = {q.question_text for q in questions_val}

len(questions_val), len(questions_val_set)

(48, 44)

In [9]:
questions_train = {}
for id_, question in questions_pretrain.items():
    if question.question_text not in questions_val_set:
        questions_train[id_] = question
        
questions_train = list(questions_train.values())
len(questions_train)

167

# Task 5
Use 2000 questions from SQUAD

In [10]:
squad_dataset = load_dataset('squad')

Found cached dataset squad (C:/Users/MatiX/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [12]:
squad_dataset_train = Dataset.from_dict(squad_dataset['train'][:2000]) # take only 2000
squad_dataset_train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 2000
})

# Transform to the squad format

In [13]:
def to_squad(dataset: list[Question]):
    result = {
        'answers': [],
        'context': [],
        'id': [],
        'question': [],
        'title': []
    }
    for question in dataset:
        result['answers'].append({
            'answer_start': [0], # not provided in the dataset
            'text': [question.answer]
        })
        result['context'].append(question.passage)
        result['id'].append(question.id_)
        result['question'].append(question.question_text)
        result['title'].append(question.title)
    return Dataset.from_dict(result)

datasets = DatasetDict({
    'train': concatenate_datasets([to_squad(questions_train), squad_dataset_train]),
    'validation': to_squad(questions_val),
    'test': to_squad(questions_test)
})

In [14]:
datasets

DatasetDict({
    train: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 2167
    })
    validation: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 48
    })
    test: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 21
    })
})

In [15]:
print(datasets['train'])
pprint(datasets['train'][0])

Dataset({
    features: ['answers', 'context', 'id', 'question', 'title'],
    num_rows: 2167
})
{'answers': {'answer_start': [0],
             'text': ['postępowanie wszczęte w wyniku dokonania zgłoszenia '
                      'lub złożenia wniosku podlega umorzeniu, bądź '
                      'czynność uzależniona od opłaty zostaje zaniechana.']},
 'context': 'Art. 223. 1. Opłaty jednorazowe za zgłoszenia, wnioski, '
            'oświadczenia i inne czynności przewidziane w ustawie powinny być '
            'uiszczane z góry, o ile ustawa lub rozporządzenie, o którym mowa '
            'w art. 222 ust. 3, nie przewiduje uiszczenia opłaty na wezwanie '
            'Urzędu Patentowego w określonym terminie. 2. Opłata jednorazowa '
            'za zgłoszenie może być również uiszczona w ciągu jednego miesiąca '
            'od daty doręczenia wezwania Urzędu Patentowego. 3. Jeżeli w '
            'wyniku złożonego wniosku o ponowne rozpatrzenie sprawy decyzja '
            '

In [16]:
datasets.save_to_disk('data/datasets')

Saving the dataset (0/1 shards):   0%|          | 0/2167 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/48 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21 [00:00<?, ? examples/s]

# Task 6
Changes in *run_seq2seq_qa.py*:
- Replaced `load_dataset` with `load_from_disk`.

Used three models: allegro/plt5-small, allegro/plt5-base and google/mt5-small (larger models did not fit into my 8GB VRAM). Fine-tuning with following commands.

```bash
python src/run_seq2seq_qa.py \
    --model_name_or_path allegro/plt5-small \
    --dataset_name data/datasets \
    --do_train \
    --do_eval \
    --do_predict \
    --predict_with_generate \
    --per_device_train_batch_size 8 \
    --learning_rate 3e-5 \
    --num_train_epochs 2 \
    --output_dir ./output/allegro_small/ \
    --overwrite_output_dir
```

```bash
python src/run_seq2seq_qa.py \
    --model_name_or_path allegro/plt5-base \
    --dataset_name data/datasets \
    --do_train \
    --do_eval \
    --do_predict \
    --predict_with_generate \
    --per_device_train_batch_size 3 \
    --learning_rate 3e-5 \
    --num_train_epochs 2 \
    --output_dir ./output/allegro_base/ \
    --overwrite_output_dir
```

```bash
python src/run_seq2seq_qa.py \
    --model_name_or_path google/mt5-small \
    --dataset_name data/datasets \
    --do_train \
    --do_eval \
    --do_predict \
    --predict_with_generate \
    --per_device_train_batch_size 6 \
    --learning_rate 3e-5 \
    --num_train_epochs 2 \
    --output_dir ./output/google_small/ \
    --overwrite_output_dir
```

# Task 7 & 8
- best results with allegro_base (allegro/plt5-base)

In [17]:
model_paths = ['output/allegro_small', 'output/allegro_base', 'output/google_small']
model_names = ['allegro_small', 'allegro_base', 'google_small']

eval_exact_match = []
eval_f1 = []
test_exact_match = []
test_f1 = []

for path, model_name in zip(model_paths, model_names):
    with open(path + '/all_results.json', encoding='utf-8') as file:
        results = json.load(file)
    eval_exact_match.append(results['eval_exact_match'])
    eval_f1.append(results['eval_f1'])
    test_exact_match.append(results['test_exact_match'])
    test_f1.append(results['test_f1'])
    
pd.DataFrame({'model name': model_names, 'val_exact_match': eval_exact_match, 'eval_f1': eval_f1, 'test_exact_match': test_exact_match, 'test_f1': test_f1})

Unnamed: 0,model name,val_exact_match,eval_f1,test_exact_match,test_f1
0,allegro_small,0.0,3.55564,0.0,2.721088
1,allegro_base,0.0,10.135155,0.0,6.052358
2,google_small,0.0,3.461071,0.0,5.614851


# Task 9

In [18]:
best_model_path = 'output/allegro_base'
best_model_name = 'allegro/plt5-base'

tokenizer = AutoTokenizer.from_pretrained(best_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(best_model_path)

# tokenizer = AutoTokenizer.from_pretrained("allegro/plt5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("allegro/plt5-base")

In [19]:
def predict_answer(question, context):
    inputs = tokenizer(f"question: {question} context: {context}", return_tensors='pt')
    outputs = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [20]:
for d in datasets['test']:
    print('Question:', d['question'])
    print('Expected answer:', d['answers']['text'][0])
    answer = predict_answer(d['question'], d['context'])
    print('Returned answer:', answer)
    print('==================================================')

Question: Czy źródła finansowania partii politycznych mogą być niejawne?
Expected answer: Źródła finansowania partii politycznych są jawne.




Returned answer: 規. 2. W przypadku niezłożenia informacji o źródłach finansowania partii politycznych Sąd wydaje postanowienie o w
Question: Czy wnuki przyjęte na wychowanie mają prawo do renty po zmarłych rodzicach?
Expected answer: Nie
Returned answer: 規: art. 68.̋.. context:誰.. context
Question: Kiedy kurator składa wniosek o ogłoszenie upadłości osoby prawnej?
Expected answer: jeżeli stwierdzi, że istnieją podstawy do ogłoszenia upadłości.
Returned answer: 規: art. 30.̋. Kurator: art. 30.
Question: Czy dawka promieniowania jonizującego pochodzącego ze źródeł naturalnych może przekroczyć dawkę graniczną?
Expected answer: Nie
Returned answer: 規. §2. Dawka promieniowania jonizującego pochodzącego ze źródeł naturalnych nie może przekroczyć dawki
Question: Czy dawka promieniowania jonizującego pochodzącego ze źródeł naturalnych może przekroczyć dawkę graniczną?
Expected answer: Nie
Returned answer: 規. Art. 13. 1. Dawka graniczna nie może przekroczyć dawki granicznej. 2. Dawki
Question: J

# Task 11
#### Which pre-trained model performs better on that task?
#### Does the performance on the validation dataset reflects the performance on your test set?
#### What are the outcomes of the model on your own questions? Are they satisfying? If not, what might be the reason for that?
#### Why extractive question answering is not well suited for inflectional languages?
#### Why you have to remove the duplicated questions from the training and the validation subsets?