In [121]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import re
import string
import random

In [20]:
def load_json_file(file_path):
    if file_path.endswith('.jl'):
        return pd.read_json(file_path, lines=True)
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

train_path = "poquad-train.json"
val_path = "poquad-dev.json"
train_data = load_json_file(train_path)['data']
val_data = load_json_file(val_path)['data']

questions_path = "lab9_data/questions.jl"
answers_path = "lab9_data/answers.jl"
passages_path = "lab9_data/passages.jl"
relevant_path = "lab9_data/relevant.jl"
questions = load_json_file(questions_path)
answers = load_json_file(answers_path)
passages = load_json_file(passages_path)
relevant = load_json_file(relevant_path)

In [5]:
def preprocess_data(data):
    examples = []
    for article in data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                if 'answers' in qa and qa['answers']:
                    for answer in qa['answers']:
                        output_text = answer['text']
                        input_text = f"Pytanie: {question}; kontekst: {context}"
                        examples.append((input_text, output_text))
    return examples

qa_train = preprocess_data(train_data)
qa_val = preprocess_data(val_data)
print(len(qa_train))
print(len(qa_val))

qa_val_df = pd.DataFrame(qa_val, columns=['question','answer'])
qa_train_df = pd.DataFrame(qa_train, columns=['question','answer'])

46187
5764


I am using the trained model:

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("apohllo/plt5-base-poquad")
model = AutoModelForSeq2SeqLM.from_pretrained("apohllo/plt5-base-poquad")

In [66]:
qa = pd.merge(questions, answers, left_on="_id", right_on="question-id")
qa = pd.merge(qa, relevant, on="question-id")
qa = pd.merge(qa, passages, left_on="passage-id", right_on="_id")
qa = qa[['text_x', 'answer', 'text_y']]
qa.columns=['question', 'answer', 'context']
qa["question"] = qa.apply(lambda row: f"pytanie: {row['question']}; kontekst: {row['context']}", axis=1)
qa.drop(columns='context')

Unnamed: 0,question,answer
0,"pytanie: Czy żołnierz, który dopuszcza się czy...","Tak, podlega karze aresztu wojskowego albo poz..."
1,pytanie: Z ilu osób składa się komisja przetar...,Komisja przetargowa składa się z co najmniej t...
2,pytanie: Do jakiej wysokości za zobowiązania s...,Komandytariusz odpowiada za zobowiązania spółk...
3,pytanie: Kiedy ustala się wartość majątku obro...,Wartość rzeczowych składników majątku obrotowe...
4,"pytanie: Jakiej karze podlega armator, który w...",Podlega karze pieniężnej do wysokości 1 000 00...
...,...,...
676,pytanie: Jakim przepisom podlegają przychody k...,"ogólnym przepisom podatkowym, z wyjątkami okre..."
677,pytanie: Jakim przepisom podlegają przychody k...,"ogólnym przepisom podatkowym, z wyjątkami okre..."
678,pytanie: Jakim przepisom podlegają przychody k...,"ogólnym przepisom podatkowym, a w szczególnośc..."
679,pytanie: Jakim przepisom podlegają przychody k...,"ogólnym przepisom podatkowym, z wyjątkami okre..."


In [70]:
def preprocess(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    return text

def compute_metrics(reference_answers, predicted_answers):
    exact_matches = 0
    total = len(reference_answers)
    f1_scores = []
    
    for ref, pred in zip(reference_answers, predicted_answers):

        # Preprocessing to skip punctuation marks and converting to lower
        ref = preprocess(ref)
        pred = preprocess(pred)
        
        ref_tokens = ref.split()
        pred_tokens = pred.split()

        if ref == pred:
            exact_matches += 1
        
        # F1 score
        common = set(ref_tokens) & set(pred_tokens)
        precision = len(common) / len(pred_tokens) if pred_tokens else 0
        recall = len(common) / len(ref_tokens) if ref_tokens else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
        f1_scores.append(f1)
    
    # metrics computing
    exact_match = (exact_matches / total) * 100
    avg_f1 = (sum(f1_scores) / total) * 100
    
    return {"EM": round(exact_match,4), "F1": round(avg_f1,4)}

In [105]:
def generate_answers(questions):
    generated_answers = []

    for i, question in enumerate(questions):
        inputs = tokenizer(question, return_tensors="pt")
        outputs = model.generate(**inputs, max_length=50)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_answers.append(answer)
    
        # Printing progress
        if i % 100 == 0 and i > 0:
            print(f"Processed {round(i/len(questions)*100,2)}% questions so far")


    return generated_answers

In [74]:
qa_questions = list(qa['question'])
qa_correct_answers = list(qa['answer'])
qa_generated_answers = generate_answers(qa_questions)

Processed 14.68% questions so far
Processed 29.37% questions so far
Processed 44.05% questions so far
Processed 58.74% questions so far
Processed 73.42% questions so far
Processed 88.11% questions so far


In [76]:
compute_metrics(qa_generated_answers, qa_correct_answers)

{'EM': 24.6696, 'F1': 43.8787}

| **Metric** | **Score (%)** |
|------------|---------------|
| Exact Match (EM) | 24.6696          |
| F1 Score        | 43.8787          |

In [80]:
qa['generated_answer'] = qa_generated_answers

In [107]:
qa_val_questions = list(qa_val_df['question'])
qa_val_correct_answers = list(qa_val_df['answer'])
qa_val_generated_answers = generate_answers(qa_val_questions)

Processed 1.73% questions so far
Processed 3.47% questions so far
Processed 5.2% questions so far
Processed 6.94% questions so far
Processed 8.67% questions so far
Processed 10.41% questions so far
Processed 12.14% questions so far
Processed 13.88% questions so far
Processed 15.61% questions so far
Processed 17.35% questions so far
Processed 19.08% questions so far
Processed 20.82% questions so far
Processed 22.55% questions so far
Processed 24.29% questions so far
Processed 26.02% questions so far
Processed 27.76% questions so far
Processed 29.49% questions so far
Processed 31.23% questions so far
Processed 32.96% questions so far
Processed 34.7% questions so far
Processed 36.43% questions so far
Processed 38.17% questions so far
Processed 39.9% questions so far
Processed 41.64% questions so far
Processed 43.37% questions so far
Processed 45.11% questions so far
Processed 46.84% questions so far
Processed 48.58% questions so far
Processed 50.31% questions so far
Processed 52.05% quest

In [109]:
compute_metrics(qa_val_correct_answers, qa_val_generated_answers)

{'EM': 42.3144, 'F1': 59.7899}

| **Metric** | **Score (%)** |
|------------|---------------|
| Exact Match (EM) | 42.3144         |
| F1 Score        | 59.7899          |

In [207]:
qa_val_df['generated answer'] = qa_val_generated_answers

In [193]:
def get_f1_scores(reference_answers, predicted_answers):
    total = len(reference_answers)
    f1_scores = []
    
    for ref, pred in zip(reference_answers, predicted_answers):

        # Preprocessing to skip punctuation marks and converting to lower
        ref = preprocess(ref)
        pred = preprocess(pred)
        
        ref_tokens = ref.split()
        pred_tokens = pred.split()
        
        # F1 score
        common = set(ref_tokens) & set(pred_tokens)
        precision = len(common) / len(pred_tokens) if pred_tokens else 0
        recall = len(common) / len(ref_tokens) if ref_tokens else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
        f1_scores.append(f1)

    return f1_scores

In [195]:
scores = get_f1_scores(qa_val_generated_answers, qa_val_correct_answers) 

In [199]:
qa_val_df['f1-score'] = scores

In [209]:
qa_val_df[qa_val_df['f1-score'] == 1.0]

Unnamed: 0,question,answer,f1-score,generated answer
1,Pytanie: Z ilu komponentów składała się Tora p...,dwóch,1.0,dwóch
4,Pytanie: Kto początkowo należał do oddziału st...,"280 strzelców, kilkuset chłopów kosynierów i 6...",1.0,"280 strzelców, kilkuset chłopów kosynierów i 6..."
5,Pytanie: Kiedy Plater wraz z oddziałem dotarła...,29 marca 1831,1.0,29 marca 1831
10,Pytanie: Wynagrodzenie w jakiej wysokości otrz...,pięćset tysięcy funtów,1.0,pięćset tysięcy funtów
14,Pytanie: Co oprócz placków ziemniaczanych smaż...,"pączki, faworki i bliny",1.0,"pączki, faworki i bliny"
...,...,...,...,...
5732,Pytanie: Jak wyglądała próba zmierzenia się te...,nagrywając koncert na orkiestrę i grupę rockową,1.0,nagrywając koncert na orkiestrę i grupę rockową
5739,Pytanie: W jakim języku pisał Agatiasz?; konte...,attyckim,1.0,attyckim
5748,Pytanie: Na jakim stanowisku pracował w klinic...,młodszego rezydenta,1.0,młodszego rezydenta
5755,Pytanie: Kto stanął na czele nowo powstałej ra...,Walancina Żukouska,1.0,Walancina Żukouska


In [127]:
random_ids = []
for i in range(10):
    random_id = random.randint(0, len(qa)-1)
    random_ids.append(random_id)

In [167]:
for i, id in enumerate(random_ids):
    item = qa.iloc[id]  # Zakładam, że `qa` to DataFrame pandas
    question = item['question'].split(";")[0]  # Bierzemy tylko część przed średnikiem
    print(i+1, question.strip(), "\n")  # Usuń zbędne spacje, jeśli są
    print("GENERATED ANSWER: ", item['generated_answer'])
    print("CORRECT ANSWER: ", item['answer'], "\n")

1 pytanie: Jakim warunkom powinny odpowiadać przekazywane gminie urządzenia wodociągowe? 

GENERATED ANSWER:  technicznym określonym w odrębnych przepisach
CORRECT ANSWER:  Przekazywane urządzenia powinny odpowiadać warunkom technicznym określonym w odrębnych przepisach. 

2 pytanie: Czy za użytkowanie wieczyste gruntów przez zakłady charytatywno-opiekuńcze pobiera się opłaty? 

GENERATED ANSWER:  nie
CORRECT ANSWER:  Nie. 

3 pytanie: Co robi się z każdorazowym wypadkiem zastosowania tymczasowego aresztowania wobec obywatela państwa obcego? 

GENERATED ANSWER:  zawiadamia się niezwłocznie właściwy miejscowo urząd konsularny tego państwa - lub w braku takiego urzędu - przedstawicielstwo dyplomatyczne tego państwa
CORRECT ANSWER:  W przypadku każdorazowego wypadku zastosowania tymczasowego aresztowania wobec obywatela państwa obcego, należy zawiadomić konsula tego państwa o fakcie zastosowania aresztowania (zgodnie z ustawą o postępowaniu wobec cudzoziemców). 

4 pytanie: Na jakie cele 

We can see that the model is capable of answering questions, but it does so in a minimalist and straightforward manner. It operates like an Extractive QA system because it doesn’t provide full-sentence answers; instead, it returns only the most relevant fragments from the context provided for the question. Generally, it answers correctly, although it does make mistakes (for example, it gave incorrect answers to questions 6 and 7, even though the answer is a single word).

## Questions

### 1. Does the performance on the validation dataset reflects the performance on your test set?

The performance on the test and validation datasets present as follows:

| **Dataset** | **Metric** | **Score (%)** |
|------------|------------|---------------|
| Test | Exact Match (EM) | 24.6696          |
| Test | F1 Score        | 43.8787          |
| Validation | Exact Match (EM) | 42.3144         |
| Validation | F1 Score        | 59.7899         |

In my opiniom the differences are significant enough that it is not possible to definitively state that the validation set adequately reflects the model's later performance. 

### 2. What are the outcomes of the model on your test questions? Are they satisfying? If not, what might be the reason for that?

The model performs quite well. Although the metric values are not very high, we must remember that this is not a typical classification task, and the way the metrics are calculated heavily depends on the specific answer someone considers correct. In fact, we could equally well phrase a correct answer in a slightly different way, and it would not necessarily have an F1-score of 100%. The model doesn't generate full-sentence answers but instead extracts key fragments from the context. It occasionally makes mistakes, but in my opinion, this model is already on the edge of being usable for cases where we don't need to be absolutely certain about the information we obtain. The observed differences between datasets may result from different grammatical structures in the questions, or from more diverse questions in the test dataset, even though they concern the same topic.

### 3. Why extractive question answering is not well suited for inflectional languages?

In inflectional languages, the issue lies in the quality control of answers. When comparing individual tokens, we may fail to match the same word in a different form, which means we won't consider it correct. Additionally, in such languages, changes in grammatical forms can significantly impact meaning, making it more difficult to correctly match answers in extractive QA tasks.