In [1]:
from elasticsearch import Elasticsearch
from tqdm import tqdm
from collections import defaultdict
from elasticsearch.helpers import streaming_bulk
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModel
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Dataset

In [2]:
with open('../Data/task2_questions_with_answers.tsv', 'r', encoding='UTF-8') as f:
    question_answers = []
    for line in f:
        splitted = line.strip().split("\t")
        question_answers.append((splitted[0], splitted[1:]))

In [90]:
questions = [qa[0] for qa in question_answers]

# Pretrained models

In [80]:
yes_no_model = AutoModelWithLMHead.from_pretrained('flax-community/papuGaPT2')
yes_no_tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')
# yes_no_tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-large-cased")
# yes_no_model = AutoModelWithLMHead.from_pretrained("allegro/herbert-large-cased")
_ = yes_no_model.eval()



In [81]:
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

In [5]:
qa_pipeline_herbert = pipeline(
    "question-answering",
    model='azwierzc/herbert-large-poquad', handle_impossible_answer=True,
    tokenizer='azwierzc/herbert-large-poquad'
)

def get_answer_herbert(context, question):
    return qa_pipeline_herbert({
        'context': context,
        'question': question})

In [104]:
# # PLT5 Large is about 3x slower than PLT5 Base
# plt5_tokenizer = AutoTokenizer.from_pretrained("azwierzc/plt5-large-poquad")
# plt5_model = T5ForConditionalGeneration.from_pretrained("azwierzc/plt5-large-poquad")
# _ = plt5_model.eval()
plt5_tokenizer = AutoTokenizer.from_pretrained("azwierzc/plt5-base-poquad")
plt5_model = T5ForConditionalGeneration.from_pretrained("azwierzc/plt5-base-poquad")
_ = plt5_model.eval()

Downloading:   0%|          | 0.00/407 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/802 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

In [71]:
def get_answer_plt5(context, question):
    query = plt5_tokenizer(f"question: {question}, context: {context}", 
                           max_length=512, truncation=True, return_tensors="pt")
    output = plt5_model.generate(**query, max_new_tokens=8, num_beams=5, return_dict_in_generate=True, output_scores=True)
    decoded_output = plt5_tokenizer.decode(output.sequences[0], skip_special_tokens=True)
    score = np.exp(output.sequences_scores.numpy()[0])
    return {'score': score, 'answer': decoded_output}

# Connect to elasticsearch

In [None]:
# Password for the 'elastic' user generated by Elasticsearch
ELASTIC_PASSWORD = YOUR_ELASTICSEARCH_PASSWORD
# es_path = "~/Documents/UWr/Chatbots/elasticsearch-8.4.3/"
es_path = "C:/Users/jakub/elasticsearch-8.5.3-windows-x86_64/elasticsearch-8.5.3/"

# Create the client instance
client = Elasticsearch(
    "https://localhost:9200",
    ca_certs=es_path+"config/certs/http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD)
)

In [5]:
# Check if you can connect to ES (bool)
client.ping()

True

# Index documents from wikipedia paragraphs

In [12]:
DATASET_FILE = "../Data/fp_wiki.txt"

In [13]:
def generate_actions():
    with open(DATASET_FILE, "r", encoding="UTF-8") as file:
        # Read the first line of the file
        title_line = file.readline()
        # Create a variable to store the ID of the next document
        next_id = 0
        # Keep reading lines until the end of the file is reached
        while title_line:
            # Check if the line starts with "Title: "
            if title_line.startswith("TITLE: "):
                # Get the title by stripping the "Title: " prefix and the newline character at the end
                title = title_line.lstrip("TITLE:").strip()
                # Read the second line of the file, which should be the title again
                title_line = file.readline().strip()
                # Save title for later usage
                title = title_line
                # Create a list to store the lines of the article
                article_lines = []
                # Read the next line, which should be the start of the article
                article_line = file.readline()
                # Keep reading lines until an empty line is reached
                while article_line.strip():
                    # Add the line to the list of article lines
                    article_lines.append(article_line)
                    # Read the next line
                    article_line = file.readline()
                # Join the lines of the article with newline characters to create the article
                article = "\n".join(article_lines) if article_lines else ""
                # Create a dictionary for the document
                document = {"_id": next_id, "title": title, "article": article}
                # Yield new document
                yield document
                # Increment the ID for the next document
                next_id += 1
                # Read the next line, which should be the start of the next document
                title_line = file.readline()

In [15]:
index_name = "offline_competition"

In [16]:
configurations = {
    "settings": {
        "analysis": {
            "analyzer": {
                "lang_pl_morfologik": { 
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "morfologik_stem"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "id": {"type": "long"},
            "article": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            },
            "title": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            }
        }
    }
}

In [17]:
client.options(ignore_status=[400,404]).indices.delete(index=index_name)

client.indices.create(
    index=index_name,
    settings=configurations["settings"],
    mappings=configurations["mappings"],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'offline_competition'})

In [18]:
print("Indexing documents...")
number_of_docs=1209001
progress = tqdm(unit="docs", total=number_of_docs)
successes = 0
for ok, action in streaming_bulk(
    client=client, index=index_name, actions=generate_actions(),
):
    progress.update(1)
    successes += ok
print("Indexed %d/%d documents" % (successes, number_of_docs))

Indexing documents...


100%|███████████████████████████████████████████████████████████████████▉| 1207501/1209001 [04:36<00:00, 4717.47docs/s]

Indexed 1208362/1209001 documents


100%|███████████████████████████████████████████████████████████████████▉| 1208362/1209001 [04:50<00:00, 4717.47docs/s]

# Answer questions

In [75]:
def retriever(question, index_name="offline_competition", k=3):
    resp = client.search(index=index_name, 
                     query={'match': {
                         "article": question
                     }})
    best_documents = list(sorted(resp['hits']['hits'], key=lambda k: k['_score'], reverse=True))
    context = ""
    for document in best_documents[:k]:
        if document['_source']['title'].lower() not in document['_source']['article'].lower():
            context += document['_source']['title'] + " . "
        context += document['_source']['article'] + "\n"
    return context

## Herbert

In [None]:
k = 5

In [24]:
with open(f'HerbertAnswers/offline_competition_k_{k}.txt', 'a', encoding='UTF-8') as f:
    with open(f'HerbertAnswers/offline_competition_k_{k}_with_confidence.txt', 'a', encoding='UTF-8') as f_confidence:
        for question, _ in tqdm(question_answers):
            context = retriever(question, index_name="offline_competition", k=k)
            result = get_answer_herbert(context, question)
            predicted_answer = result['answer'].replace("\n", " ").replace("\t", " ").strip()
            confidence = result['score'] if predicted_answer != "" else 0.0
            f.write(predicted_answer + "\n")
            f_confidence.write(f"{predicted_answer}\t{confidence}\n")        

100%|████████████████████████████████████████████████████████████████████████████| 3500/3500 [3:04:14<00:00,  3.16s/it]


In [33]:
with open(f'HerbertAnswers/offline_competition_k_{k}.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [34]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [35]:
!python advent_answer_check.py

TOTAL SCORE: 0.21114285714285713


## PLT5

In [105]:
k = 5

In [106]:
with open(f'PLT5BaseAnswers/offline_competition_k_{k}.txt', 'a', encoding='UTF-8') as f:
    with open(f'PLT5BaseAnswers/offline_competition_k_{k}_with_confidence.txt', 'a', encoding='UTF-8') as f_confidence:
        for question, _ in tqdm(question_answers):
            context = retriever(question, index_name="offline_competition", k=k)
            result = get_answer_plt5(context, question)
            predicted_answer = result['answer'].replace("\n", " ").replace("\t", " ").strip()
            confidence = result['score'] if predicted_answer != "" else 0.0
            f.write(predicted_answer + "\n")
            f_confidence.write(f"{predicted_answer}\t{confidence}\n")        

100%|████████████████████████████████████████████████████████████████████████████| 3500/3500 [2:06:52<00:00,  2.17s/it]


In [107]:
with open(f'PLT5BaseAnswers/offline_competition_k_{k}.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [110]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestA_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [111]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [112]:
!python advent_answer_check.py

TOTAL SCORE: 0.3485714285714286


# Join Herbert and PLT5 answers

## Herbert > PLT5

Worse than pure PLT5

In [114]:
with open(f'PLT5BaseAnswers/offline_competition_k_5.txt', 'r', encoding='UTF-8') as f_plt5:
    with open(f'HerbertAnswers/offline_competition_k_3.txt', 'r', encoding='UTF-8') as f_herbert:
        with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_PLT5_if_no_Herbert_answer.txt', 'a',
                  encoding='UTF-8') as f_combined:
            for herbert_answer, plt5_answer in zip(f_herbert, f_plt5):
                herbert_answer = herbert_answer.strip()
                plt5_answer = plt5_answer.strip()
                if herbert_answer != "":
                    f_combined.write(herbert_answer + "\n")
                else:
                    f_combined.write(plt5_answer + "\n")

In [115]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_PLT5_if_no_Herbert_answer.txt', 'r',
          encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [116]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [117]:
!python advent_answer_check.py

TOTAL SCORE: 0.3048571428571429


## PLT5 > Herbert

Better than pure PLT5 and Herbert

In [119]:
with open(f'PLT5BaseAnswers/offline_competition_k_5.txt', 'r', encoding='UTF-8') as f_plt5:
    with open(f'HerbertAnswers/offline_competition_k_3.txt', 'r', encoding='UTF-8') as f_herbert:
        with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer.txt', 'a',
                  encoding='UTF-8') as f_combined:
            for herbert_answer, plt5_answer in zip(f_herbert, f_plt5):
                herbert_answer = herbert_answer.strip()
                plt5_answer = plt5_answer.strip()
                if plt5_answer != "":
                    f_combined.write(plt5_answer + "\n")
                else:
                    f_combined.write(herbert_answer + "\n")

#### Test A

In [124]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer.txt', 'r',
          encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [125]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestA_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [126]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [127]:
!python advent_answer_check.py

TOTAL SCORE: 0.3534285714285714


#### Test A'

In [172]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer.txt', 'r',
          encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        index = 0
        for line in f:
            if index % 5 == 0:
                f_answers.write(line)
            index += 1

In [173]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestAprime_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [174]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    index = 0
    for _, answers in question_answers:
        if index % 5 == 0:
            f.write("\t".join(answers) + "\n")
        index += 1

In [175]:
!python advent_answer_check.py

TOTAL SCORE: 0.3585714285714286


------------------------------------------------------------

------------------------------------------------------------

------------------------------------------------------------

------------------------------------------------------------

#  Modifications below didn't improve score of combined PLT5 and Herbert  answers!

## Handle special type of questions

In [128]:
questions = [question for question, _ in question_answers]

## Yes/No questions

In [134]:
def is_yes_no_question(question):
    return question[:4] == "Czy " and " czy " not in question[4:]

In [135]:
def get_sentence_prob(text):
    input_ids = torch.tensor(yes_no_tokenizer.encode(text)).unsqueeze(0)
    with torch.no_grad():
        outputs = yes_no_model(input_ids, labels=input_ids)
    loss, logits = outputs[:2]
    sentence_prob = loss.item()
    return sentence_prob

In [136]:
def answer_yes_no_question(question):
    yes_sentence_prob = get_sentence_prob(question + " Tak")
    no_sentence_prob = get_sentence_prob(question + " Nie")
    return "tak" if yes_sentence_prob > no_sentence_prob else "nie"

In [137]:
def is_yes_no_answer(answer):
    answer = answer.strip().lower()
    return answer == "tak" or answer == "nie"

In [142]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer.txt', 'r',
          encoding='UTF-8') as f:
    with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer_yes_no.txt', 'w',
              encoding='UTF-8') as f_answers:
        raw_answers = []
        for line in f:
            raw_answers.append(line.strip())
        for question, raw_answer in zip(questions, raw_answers):
            if is_yes_no_question(question) and not is_yes_no_answer(raw_answer):
                f_answers.write(answer_yes_no_question(question) + "\n")
            else:
                f_answers.write(raw_answer + "\n")

#### Test A

In [143]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer_yes_no.txt', 'r',
          encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [15]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestA_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [144]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [145]:
!python advent_answer_check.py

TOTAL SCORE: 0.3534285714285714


#### Test A'

In [168]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer_yes_no.txt', 'r',
          encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        index = 0
        for line in f:
            if index % 5 == 0:
                f_answers.write(line)
            index += 1

In [169]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestAprime_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [170]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    index = 0
    for _, answers in question_answers:
        if index % 5 == 0:
            f.write("\t".join(answers) + "\n")
        index += 1

In [171]:
!python advent_answer_check.py

TOTAL SCORE: 0.3585714285714286


## Optional questions

In [152]:
def is_optional_question(question):
    to_pointer = question.find(" to ")
    if to_pointer == -1:
        return False
    czy_part = question[to_pointer:]
    czy_pointer = czy_part.find(" czy ")
    return czy_pointer != -1

In [153]:
def get_term_and_options(question):
    if (dot_pointer := question.find(".")) != -1:
        question = question[dot_pointer:]
    if question[-1] == "?":
        question = question[:-1]
    if question.lower()[:3] == 'czy':
        question = question[3:].strip()
    term, options = question.split(' to ')
    options = options.split(' czy ')
    first_options = options[0]
    first_options = first_options.split(",")
    first_options.extend(options[1:])
    return term.strip(), [opt.strip() for opt in first_options if opt.strip() != ""]

In [154]:
def answer_optional_question(question):
    term, options = get_term_and_options(question)
    options_embeddings = embedding_model.encode(options)
    term_embedding = embedding_model.encode(term)
    similarity = np.array([distance.cosine(term_embedding, emb) for emb in options_embeddings])
    return options[np.argmin(similarity)]

In [155]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer_yes_no.txt', 'r',
          encoding='UTF-8') as f:
    with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer_yes_no_optional.txt', 'w',
              encoding='UTF-8') as f_answers:
        raw_answers = []
        for line in f:
            raw_answers.append(line.strip())
        for question, raw_answer in zip(questions, raw_answers):
            if not is_yes_no_question(question) and is_optional_question(question):
                f_answers.write(answer_optional_question(question) + "\n")
            else:
                f_answers.write(raw_answer + "\n")

#### Test A

In [156]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer_yes_no_optional.txt', 'r',
          encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [97]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestA_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [157]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [158]:
!python advent_answer_check.py

TOTAL SCORE: 0.35


#### Test A'

In [159]:
with open(f'HerbertAndPLT5Answers/offline_competition_answer_with_Herbert_if_no_PLT5_answer_yes_no_optional.txt', 'r',
          encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        index = 0
        for line in f:
            if index % 5 == 0:
                f_answers.write(line)
            index += 1

In [101]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestAprime_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [160]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    index = 0
    for _, answers in question_answers:
        if index % 5 == 0:
            f.write("\t".join(answers) + "\n")
        index += 1

In [161]:
!python advent_answer_check.py

TOTAL SCORE: 0.3557142857142857
