In [1]:
import os
from collections import defaultdict

import numpy as np
import torch
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from transformers import (
    AutoModel,
    AutoModelWithLMHead,
    AutoTokenizer,
    T5ForConditionalGeneration,
    pipeline,
)

# Dataset

In [2]:
# Parse the tab-separated question file into (question, [answers...]) pairs:
# the first column of each row is the question, every remaining column an answer.
with open('../Data/task2_questions_with_answers.tsv', 'r', encoding='UTF-8') as tsv_file:
    split_rows = (raw_row.strip().split("\t") for raw_row in tsv_file)
    question_answers = [(columns[0], columns[1:]) for columns in split_rows]

In [3]:
# Just the question texts, in file order.
questions = [question_text for question_text, _answers in question_answers]

# Pretrained models

In [2]:
# Language model and tokenizer loaded from the same checkpoint.
# Alternative checkpoint tried earlier: "allegro/herbert-large-cased"
# (for both the tokenizer and the model).
# NOTE(review): neither `yes_no_model` nor `yes_no_tokenizer` is referenced again
# in the visible part of this notebook — confirm they are still needed.
YES_NO_CHECKPOINT = 'flax-community/papuGaPT2'
yes_no_tokenizer = AutoTokenizer.from_pretrained(YES_NO_CHECKPOINT)
yes_no_model = AutoModelWithLMHead.from_pretrained(YES_NO_CHECKPOINT)
# Switch to inference mode; assigning to `_` keeps the model repr out of the cell output.
_ = yes_no_model.eval()



In [7]:
# Load the 'paraphrase-multilingual-mpnet-base-v2' SentenceTransformer checkpoint
# (downloaded on first use, see the progress bars below).
# NOTE(review): `embedding_model` is not referenced again in the visible part of
# this notebook — confirm it is still needed.
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [4]:
# Question-answering pipeline backed by the `azwierzc/herbert-large-poquad` checkpoint.
# The checkpoint is addressed by its Hub id and pinned to the exact commit that was
# previously referenced through a user-specific cache directory, so the cell works on
# any machine (an already-populated local cache is reused automatically).
HERBERT_QA_CHECKPOINT = "azwierzc/herbert-large-poquad"
HERBERT_QA_REVISION = "5d064fc025bb974f06b830016dc98ac130dc4ad0"

qa_pipeline_herbert = pipeline(
    "question-answering",
    model=HERBERT_QA_CHECKPOINT,
    tokenizer=HERBERT_QA_CHECKPOINT,
    revision=HERBERT_QA_REVISION,
    # Allow the pipeline to return an empty answer; the answer loop below maps an
    # empty answer to confidence 0.0.
    handle_impossible_answer=True,
)

def get_answer_herbert(context, question):
    """Run the HerBERT question-answering pipeline on one (context, question) pair.

    Args:
        context: Text in which the answer is searched.
        question: The question to answer.

    Returns:
        Whatever `qa_pipeline_herbert` returns for this input; callers in this
        notebook read its 'answer' and 'score' entries.
    """
    qa_input = {'context': context, 'question': question}
    return qa_pipeline_herbert(qa_input)

In [4]:
# PLT5 checkpoint used for generative question answering.
# Checkpoints are addressed by Hub id and pinned to a commit instead of a
# user-specific cache directory, so the cell runs on any machine.
#
# PLT5 Large is about 3x slower than PLT5 Base; to use it instead, set:
# PLT5_CHECKPOINT = "azwierzc/plt5-large-poquad"
# PLT5_REVISION = "43f9f31f22f446037f537b0d03f1e080c44f0438"
PLT5_CHECKPOINT = "azwierzc/plt5-base-poquad"
PLT5_REVISION = "e7046f4bd71217af7b189288651f9e3b25cd4e21"

plt5_tokenizer = AutoTokenizer.from_pretrained(PLT5_CHECKPOINT, revision=PLT5_REVISION)
plt5_model = T5ForConditionalGeneration.from_pretrained(PLT5_CHECKPOINT, revision=PLT5_REVISION)
# Switch to inference mode; assigning to `_` keeps the model repr out of the cell output.
_ = plt5_model.eval()

In [5]:
def get_answer_plt5(context, question):
    """Generate an answer to `question` from `context` with the PLT5 model.

    The prompt is tokenized with truncation at 512 tokens, then decoded with
    beam search (5 beams, at most 8 new tokens).

    Args:
        context: Text in which the answer is searched.
        question: The question to answer.

    Returns:
        dict with 'answer' (decoded text of the first returned sequence) and
        'score' (exponential of that sequence's beam score).
    """
    prompt = f"question: {question}, context: {context}"
    encoded_prompt = plt5_tokenizer(
        prompt,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    generation = plt5_model.generate(
        **encoded_prompt,
        max_new_tokens=8,
        num_beams=5,
        return_dict_in_generate=True,
        output_scores=True,
    )
    answer_text = plt5_tokenizer.decode(generation.sequences[0], skip_special_tokens=True)
    # Presumably a log-scale beam score, hence the exponentiation — confirm against
    # the generate() documentation.
    beam_scores = generation.sequences_scores.numpy()
    return {'score': np.exp(beam_scores[0]), 'answer': answer_text}

In [6]:
# Token-level NER pipeline; `get_ners` merges its per-token output into entity strings.
NER_CHECKPOINT = 'clarin-pl/FastPDN'
ner_classifier = pipeline("ner", model=NER_CHECKPOINT, tokenizer=NER_CHECKPOINT)

# Connect to Elasticsearch

In [7]:
# Password for the 'elastic' user generated by Elasticsearch.
# It is read from the environment so that no secret lives in the notebook.
# NOTE(review): a password was previously committed in this cell — rotate it.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD")
if not ELASTIC_PASSWORD:
    raise RuntimeError(
        "Set the ELASTIC_PASSWORD environment variable to the password of the 'elastic' user."
    )

# Root of the local Elasticsearch installation (used to locate the HTTP CA certificate).
# Override with the ES_HOME environment variable on other machines.
es_path = os.environ.get(
    "ES_HOME",
    "C:/Users/jakub/elasticsearch-8.5.3-windows-x86_64/elasticsearch-8.5.3/",
)

# Create the client instance
client = Elasticsearch(
    "https://localhost:9200",
    # os.path.join works whether or not es_path ends with a separator.
    ca_certs=os.path.join(es_path, "config/certs/http_ca.crt"),
    basic_auth=("elastic", ELASTIC_PASSWORD),
)

In [8]:
# Connectivity sanity check: the cell output shows the boolean returned by ping()
# (True in the recorded run). Run this before the indexing / retrieval cells.
client.ping()

True

# Index documents from Wikipedia paragraphs

In [9]:
# Text dump that generate_actions() parses into documents for indexing.
DATASET_FILE = "../Data/fp_wiki.txt"

In [10]:
def generate_actions(path=None):
    """Yield one Elasticsearch bulk action per article found in the dataset file.

    The file is parsed as a sequence of records, each laid out as:
      1. a header line starting with ``TITLE: ``,
      2. a line holding the title (this line, stripped, is what gets indexed),
      3. the article lines, terminated by a blank line or end of file.

    Lines outside a record that do not start with ``TITLE: `` are skipped.
    (Previously such a line was never consumed, so the loop spun forever.)

    Args:
        path: File to parse. Defaults to the module-level ``DATASET_FILE``.

    Yields:
        dict with ``_id`` (0-based running counter), ``title`` and ``article``.
    """
    if path is None:
        path = DATASET_FILE
    # ID of the next document to emit.
    next_id = 0
    with open(path, "r", encoding="UTF-8") as file:
        title_line = file.readline()
        # readline() returns "" only at end of file.
        while title_line:
            if not title_line.startswith("TITLE: "):
                # Not a record header: advance, otherwise the loop never terminates.
                title_line = file.readline()
                continue
            # The line after the header repeats the title; that copy is the one stored.
            title = file.readline().strip()
            article_lines = []
            article_line = file.readline()
            # Collect lines until a blank line (or end of file) closes the article.
            while article_line.strip():
                article_lines.append(article_line)
                article_line = file.readline()
            # Each collected line still ends with its own newline, so joining with
            # "\n" leaves an empty line between them. Kept as-is so that re-indexed
            # text matches what was indexed before.
            article = "\n".join(article_lines)
            yield {"_id": next_id, "title": title, "article": article}
            next_id += 1
            # Next line should be the header of the following record.
            title_line = file.readline()

In [11]:
# Name of the Elasticsearch index that the delete/create and bulk-indexing cells operate on.
# The retrieval functions use the same string as the default of their `index_name` parameter.
index_name = "offline_competition"

In [12]:
# Index settings and mappings passed to `client.indices.create` below.
configurations = {
    "settings": {
        "analysis": {
            "analyzer": {
                # Custom analyzer: standard tokenizer followed by the `morfologik_stem` filter.
                # NOTE(review): `morfologik_stem` is presumably provided by an Elasticsearch
                # Morfologik analysis plugin — confirm it is installed on the cluster.
                "lang_pl_morfologik": { 
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "morfologik_stem"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            # NOTE(review): documents yielded by generate_actions carry `_id`, `title`
            # and `article` only; this `id` field looks unused — confirm.
            "id": {"type": "long"},
            # Both text fields are analyzed with the custom analyzer defined above.
            "article": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            },
            "title": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            }
        }
    }
}

In [13]:
# Recreate the index from scratch. The delete call goes through a client configured
# to ignore 400/404 responses, so a missing index does not abort the cell.
lenient_client = client.options(ignore_status=[400, 404])
lenient_client.indices.delete(index=index_name)

index_settings = configurations["settings"]
index_mappings = configurations["mappings"]
client.indices.create(index=index_name, settings=index_settings, mappings=index_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'offline_competition'})

In [14]:
# Bulk-index every document produced by generate_actions(), counting successes.
print("Indexing documents...")
# Expected number of documents; only used as the progress-bar total and in the summary line.
number_of_docs = 1209001
progress = tqdm(total=number_of_docs, unit="docs")
# streaming_bulk is lazy: documents are sent as the loop below consumes results.
bulk_results = streaming_bulk(
    client=client,
    index=index_name,
    actions=generate_actions(),
)
successes = 0
for ok, action in bulk_results:
    successes += ok
    progress.update(1)
print("Indexed %d/%d documents" % (successes, number_of_docs))

Indexing documents...


  0%|          | 0/1209001 [00:00<?, ?docs/s]

Indexed 1208362/1209001 documents


# Answer questions

In [9]:
def retriever(question, index_name="offline_competition", k=3):
    """Build a QA context from the `k` best-scoring documents for `question`.

    The question is matched against the `title` and `article` fields with a
    `combined_fields` query.

    Args:
        question: Query text.
        index_name: Elasticsearch index to search.
        k: Number of documents to include (expected to be non-negative).

    Returns:
        The articles of the top `k` hits, each terminated by a newline. An
        article is prefixed with "<title> . " when the title does not already
        occur (case-insensitively) in the article text.
    """
    q = {
        "combined_fields": {
            "query": question,
            "fields": ["title", "article"],
        }
    }
    # Request exactly k hits: without `size` Elasticsearch returns at most 10,
    # which silently capped any k > 10.
    resp = client.search(index=index_name, query=q, size=k)
    best_documents = sorted(resp['hits']['hits'], key=lambda hit: hit['_score'], reverse=True)
    context = ""
    for document in best_documents[:k]:
        source = document['_source']
        if source['title'].lower() not in source['article'].lower():
            context += source['title'] + " . "
        context += source['article'] + "\n"
    return context

In [25]:
def retriever_ner(question, ners, index_name="offline_competition", k=3):
    """Build a QA context from the `k` best documents that mention the given entities.

    Query shape:
      * must: a nested bool whose `should` clauses are phrase `multi_match`
        queries, one per entity, over `title` (boosted x2) and `article`.
        `minimum_should_match` "1<50%" makes a single entity clause mandatory
        and requires 50% of the clauses when there are more.
      * should: a `match` of the whole question against `article`, which only
        contributes to scoring.

    Alternatives tried earlier for the scoring clause: additionally matching the
    question against `title` with boost 2, or a `combined_fields` query over
    `title^2` and `article`.

    Args:
        question: Query text.
        ners: Iterable of entity strings. A single string is treated as one
            entity (previously it was iterated character by character).
        index_name: Elasticsearch index to search.
        k: Number of documents to include (expected to be non-negative).

    Returns:
        The articles of the top `k` hits, each terminated by a newline. An
        article is prefixed with "<title> . " when the title does not already
        occur (case-insensitively) in the article text.
    """
    if isinstance(ners, str):
        ners = [ners]
    entity_clauses = [
        {"multi_match": {
            "query": ner,
            "fields": ["title^2", "article"],
            "type": "phrase",
        }}
        for ner in ners
    ]
    q = {
        "bool": {
            "should": {"match": {"article": question}},
            "must": {
                "bool": {
                    "should": entity_clauses,
                    "minimum_should_match": "1<50%",
                }
            },
        }
    }
    # Request exactly k hits: without `size` Elasticsearch returns at most 10,
    # which silently capped any k > 10.
    resp = client.search(index=index_name, query=q, size=k)
    best_documents = sorted(resp['hits']['hits'], key=lambda hit: hit['_score'], reverse=True)
    context = ""
    for document in best_documents[:k]:
        source = document['_source']
        if source['title'].lower() not in source['article'].lower():
            context += source['title'] + " . "
        context += source['article'] + "\n"
    return context

In [139]:
# Sample question used by the two exploratory retrieval cells that follow.
# The answer-generation loops further down rebind `question`.
question = "Kto był najlepszym przyjacielem Achillesa i uczestnikiem wojny trojańskiej; zginął z rąk Hektora?"

In [140]:
# Exploratory check of NER-guided retrieval for the sample question.
# Requires `get_ners`, which is defined in a later cell of this notebook, to have been run.
ners, ner_successful = get_ners(question, k=1)
# On failure get_ners returns the raw question string instead of a list of entities,
# so fall back to plain retrieval rather than passing that string on as `ners`.
retriever_ner(question, ners) if ner_successful else retriever(question)

"Hektor ( syn Priama ) . Hektor ( gr . Hektōr , łac . Hector ) – w mitologii greckiej królewicz i najdzielniejszy bohater trojański ; bohater `` Iliady '' Homera .\n\nUchodził za syna Priama i Hekabe ( Hekuba ) . Był bratem Parysa , Deifoba i Kasandry oraz mężem Andromachy .\n\nPoczątkowo był przeciwnikiem wybuchu wojny o Helenę .\n\nW czasie wojny trojańskiej zabił Patroklosa ( myśląc że to Achilles , gdyż miał na sobie jego zbroję ) , za co w odwecie zginął z ręki Achillesa pod murami Troi . Nawoływania rodziny Hektora do jego powrotu do twierdzy opisuje `` Lament Andromachy '' autorstwa Homera .\n\nPatroklos , Patrokles ( gr . `` Pátroklos '' , `` Patroklḗs '' , łac . `` Patroclus '' ) – w mitologii greckiej syn Menojtiosa , ukochany przyjaciel Achillesa . Patroklos zamieszkał na dworze ojca Achillesa , Peleusa , po tym jak zmuszony został do ucieczki z rodzinnego domu z powodu zabójstwa , którego dopuścił się w gniewie podczas gry w kości .\n\nU boku Achillesa walczył w wojnie troj

In [148]:
# Exploratory: run the same combined_fields query that `retriever` builds and
# inspect the raw hits (ids, scores, sources).
q = {"combined_fields": {"query": question, "fields": ["title", "article"]}}
resp = client.search(index="offline_competition", query=q)
resp['hits']['hits']

[{'_index': 'offline_competition',
  '_id': '40272',
  '_score': 57.786007,
  '_source': {'title': 'Patroklos',
   'article': "Patroklos , Patrokles ( gr . `` Pátroklos '' , `` Patroklḗs '' , łac . `` Patroclus '' ) – w mitologii greckiej syn Menojtiosa , ukochany przyjaciel Achillesa . Patroklos zamieszkał na dworze ojca Achillesa , Peleusa , po tym jak zmuszony został do ucieczki z rodzinnego domu z powodu zabójstwa , którego dopuścił się w gniewie podczas gry w kości .\n\nU boku Achillesa walczył w wojnie trojańskiej . Zginął w natarciu na Troję z ręki Hektora , gdy nosił zbroję Achillesa , który wycofał się z walki . Pragnąc pomścić jego śmierć , Achilles postanowił zabić Hektora w pojedynku .\n"}},
 {'_index': 'offline_competition',
  '_id': '26379',
  '_score': 56.59058,
  '_source': {'title': 'Hektor ( syn Priama )',
   'article': "Hektor ( gr . Hektōr , łac . Hector ) – w mitologii greckiej królewicz i najdzielniejszy bohater trojański ; bohater `` Iliady '' Homera .\n\nUchodzi

In [11]:
def get_ners(question, k=1, verbose=False):
    """Extract named-entity strings from `question` using `ner_classifier`.

    Consecutive classifier tokens are concatenated into one entity. A new entity
    starts at a token whose tag begins with "B" and whose start offset differs
    from the previous token's end offset. The "</w>" marker in a token is
    replaced by a space.

    Args:
        question: Text to analyse.
        k: Minimum number of entities required for success.
        verbose: When true, print every raw token classification.

    Returns:
        (entities, True) when at least `k` entities were found, otherwise
        (question, False). Note that the first element is then the original
        string, not a list.
    """
    tokens = ner_classifier(question)
    if verbose:
        print(*tokens, sep="\n")
    if not tokens:
        return question, False

    entities = []
    pieces = []
    # Seeded with the first token's start so the first token never opens a second entity.
    previous_end = tokens[0]["start"]
    for token in tokens:
        detached_begin = token["start"] != previous_end and token["entity"][0] == "B"
        if detached_begin:
            entities.append("".join(pieces).strip())
            pieces = []
        pieces.append(token["word"].replace("</w>", " "))
        previous_end = token["end"]
    # Flush the entity still being assembled.
    entities.append("".join(pieces).strip())

    return (entities, True) if len(entities) >= k else (question, False)

In [22]:
# Demo: with verbose=True the raw token classifications are printed, then the
# merged entities are returned (see the output below: sub-word pieces are joined).
get_ners("Kto napisał „Błagalnice” i „Persów”?", verbose=True)

{'entity': 'B-nam_pro_title', 'score': 0.9894637, 'index': 4, 'word': 'Bła', 'start': 13, 'end': 16}
{'entity': 'B-nam_pro_title', 'score': 0.985679, 'index': 5, 'word': 'gal', 'start': 16, 'end': 19}
{'entity': 'B-nam_pro_title', 'score': 0.9874556, 'index': 6, 'word': 'nice</w>', 'start': 19, 'end': 23}
{'entity': 'B-nam_pro_title', 'score': 0.99123126, 'index': 10, 'word': 'Per', 'start': 28, 'end': 31}
{'entity': 'B-nam_pro_title', 'score': 0.9807936, 'index': 11, 'word': 'sów</w>', 'start': 31, 'end': 34}


(['Błagalnice', 'Persów'], True)

In [23]:
# Demo: a multi-word entity followed by a separate one; the output below shows
# how the tokens are merged into two entity strings.
get_ners("Czy Ulrich von jungingen został oskalpowany w Malborku?", verbose=True)

{'entity': 'B-nam_liv_person', 'score': 0.99849844, 'index': 2, 'word': 'Ul', 'start': 4, 'end': 6}
{'entity': 'B-nam_liv_person', 'score': 0.57404584, 'index': 3, 'word': 'rich</w>', 'start': 6, 'end': 10}
{'entity': 'I-nam_liv_person', 'score': 0.999582, 'index': 4, 'word': 'von</w>', 'start': 11, 'end': 14}
{'entity': 'I-nam_liv_person', 'score': 0.99952126, 'index': 5, 'word': 'jun', 'start': 15, 'end': 18}
{'entity': 'I-nam_liv_person', 'score': 0.9994435, 'index': 6, 'word': 'gin', 'start': 18, 'end': 21}
{'entity': 'I-nam_liv_person', 'score': 0.9992925, 'index': 7, 'word': 'gen</w>', 'start': 21, 'end': 24}
{'entity': 'B-nam_loc_gpe_city', 'score': 0.99921834, 'index': 14, 'word': 'Malborku</w>', 'start': 46, 'end': 54}


(['Ulrich von jungingen', 'Malborku'], True)

## Herbert

In [None]:
# Number of top-ranked documents concatenated into the QA context; also part of
# the output file names used by the HerBERT cells below.
k = 5

In [24]:
# Answer every question with HerBERT and record the answers, with and without confidence.
# Both files are opened in append mode, so re-running this cell adds lines to any
# existing output instead of replacing it.
answers_path = f'HerbertAnswers/offline_competition_k_{k}.txt'
confidence_path = f'HerbertAnswers/offline_competition_k_{k}_with_confidence.txt'
with open(answers_path, 'a', encoding='UTF-8') as f, \
        open(confidence_path, 'a', encoding='UTF-8') as f_confidence:
    for question, _ in tqdm(question_answers):
        context = retriever(question, index_name="offline_competition", k=k)
        result = get_answer_herbert(context, question)
        # Keep each answer on one line and free of tabs (tab separates the confidence column).
        predicted_answer = result['answer'].replace("\n", " ").replace("\t", " ").strip()
        if predicted_answer != "":
            confidence = result['score']
        else:
            # An empty answer is recorded with zero confidence.
            confidence = 0.0
        f.write(predicted_answer + "\n")
        f_confidence.write(f"{predicted_answer}\t{confidence}\n")

100%|████████████████████████████████████████████████████████████████████████████| 3500/3500 [3:04:14<00:00,  3.16s/it]


In [33]:
# Copy the HerBERT answers to found_answers.txt (overwriting it).
with open(f'HerbertAnswers/offline_competition_k_{k}.txt', 'r', encoding='UTF-8') as f, \
        open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
    f_answers.writelines(f)

In [34]:
# Write the reference answers, one question per line, alternatives separated by tabs.
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    f.writelines("\t".join(answers) + "\n" for _question, answers in question_answers)

In [35]:
# Run the external scoring script.
# NOTE(review): presumably it compares found_answers.txt with correct_answers.txt
# (both written by the cells above); the script itself is not shown here — confirm.
!python advent_answer_check.py

TOTAL SCORE: 0.21114285714285713


## PLT5

In [12]:
# Number of top-ranked documents concatenated into the QA context; also part of
# the output file names used by the PLT5 cells below.
k = 5

In [26]:
# Answer every 5th question with PLT5, using NER-guided retrieval when entities are found.
# Both files are opened in append mode, so re-running this cell adds lines to any
# existing output instead of replacing it.
answers_path = f'PLT5BaseAnswers/offline_competition_k_{k}_ners_Aprime3.txt'
confidence_path = f'PLT5BaseAnswers/offline_competition_k_{k}_ners_with_confidence_Aprime3.txt'
with open(answers_path, 'a', encoding='UTF-8') as f, \
        open(confidence_path, 'a', encoding='UTF-8') as f_confidence:
    i = 0
    for question, _ in tqdm(question_answers):
        # Subsample: only questions at positions 0, 5, 10, ... are answered. The cell
        # that writes the subsampled correct_answers.txt uses the same rule.
        selected = i % 5 == 0
        i += 1
        if not selected:
            continue
        ners, ner_successful = get_ners(question)
        if ner_successful:
            context = retriever_ner(question, ners, index_name="offline_competition", k=k)
        else:
            # No usable entities: fall back to plain full-text retrieval.
            context = retriever(question, index_name="offline_competition", k=k)
        result = get_answer_plt5(context, question)
        # Keep each answer on one line and free of tabs (tab separates the confidence column).
        predicted_answer = result['answer'].replace("\n", " ").replace("\t", " ").strip()
        if predicted_answer != "":
            confidence = result['score']
        else:
            # An empty answer is recorded with zero confidence.
            confidence = 0.0
        f.write(predicted_answer + "\n")
        f_confidence.write(f"{predicted_answer}\t{confidence}\n")

  0%|          | 0/3500 [00:00<?, ?it/s]

In [27]:
# Copy the PLT5 answers to found_answers.txt (overwriting it).
with open(f'PLT5BaseAnswers/offline_competition_k_{k}_ners_Aprime3.txt', 'r', encoding='UTF-8') as f, \
        open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
    f_answers.writelines(f)

In [110]:
# Copy the current found_answers.txt to FinalAnswers/TestAprime_Offline.txt (overwriting it).
with open('found_answers.txt', 'r', encoding='UTF-8') as f, \
        open('FinalAnswers/TestAprime_Offline.txt', 'w', encoding='UTF-8') as f_answers:
    f_answers.writelines(f)

In [22]:
# Write the reference answers for every 5th question only, using the same
# subsampling rule as the PLT5 answer loop; alternatives are separated by tabs.
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    i = 0
    for _, answers in question_answers:
        selected = i % 5 == 0
        i += 1
        if selected:
            f.write("\t".join(answers) + "\n")

In [16]:
# NER1
# NOTE(review): the label "NER1" is not defined in the visible notebook. This cell
# runs the same command as the other scoring cells, so the reported score depends on
# which answer files were on disk at run time — record the configuration used.
!python advent_answer_check.py

TOTAL SCORE: 0.37714285714285717


In [23]:
# NER2
# NOTE(review): the label "NER2" is not defined in the visible notebook. This cell
# runs the same command as the other scoring cells, so the reported score depends on
# which answer files were on disk at run time — record the configuration used.
!python advent_answer_check.py

TOTAL SCORE: 0.3657142857142857


In [28]:
# NER3
# NOTE(review): the label "NER3" is not defined in the visible notebook. This cell
# runs the same command as the other scoring cells, so the reported score depends on
# which answer files were on disk at run time — record the configuration used.
!python advent_answer_check.py

TOTAL SCORE: 0.36142857142857143


In [112]:
# No NERs
# NOTE(review): presumably the run where retrieval did not use get_ners/retriever_ner.
# The code that produced it is not in the visible notebook — confirm and record it.
!python advent_answer_check.py

TOTAL SCORE: 0.3485714285714286
