In [84]:
from elasticsearch import Elasticsearch
from tqdm import tqdm
from collections import defaultdict
from elasticsearch.helpers import streaming_bulk
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModel

# Dataset

In [3]:
with open('../Data/task2_questions_with_answers.tsv', 'r', encoding='UTF-8') as f:
    question_answers = []
    for line in f:
        splitted = line.strip().split("\t")
        question_answers.append((splitted[0], splitted[1:]))

In [3]:
[question for question, answers in question_answers if "?" != question[-1]]

['Proszę rozwinąć skrót powstałego w 1918 roku urzędu GUS.',
 'Dokończ przysłowie: „pierwsze koty za...”',
 'Proszę podać polską nazwę organizacji gospodarczej o skrócie EFTA.',
 'Podaj wzór chemiczny lodu.',
 'Wskaż przyimek w zdaniu „Idę do szkoły”.',
 'Rozwiń skrót NFZ.',
 'Proszę dokończyć przysłowie: „nosił wilk razy kilka...”',
 'Dokończ: „siła złego na...”',
 'Proszę rozwinąć skrót IPN.',
 'Dokończ przysłowie: „Kazał pan...”',
 'Dokończ przysłowie: wszędzie dobrze, ale...',
 'Proszę podać imię i nazwisko aktora odtwarzającego rolę Lucjana Mostowiaka.',
 'Jak nazywa się okres sprawowania funkcji przez papieża? Słowo pochodzi z łaciny.',
 'Jak nazywa się nadzienie do pierogów albo naleśników? Słowo pochodzi z języka francuskiego.',
 'Rozwiń skrót „CBŚ”.',
 'Proszę podać dokładną datę radzieckiej agresji na Polskę w czasie II wojny światowej.',
 'Rozszyfruj skrót GOPR.',
 'Proszę podać nazwę jednej z najwyższych zapór wodnych świata na rzece Kolorado, niedaleko Las Vegas.',
 'Z któ

# Pretrained models

In [103]:
yes_no_model = AutoModelWithLMHead.from_pretrained('flax-community/papuGaPT2')
yes_no_tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')
# yes_no_tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-large-cased")
# yes_no_model = AutoModelWithLMHead.from_pretrained("allegro/herbert-large-cased")
yes_no_model.eval()



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dro

In [6]:
qa_pipeline = pipeline(
    "question-answering",
    model='azwierzc/herbert-large-poquad', handle_impossible_answer=True,
    tokenizer='azwierzc/herbert-large-poquad'
)

def get_answer(context, question):
    return qa_pipeline({
        'context': context,
        'question': question})['answer'].replace("\n", " ").strip()

Downloading:   0%|          | 0.00/884 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/907k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/556k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.30M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/144 [00:00<?, ?B/s]

# Connect to elasticsearch

In [7]:
# Password for the 'elastic' user generated by Elasticsearch
ELASTIC_PASSWORD = FILL_IN_PASSWORD
# es_path = "~/Documents/UWr/Chatbots/elasticsearch-8.4.3/"
es_path = "C:/Users/jakub/elasticsearch-8.5.3-windows-x86_64/elasticsearch-8.5.3/"

# Create the client instance
client = Elasticsearch(
    "https://localhost:9200",
    ca_certs=es_path+"config/certs/http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD)
)

client.info()
# Should return sth like {'name': 'instance-0000000000', 'cluster_name': ...}

ObjectApiResponse({'name': 'SZCZUPAK', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'jTU4rQqTShilMYzr6mVpvw', 'version': {'number': '8.5.3', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': '4ed5ee9afac63de92ec98f404ccbed7d3ba9584e', 'build_date': '2022-12-05T18:22:22.226119656Z', 'build_snapshot': False, 'lucene_version': '9.4.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
# Check if you can connect to ES (bool)
client.ping()

True

# Index documents from wikipedia paragraphs

In [12]:
DATASET_FILE = "../Data/fp_wiki.txt"

In [13]:
def generate_actions():
    with open(DATASET_FILE, "r", encoding="UTF-8") as file:
        # Read the first line of the file
        title_line = file.readline()
        # Create a variable to store the ID of the next document
        next_id = 0
        # Keep reading lines until the end of the file is reached
        while title_line:
            # Check if the line starts with "Title: "
            if title_line.startswith("TITLE: "):
                # Get the title by stripping the "Title: " prefix and the newline character at the end
                title = title_line.lstrip("TITLE:").strip()
                # Read the second line of the file, which should be the title again
                title_line = file.readline().strip()
                # Save title for later usage
                title = title_line
                # Create a list to store the lines of the article
                article_lines = []
                # Read the next line, which should be the start of the article
                article_line = file.readline()
                # Keep reading lines until an empty line is reached
                while article_line.strip():
                    # Add the line to the list of article lines
                    article_lines.append(article_line)
                    # Read the next line
                    article_line = file.readline()
                # Join the lines of the article with newline characters to create the article
                article = "\n".join(article_lines) if article_lines else ""
                # Create a dictionary for the document
                document = {"_id": next_id, "title": title, "article": article}
                # Yield new document
                yield document
                # Increment the ID for the next document
                next_id += 1
                # Read the next line, which should be the start of the next document
                title_line = file.readline()

In [15]:
index_name = "offline_competition"

In [16]:
configurations = {
    "settings": {
        "analysis": {
            "analyzer": {
                "lang_pl_morfologik": { 
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "morfologik_stem"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "id": {"type": "long"},
            "article": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            },
            "title": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            }
        }
    }
}

In [17]:
client.options(ignore_status=[400,404]).indices.delete(index=index_name)

client.indices.create(
    index=index_name,
    settings=configurations["settings"],
    mappings=configurations["mappings"],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'offline_competition'})

In [18]:
print("Indexing documents...")
number_of_docs=1209001
progress = tqdm(unit="docs", total=number_of_docs)
successes = 0
for ok, action in streaming_bulk(
    client=client, index=index_name, actions=generate_actions(),
):
    progress.update(1)
    successes += ok
print("Indexed %d/%d documents" % (successes, number_of_docs))

Indexing documents...


100%|███████████████████████████████████████████████████████████████████▉| 1207501/1209001 [04:36<00:00, 4717.47docs/s]

Indexed 1208362/1209001 documents


100%|███████████████████████████████████████████████████████████████████▉| 1208362/1209001 [04:50<00:00, 4717.47docs/s]

# Answer questions

In [21]:
def retriever(question, index_name="offline_competition", k=3):
    resp = client.search(index=index_name, 
                     query={'match': {
                         "article": question
                     }})
    best_documents = list(sorted(resp['hits']['hits'], key=lambda k: k['_score'], reverse=True))
    context = ""
    for document in best_documents[:k]:
        if document['_source']['title'].lower() not in document['_source']['article'].lower():
            context += document['_source']['title'] + " . "
        context += document['_source']['article'] + "\n"
    return context

In [24]:
with open('HerbertAnswers/offline_competition_k_3.txt', 'a', encoding='UTF-8') as f:
    for question, _ in tqdm(question_answers):
        context = retriever(question, index_name="offline_competition", k=3)
        predicted_answer = get_answer(context, question)
        f.write(predicted_answer.replace("\n", " ").strip() + "\n")

100%|████████████████████████████████████████████████████████████████████████████| 3500/3500 [3:04:14<00:00,  3.16s/it]


In [33]:
with open('HerbertAnswers/offline_competition_k_3.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [34]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [35]:
!python advent_answer_check.py

TOTAL SCORE: 0.21114285714285713


In [36]:
import editdistance
import sys

rn = ['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii',
                 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii']

rome_numbers = dict(zip(rn, range(2, 23)))

def numbers_from(s):
    res = set()
    for w in s.split():
        w = w.lower()
        if w.isdecimal():
            res.add(w)
        if w in rome_numbers:
            res.add(rome_numbers[w])
    return res                 
                 
                 
def is_number(s):
    return lower(s) in rome_numbers or s.isdecimal()
                     
def scaled_editdist(ans, cor):
    ans = ans.lower()
    cor = cor.lower()
    
    return editdistance.eval(ans, cor) / len(cor)
    
def single_match(a, c):
    numbers_c = numbers_from(c)
    numbers_a = numbers_from(a)
        
    return numbers_a == numbers_c and scaled_editdist(a, c) < 0.5
        
def match(ans, cor):
    return any(single_match(ans, c) for c in cor)
        
found_answers = []
correct_answers = []

for x in open('correct_answers.txt', encoding="UTF-8"):
    x = x.strip()
    correct_answers.append(x.lower().split('\t'))
    
for x in open('found_answers.txt', encoding="UTF-8"):    
    x = x.strip()
    found_answers.append(x.lower())
    
N = len(correct_answers)
score = 0.0

for ans, cor in zip(found_answers, correct_answers):    
    if match(ans, cor):
        score += 1
        
print ('TOTAL SCORE:', score / len(correct_answers))        


TOTAL SCORE: 0.21114285714285713


# Handle special type of questions

In [38]:
questions = [question for question, _ in question_answers]

## Yes/No questions

In [104]:
def is_yes_no_question(question):
    return question[:4] == "Czy " and " czy " not in question[4:]

In [105]:
def get_sentence_prob(text):
    input_ids = torch.tensor(yes_no_tokenizer.encode(text)).unsqueeze(0)
    with torch.no_grad():
        outputs = yes_no_model(input_ids, labels=input_ids)
    loss, logits = outputs[:2]
    sentence_prob = loss.item()
    return sentence_prob

In [106]:
def answer_yes_no_question(question):
    yes_sentence_prob = get_sentence_prob(question + " Tak")
    no_sentence_prob = get_sentence_prob(question + " Nie")
    return "tak" if yes_sentence_prob > no_sentence_prob else "nie"

In [109]:
with open('HerbertAnswers/offline_competition_k_3.txt', 'r', encoding='UTF-8') as f:
    with open('HerbertAnswers/offline_competition_k_3_yes_no.txt', 'w', encoding='UTF-8') as f_answers:
        raw_answers = []
        for line in f:
            raw_answers.append(line.strip())
        for question, raw_answer in zip(questions, raw_answers):
            if is_yes_no_question(question):
                f_answers.write(answer_yes_no_question(question) + "\n")
            else:
                f_answers.write(raw_answer + "\n")

### Test A

In [14]:
with open('HerbertAnswers/offline_competition_k_3_yes_no.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [15]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestA_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [11]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [6]:
!python advent_answer_check.py

TOTAL SCORE: 0.2642857142857143


### Test A'

In [16]:
with open('HerbertAnswers/offline_competition_k_3_yes_no.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        k = 0
        for line in f:
            if k % 5 == 0:
                f_answers.write(line)
            k += 1

In [17]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestAprime_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [18]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    k = 0
    for _, answers in question_answers:
        if k % 5 == 0:
            f.write("\t".join(answers) + "\n")
        k += 1

In [9]:
!python advent_answer_check.py

TOTAL SCORE: 0.2814285714285714


## Optional questions