In [149]:
from elasticsearch import Elasticsearch
from tqdm import tqdm
from collections import defaultdict
from elasticsearch.helpers import streaming_bulk
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModel
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance

# Dataset

In [2]:
with open('../Data/test_B_questions.txt', 'r', encoding='UTF-8') as f:
    questions = []
    for line in f:
        questions.append(line.strip())
        
with open('../Data/test_B_answers.txt', 'r', encoding='UTF-8') as f:
    answers = []
    for line in f:
        splitted = line.strip().split("\t")
        answers.append(splitted)

question_answers = list(zip(questions, answers))

In [3]:
[question for question, answers in question_answers if "?" != question[-1]]

['Proszę podać datę zamachu na WTC.',
 'Czy interna to szpitalny oddział chorób zakaźnych',
 'Proszę podać nazwiska skłóconych rodów, z których pochodzili Romeo i Julia.',
 'Dokończ powiedzenie: „Ten się śmieje, kto..."',
 'Proszę podać rok wprowadzenia w Polsce stanu wojennego.',
 'Z którego kontynentu pochodzi język suahili',
 'Dokończ przysłowie: „i wilk syty, i ...”',
 'Podaj polski tytuł filmu z serii o Jamesie Bondzie, w którym wystąpiła Madonna.',
 'Podaj dwa imiona Roosevelta, 32. prezydenta Stanów Zjednoczonych.',
 'Który przypadek odpowiada na pytania: „o kim? o czym?”',
 'Proszę podać oba imiona Bacha.',
 'Proszę podać nazwiska dwóch amerykańskich aktorów, którzy zagrali główne męskie role w filmie „Żądło”.',
 'Wielka i Mała Rawka to szczyty Bieszczadów czy Gór Świętokrzyskich',
 'W którym mieście urodził się Mahomet',
 'Proszę dokończyć przysłowie: „Przyganiał kocioł garnkowi...”',
 'Warwick to największy średniowieczny zamek na Wyspach Brytyjskich leżący w Anglii, Szkocji 

# Pretrained models

In [160]:
yes_no_model = AutoModelWithLMHead.from_pretrained('flax-community/papuGaPT2')
yes_no_tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')
# yes_no_tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-large-cased")
# yes_no_model = AutoModelWithLMHead.from_pretrained("allegro/herbert-large-cased")
_ = yes_no_model.eval()

In [None]:
embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

In [5]:
qa_pipeline = pipeline(
    "question-answering",
    model='azwierzc/herbert-large-poquad', handle_impossible_answer=True,
    tokenizer='azwierzc/herbert-large-poquad'
)

def get_answer(context, question):
    return qa_pipeline({
        'context': context,
        'question': question})['answer'].replace("\n", " ").strip()

# Connect to elasticsearch

In [173]:
# Password for the 'elastic' user generated by Elasticsearch
ELASTIC_PASSWORD = FILL_IN_PASSWORD
# es_path = "~/Documents/UWr/Chatbots/elasticsearch-8.4.3/"
es_path = "C:/Users/jakub/elasticsearch-8.5.3-windows-x86_64/elasticsearch-8.5.3/"

# Create the client instance
client = Elasticsearch(
    "https://localhost:9200",
    ca_certs=es_path+"config/certs/http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD)
)

In [174]:
# Check if you can connect to ES (bool)
client.ping()

True

# Index documents from wikipedia paragraphs

In [12]:
DATASET_FILE = "../Data/fp_wiki.txt"

In [13]:
def generate_actions():
    with open(DATASET_FILE, "r", encoding="UTF-8") as file:
        # Read the first line of the file
        title_line = file.readline()
        # Create a variable to store the ID of the next document
        next_id = 0
        # Keep reading lines until the end of the file is reached
        while title_line:
            # Check if the line starts with "Title: "
            if title_line.startswith("TITLE: "):
                # Get the title by stripping the "Title: " prefix and the newline character at the end
                title = title_line.lstrip("TITLE:").strip()
                # Read the second line of the file, which should be the title again
                title_line = file.readline().strip()
                # Save title for later usage
                title = title_line
                # Create a list to store the lines of the article
                article_lines = []
                # Read the next line, which should be the start of the article
                article_line = file.readline()
                # Keep reading lines until an empty line is reached
                while article_line.strip():
                    # Add the line to the list of article lines
                    article_lines.append(article_line)
                    # Read the next line
                    article_line = file.readline()
                # Join the lines of the article with newline characters to create the article
                article = "\n".join(article_lines) if article_lines else ""
                # Create a dictionary for the document
                document = {"_id": next_id, "title": title, "article": article}
                # Yield new document
                yield document
                # Increment the ID for the next document
                next_id += 1
                # Read the next line, which should be the start of the next document
                title_line = file.readline()

In [15]:
index_name = "offline_competition"

In [16]:
configurations = {
    "settings": {
        "analysis": {
            "analyzer": {
                "lang_pl_morfologik": { 
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "morfologik_stem"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "id": {"type": "long"},
            "article": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            },
            "title": {
                "type": "text",
                "analyzer": "lang_pl_morfologik"
            }
        }
    }
}

In [17]:
client.options(ignore_status=[400,404]).indices.delete(index=index_name)

client.indices.create(
    index=index_name,
    settings=configurations["settings"],
    mappings=configurations["mappings"],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'offline_competition'})

In [18]:
print("Indexing documents...")
number_of_docs=1209001
progress = tqdm(unit="docs", total=number_of_docs)
successes = 0
for ok, action in streaming_bulk(
    client=client, index=index_name, actions=generate_actions(),
):
    progress.update(1)
    successes += ok
print("Indexed %d/%d documents" % (successes, number_of_docs))

Indexing documents...


100%|███████████████████████████████████████████████████████████████████▉| 1207501/1209001 [04:36<00:00, 4717.47docs/s]

Indexed 1208362/1209001 documents


100%|███████████████████████████████████████████████████████████████████▉| 1208362/1209001 [04:50<00:00, 4717.47docs/s]

# Answer questions

In [9]:
def retriever(question, index_name="offline_competition", k=3):
    resp = client.search(index=index_name, 
                     query={'match': {
                         "article": question
                     }})
    best_documents = list(sorted(resp['hits']['hits'], key=lambda k: k['_score'], reverse=True))
    context = ""
    for document in best_documents[:k]:
        if document['_source']['title'].lower() not in document['_source']['article'].lower():
            context += document['_source']['title'] + " . "
        context += document['_source']['article'] + "\n"
    return context

In [10]:
with open('HerbertAnswers/test_B_offline_competition_k_3.txt', 'a', encoding='UTF-8') as f:
    for question, _ in tqdm(question_answers):
        context = retriever(question, index_name="offline_competition", k=3)
        predicted_answer = get_answer(context, question)
        f.write(predicted_answer.replace("\n", " ").strip() + "\n")

100%|██████████████████████████████████████████████████████████████████████████████| 2500/2500 [39:38<00:00,  1.05it/s]


In [11]:
with open('HerbertAnswers/test_B_offline_competition_k_3.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [12]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [13]:
!python advent_answer_check.py

TOTAL SCORE: 0.2384


# Handle special type of questions

In [93]:
questions = [question for question, _ in question_answers]

## Yes/No questions

In [17]:
def is_yes_no_question(question):
    return question[:4] == "Czy " and " czy " not in question[4:]

In [18]:
def get_sentence_prob(text):
    input_ids = torch.tensor(yes_no_tokenizer.encode(text)).unsqueeze(0)
    with torch.no_grad():
        outputs = yes_no_model(input_ids, labels=input_ids)
    loss, logits = outputs[:2]
    sentence_prob = loss.item()
    return sentence_prob

In [89]:
def answer_yes_no_question(question):
    yes_sentence_prob = get_sentence_prob(question + " Tak")
    no_sentence_prob = get_sentence_prob(question + " Nie")
    return "tak" if yes_sentence_prob > no_sentence_prob else "nie"

In [90]:
with open('HerbertAnswers/test_B_offline_competition_k_3.txt', 'r', encoding='UTF-8') as f:
    with open('HerbertAnswers/test_B_offline_competition_k_3_yes_no.txt', 'w', encoding='UTF-8') as f_answers:
        raw_answers = []
        for line in f:
            raw_answers.append(line.strip())
        for question, raw_answer in zip(questions, raw_answers):
            if is_yes_no_question(question):
                f_answers.write(answer_yes_no_question(question) + "\n")
            else:
                f_answers.write(raw_answer + "\n")

### Test B

In [97]:
with open('HerbertAnswers/test_B_offline_competition_k_3_yes_no.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [98]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestB_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [99]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [100]:
!python advent_answer_check.py

TOTAL SCORE: 0.2764


## Optional questions

In [111]:
def is_optional_question(question):
    to_pointer = question.find(" to ")
    if to_pointer == -1:
        return False
    czy_part = question[to_pointer:]
    czy_pointer = czy_part.find(" czy ")
    return czy_pointer != -1

In [140]:
def get_term_and_options(question):
    if (dot_pointer := question.find(".")) != -1:
        question = question[dot_pointer:]
    if question[-1] == "?":
        question = question[:-1]
    if question.lower()[:3] == 'czy':
        question = question[3:].strip()
    term, options = question.split(' to ')
    options = options.split(' czy ')
    first_options = options[0]
    first_options = first_options.split(",")
    first_options.extend(options[1:])
    return term.strip(), [opt.strip() for opt in first_options if opt.strip() != ""]

In [152]:
def answer_optional_question(question):
    term, options = get_term_and_options(question)
    options_embeddings = embedding_model.encode(options)
    term_embedding = embedding_model.encode(term)
    similarity = np.array([distance.cosine(term_embedding, emb) for emb in options_embeddings])
    return options[np.argmin(similarity)]

In [166]:
with open('HerbertAnswers/test_B_offline_competition_k_3_yes_no.txt', 'r', encoding='UTF-8') as f:
    with open('HerbertAnswers/test_B_offline_competition_k_3_yes_no_optional.txt', 'w', encoding='UTF-8') as f_answers:
        raw_answers = []
        for line in f:
            raw_answers.append(line.strip())
        for question, raw_answer in zip(questions, raw_answers):
            if not is_yes_no_question(question) and is_optional_question(question):
                f_answers.write(answer_optional_question(question) + "\n")
            else:
                f_answers.write(raw_answer + "\n")

### Test B

In [167]:
with open('HerbertAnswers/test_B_offline_competition_k_3_yes_no_optional.txt', 'r', encoding='UTF-8') as f:
    with open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [168]:
with open('found_answers.txt', 'r', encoding='UTF-8') as f:
    with open('FinalAnswers/TestB_Offline.txt', 'w', encoding='UTF-8') as f_answers:
        for line in f:
            f_answers.write(line)

In [169]:
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers:
        f.write("\t".join(answers) + "\n")

In [170]:
!python advent_answer_check.py

TOTAL SCORE: 0.2884
