# Ideas

Translations of answers:
- only answers
- question + answer, then remove question

# TODO

- Add DeepL API key
- Translate the questions to English through the DeepL API and save them to disk
- Get answers by retrieving the closest supporting questions and asking the BLOOM API, then postprocess and save them to disk
- Translate the answers to Polish
- Evaluate

In [1]:
import json
import os
import random
import sys
import time
from collections import Counter
from json import JSONDecodeError
from random import sample

import deepl
import editdistance
import ijson
import matplotlib.pyplot as plt
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
from transformers import pipeline, set_seed

In [2]:
# Read the DeepL key from the environment so the secret is never stored in
# (or accidentally committed with) the notebook.
auth_key = os.environ.get("DEEPL_AUTH_KEY")
if not auth_key:
    raise RuntimeError("Set the DEEPL_AUTH_KEY environment variable to your DeepL API key.")
translator = deepl.Translator(auth_key)

# Report the remaining quota before spending characters on translations.
usage = translator.get_usage()
if usage.any_limit_reached:
    print('Translation limit reached.')
if usage.character.valid:
    print(f"Character usage: {usage.character.count} of {usage.character.limit}")
if usage.document.valid:
    print(f"Document usage: {usage.document.count} of {usage.document.limit}")

Character usage: 362655 of 500000


# Create translations with DeepL

In [27]:
# Every TSV row holds a question in its first column followed by one or more
# tab-separated answers; keep them as (question, [answers...]) tuples.
with open('../Data/task2_questions_with_answers.tsv', 'r', encoding='UTF-8') as f:
    question_answers = [
        (fields[0], fields[1:])
        for fields in (row.strip().split("\t") for row in f)
    ]

In [37]:
# Translate each question with DeepL and store one translation per line.
# The file is opened in append mode, so running this cell again adds a second
# copy of every translation to the end of the file.
with open('../Data/task2_questions_translated.txt', 'a', encoding='UTF-8') as f:
    for polish_question, _ in tqdm(question_answers):
        translation = translator.translate_text(polish_question, target_lang="EN-US", source_lang="PL")
        english_question = translation.text.strip()
        f.write(english_question + "\n")

# Load data

In [3]:
def load_triviaqa_pairs(path):
    """Stream a TriviaQA JSON file and return its (question, answer) pairs.

    Args:
        path: Location of a JSON file whose top-level "Data" array holds
            objects with a "Question" string and an "Answer" object that
            carries the answer text under "Value".

    Returns:
        A list of (question, answer value) tuples, in file order.
    """
    # ijson parses bytes; handing it a text-mode file triggers a deprecation
    # warning and slower parsing, so the file is opened in binary mode.
    with open(path, 'rb') as f:
        return [(o["Question"], o["Answer"]["Value"]) for o in ijson.items(f, "Data.item")]


dataset1 = load_triviaqa_pairs('../Data/triviaqa-unfiltered/unfiltered-web-dev.json')
dataset2 = load_triviaqa_pairs('../Data/triviaqa-unfiltered/unfiltered-web-train.json')

In [10]:
# Reload the evaluation set: the first tab-separated field of each row is the
# question, every remaining field is an answer belonging to it.
with open('../Data/task2_questions_with_answers.tsv', 'r', encoding='UTF-8') as f:
    question_answers = [
        (question, answers)
        for question, *answers in (row.strip().split("\t") for row in f)
    ]

In [7]:
# Read the translated questions back, one per line, with surrounding
# whitespace removed.
with open('../Data/task2_questions_translated.txt', 'r', encoding='UTF-8') as f:
    question_eng = [translated.strip() for translated in f]

# Embed triviaqa questions

In [8]:
# Sentence encoder used for the TriviaQA questions here and for the translated
# task questions later on; the commented-out lines are alternative checkpoints.
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('all-mpnet-base-v2')
# model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

questions = [q for q, a in dataset2]
N = len(questions)
batch_size = 64

question_embeddings = []

# Step through the questions in strides of `batch_size`. Iterating over
# range(0, N, batch_size) never produces an empty trailing batch, whereas
# range(N // batch_size + 1) does whenever N is an exact multiple of the
# batch size.
for start in tqdm(range(0, N, batch_size)):
    batch = questions[start : start + batch_size]
    question_embeddings.extend(model.encode(batch))

# Stack the per-question vectors into a single array for the neighbour index.
question_embeddings = np.array(question_embeddings)
# Timing of an earlier run: 1370/1370 [10:40<00:00,  2.14it/s]

100%|████████████████████████████████████████████████████████████████████████████| 1370/1370 [1:02:30<00:00,  2.74s/it]


In [9]:
# Display the dimensions of the embedding array as a quick sanity check.
question_embeddings.shape

(87622, 768)

In [10]:
# Fit a nearest-neighbour index on the question embeddings; queries return
# 25 neighbours unless the call passes its own n_neighbors.
neigh = NearestNeighbors(n_neighbors=25)
neigh.fit(question_embeddings)

# Ask BLOOM with 25 supporting question-answer pairs

In [11]:
# Hosted inference endpoint for the BLOOM model.
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"

# Read the access token from the environment so the secret is never stored in
# (or accidentally committed with) the notebook.
API_TOKEN = os.environ.get("HF_TOKEN")
if not API_TOKEN:
    raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face API token.")

# Bearer-token header sent with every request to API_URL.
headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query(payload):
    """POST `payload` as JSON to API_URL and return the decoded JSON reply.

    The request carries the module-level `headers`. Decoding the reply with
    `.json()` raises if the body is not valid JSON.
    """
    return requests.post(API_URL, headers=headers, json=payload).json()

In [12]:
def _build_prompt(question, supports):
    """Return the few-shot prompt: every supporting pair, then the open question."""
    shots = [
        "Question: " + q.strip() + " Answer: " + a.strip().capitalize() + ". "
        for q, a in supports
    ]
    return "".join(shots) + "Question: " + question.strip() + " Answer:"


def _extract_last_qa(generated_text, question):
    """Reduce the generated text to its final "Question: ... Answer: ..." pair.

    A dangling "Question"/"Question:" that the model started at the very end
    and one trailing full stop are removed first. If the text contains no
    "Question:" marker at all, it is treated as the bare answer and the
    question is put back in front of it so the output keeps the same format.
    """
    cleaned = generated_text.strip()
    if cleaned.endswith("Question"):
        cleaned = cleaned[:-8]
    elif cleaned.endswith("Question:"):
        cleaned = cleaned[:-9]
    cleaned = cleaned.strip()
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if cleaned.endswith("."):
        cleaned = cleaned[:-1]
    start = cleaned.rfind("Question:")
    if start == -1:
        # Slicing from -1 would keep only the last character of the text.
        return "Question: " + question.strip() + " Answer: " + cleaned
    return cleaned[start:]


# For every translated question: retrieve its 25 nearest TriviaQA questions,
# use them with their answers as few-shot examples, and ask BLOOM to complete
# the answer. One "Question: ... Answer: ..." line is written per question.
# The file is opened in append mode, so re-running the cell appends to the
# previous results.
with open('BLOOMAnswers/question_and_answer_raw.txt', 'a', encoding='UTF-8') as f:
    for question in tqdm(question_eng):
        embedding = model.encode([question])
        distances, indices = neigh.kneighbors(embedding, n_neighbors=25)

        chosen = [dataset2[i] for i in indices[0]]
        text = _build_prompt(question, chosen)

        while True:
            try:
                results = query({
                    "inputs": text,
                    "parameters": {"max_new_tokens": 4,
                                  "num_return_sequences": 1,
                                  "do_sample": False}
                })
                predicted_answer = _extract_last_qa(results[0]['generated_text'], question)
                break
            except JSONDecodeError:
                # The reply was not JSON: record the question with an empty answer.
                predicted_answer = "Question: " + question.strip() + " Answer: "
                break
            except KeyError:
                # The reply lacks the expected fields (presumably an error
                # payload from the API); wait and retry the same question
                # until it succeeds.
                time.sleep(15.0)
        f.write(predicted_answer.replace("\n", " ").strip() + "\n")

100%|████████████████████████████████████████████████████████████████████████████| 3500/3500 [8:16:02<00:00,  8.50s/it]


In [12]:
def get_answer_from_raw_question_answer(qa, translate_with_question=False):
    """Extract the answer from a "Question: ... Answer: ..." line and translate it.

    Args:
        qa: One raw line; the answer is whatever follows the last "Answer:".
        translate_with_question: Reserved for translating the question and the
            answer together and then removing the question. Not implemented.

    Returns:
        The answer translated from English to Polish by the module-level
        `translator`, stripped of surrounding whitespace; the literal "Yes"
        when the line has no answer text or no "Answer:" marker.

    Raises:
        NotImplementedError: if `translate_with_question` is true.
    """
    if translate_with_question:
        # Fail loudly instead of returning None, which callers would only
        # notice later when concatenating the result with a string.
        raise NotImplementedError("translate_with_question=True is not implemented yet")

    # rpartition tolerates a missing space after the marker; a fixed +8 offset
    # would drop the first character of the answer in that case.
    _, marker, tail = qa.rpartition("Answer:")
    answer = tail.strip() if marker else ""
    if answer == "":
        # NOTE(review): this fallback is English although the output file is
        # meant to hold Polish answers - confirm that "Yes" is intended.
        return "Yes"
    result = translator.translate_text(answer, target_lang="PL", source_lang="EN")
    return result.text.strip()

In [13]:
# Turn every raw "Question: ... Answer: ..." line into a Polish answer line.
# The output file is opened in append mode, so re-running the cell appends
# another full set of answers.
with open('BLOOMAnswers/question_and_answer_raw.txt', 'r', encoding='UTF-8') as f_raw, \
        open('BLOOMAnswers/predicted_answers_polish.txt', 'a', encoding='UTF-8') as f_translated:
    for raw_line in tqdm(f_raw):
        polish_answer = get_answer_from_raw_question_answer(raw_line.strip())
        f_translated.write(polish_answer + "\n")

3500it [09:16,  6.29it/s]


# Test A

In [21]:
# Copy every predicted answer, line by line, into found_answers.txt.
with open('BLOOMAnswers/predicted_answers_polish.txt', 'r', encoding='UTF-8') as f, \
        open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
    f_answers.writelines(f)

In [22]:
# Keep a copy of the evaluated answers as the Test A submission file.
with open('found_answers.txt', 'r', encoding='UTF-8') as f, \
        open('FinalAnswers/TestA_Online.txt', 'w', encoding='UTF-8') as f_answers:
    f_answers.writelines(f)

In [23]:
# Write the answers of each question on one line, separated by tabs.
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    f.writelines("\t".join(answers) + "\n" for _, answers in question_answers)

In [24]:
# Run the scoring script in a shell; presumably it compares found_answers.txt
# with correct_answers.txt - confirm in advent_answer_check.py.
!python advent_answer_check.py

TOTAL SCORE: 0.41828571428571426


# Test A'

In [28]:
# Keep only every fifth prediction (lines 0, 5, 10, ...) for this subset.
with open('BLOOMAnswers/predicted_answers_polish.txt', 'r', encoding='UTF-8') as f, \
        open('found_answers.txt', 'w', encoding='UTF-8') as f_answers:
    f_answers.writelines(line for k, line in enumerate(f) if k % 5 == 0)

In [29]:
# Keep a copy of the evaluated subset as the Test A' submission file.
with open('found_answers.txt', 'r', encoding='UTF-8') as f, \
        open('FinalAnswers/TestAprime_Online.txt', 'w', encoding='UTF-8') as f_answers:
    f_answers.writelines(f)

In [30]:
# Write the answers of every fifth question (indices 0, 5, 10, ...), one
# question per line with its answers separated by tabs.
with open('correct_answers.txt', 'w', encoding='UTF-8') as f:
    for _, answers in question_answers[::5]:
        f.write("\t".join(answers) + "\n")

In [31]:
# Run the scoring script in a shell on the subset; presumably it compares
# found_answers.txt with correct_answers.txt - confirm in advent_answer_check.py.
!python advent_answer_check.py

TOTAL SCORE: 0.4257142857142857
