diff --git a/data/datasets/semantics_ws_qna_oa/data_process.py b/data/datasets/semantics_ws_qna_oa/data_process.py index 382912bee4..ec70bb2d7e 100644 --- a/data/datasets/semantics_ws_qna_oa/data_process.py +++ b/data/datasets/semantics_ws_qna_oa/data_process.py @@ -16,30 +16,15 @@ def create_qna(row): random_num = random.randint(0, 2) # extract rows' vals - con_type = row["Type"] lang = row["Language"] + con_type = row["Type"] word1 = row["Word1"] word2 = row["Word2"] score_percent = row["Score"] # 0 - yes; 1 - 50%, 2 - no - if con_type == "sim": - instruction = random_stuff.random_dict_sim_q[lang][random_num].format(word1=word1, word2=word2) - else: - instruction = random_stuff.random_dict_rel_q[lang][random_num].format(word1=word1, word2=word2) - if score_percent < 3.0 and con_type == "sim": - response = random_stuff.random_dict_sim_a[lang][2][random_num].format(word1=word1, word2=word2) - elif score_percent < 3.0 and con_type == "rel": - response = random_stuff.random_dict_rel_a[lang][2][random_num].format(word1=word1, word2=word2) - elif score_percent < 9 and con_type == "sim": - response = random_stuff.random_dict_sim_a[lang][1][random_num].format(word1=word1, word2=word2) - elif score_percent < 9 and con_type == "rel": - response = random_stuff.random_dict_rel_a[lang][1][random_num].format(word1=word1, word2=word2) - elif score_percent >= 9 and con_type == "sim": - response = random_stuff.random_dict_sim_a[lang][0][random_num].format(word1=word1, word2=word2) - elif score_percent >= 9 and con_type == "rel": - response = random_stuff.random_dict_rel_a[lang][0][random_num].format(word1=word1, word2=word2) - + instruction = random_stuff.qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, True) + response = random_stuff.qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, False) source = "WordSim353" metadata = { "language": lang, diff --git a/data/datasets/semantics_ws_qna_oa/random_stuff.py 
b/data/datasets/semantics_ws_qna_oa/random_stuff.py index b76921a482..4096665a76 100644 --- a/data/datasets/semantics_ws_qna_oa/random_stuff.py +++ b/data/datasets/semantics_ws_qna_oa/random_stuff.py @@ -24,6 +24,28 @@ # sim_answers: +# same words; +random_list_sim_en_a_same = [ + "Yes, because it's the same word.", + 'Of course, we\'re talking about the same word: "{word1}".', + "You repeated '{word1}' twice.", +] +random_list_sim_ru_a_same = [ + "Да, ведь это одно и то же слово.", + 'Конечно, ведь речь идёт об одном слове: "{word1}".', + "Вы повторили '{word1}' дважды.", +] +random_list_sim_de_a_same = [ + "Ja, denn es ist dasselbe Wort.", + 'Natürlich, wir sprechen über dasselbe Wort: "{word1}".', + "Du hast '{word1}' zweimal wiederholt.", +] +random_list_sim_it_a_same = [ + "Sì, perché è la stessa parola.", + 'Certo, stiamo parlando della stessa parola: "{word1}".', + "Hai ripetuto '{word1}' due volte.", +] + # yes; random_list_sim_en_a_y = [ "Yes, {word1} and {word2} are synonymous.", @@ -40,13 +62,34 @@ '"{word1}" und "{word2}" sind Synonyme.', "Ja. Der Typ der Verbindung zwischen den Wörtern '{word1}' und '{word2}' ist synonym.", ] - random_list_sim_it_a_y = [ "Sì, {word1} e {word2} sono sinonimi.", '"{word1}" e "{word2}" sono sinonimi.', "Sì. 
Il tipo di connessione tra le parole '{word1}' e '{word2}' è sinonimico.", ] +# 75%; +random_list_sim_en_a_75 = [ + "There is a big conceptual meaning similarity between the words {word1} and {word2}, but they are not exactly synonymous.", + 'The words "{word1}" and "{word2}" share a significant similarity, but they cannot be considered true synonyms.', + "While {word1} and {word2} are not interchangeable, they do have a substantial overlap in meaning.", +] +random_list_sim_ru_a_75 = [ + "Между словами {word1} и {word2} есть большое сходство, но они не являются полноценными синонимами.", + 'Слова "{word1}" и "{word2}" имеют значительное сходство, но не могут считаться полноценными синонимами.', + "Хотя {word1} и {word2} не являются взаимозаменяемыми, у них есть значительное пересечение в значении.", +] +random_list_sim_de_a_75 = [ + "Es gibt eine große Ähnlichkeit zwischen den Wörtern {word1} und {word2}, aber sie sind nicht exakt synonym.", + 'DieWörter "{word1}" und "{word2}" weisen eine erhebliche Ähnlichkeit auf, können aber nicht als vollwertige Synonyme betrachtet werden.', + "Obwohl {word1} und {word2} nicht austauschbar sind, haben sie eine wesentliche Überlappung in der Bedeutung.", +] +random_list_sim_it_a_75 = [ + "C'è una grande somiglianza tra le parole {word1} e {word2}, ma non sono esattamente sinonimi.", + 'Le parole "{word1}" e "{word2}" condividono una notevole somiglianza, ma non possono essere considerate sinonimi veri e propri.', + "Anche se {word1} e {word2} non sono interscambiabili, hanno una sostanziale sovrapposizione di significato.", +] + # 50%; random_list_sim_en_a_50 = [ "There is some connection between the words {word1} and {word2}, but they are not full-fledged synonyms.", @@ -69,6 +112,28 @@ "Sì, c'è una connessione tra le parole '{word1}' e '{word2}', ma non possono essere chiamate sinonimi.", ] +# 25%; +random_list_sim_en_a_25 = [ + "No, {word1} and {word2} are not really synonymous, and they have very little conceptual meaning 
in common.", + 'The words "{word1}" and "{word2}" do not have the same meaning, and they share only a small amount of conceptual overlap.', + "While there is some similarity between {word1} and {word2}, they cannot be considered synonyms as their conceptual meaning has very little overlap.", +] +random_list_sim_ru_a_25 = [ + "Нет, {word1} и {word2} не являются совсем синонимами, и у них очень мало общего в плане концептуального значения.", + 'Слова "{word1}" и "{word2}" не имеют одинакового значения, и у них есть только небольшое концептуальное пересечение.', + "Хотя между {word1} и {word2} есть некоторое сходство, они не могут считаться синонимами, поскольку их концептуальное значение имеет очень мало общего.", +] +random_list_sim_de_a_25 = [ + "Nein, {word1} und {word2} sind nicht wirklich Synonyme, und sie haben sehr wenig konzeptionelle Bedeutung gemeinsam.", + 'Die Wörter "{word1}" und "{word2}" haben nicht dieselbe Bedeutung, und sie teilen nur eine geringe konzeptuelle Überschneidung.', + "Obwohl {word1} und {word2} einige Ähnlichkeiten aufweisen, können sie nicht als Synonyme betrachtetwerden, da ihr konzeptuelles Bedeutungsfeld nur sehr wenig gemeinsam hat.", +] +random_list_sim_it_a_25 = [ + "No, {word1} e {word2} non sono veri e propri sinonimi, e hanno molto poco in comune a livello concettuale.", + 'Le parole "{word1}" e "{word2}" non hanno lo stesso significato, e condividono solo una piccola quantità di sovrapposizione concettuale.', + "Anche se c'è una certa somiglianza tra {word1} e {word2}, non possono essere considerati sinonimi poiché il loro campo semantico ha molto poco in comune.", +] + # no; random_list_sim_en_a_n = [ "No, the words {word1} and {word2} are not synonyms.", @@ -116,6 +181,31 @@ # rel_answers: +# same words; +random_list_rel_en_a_same = [ + "Yes, because it's the same word.", + 'Of course, we\'re talking about the same word: "{word1}".', + "You repeated '{word1}' twice.", +] + +random_list_rel_ru_a_same = [ + "Да, ведь это одно 
и то же слово.", + 'Конечно, ведь речь идёт об одном слове: "{word1}".', + "Вы повторили '{word1}' дважды.", +] + +random_list_rel_de_a_same = [ + "Ja, denn es ist dasselbe Wort.", + 'Natürlich, wir sprechen über dasselbe Wort: "{word1}".', + "Du hast '{word1}' zweimal wiederholt.", +] + +random_list_rel_it_a_same = [ + "Sì, perché è la stessa parola.", + 'Certo, stiamo parlando della stessa parola: "{word1}".', + "Hai ripetuto '{word1}' due volte.", +] + # yes; random_list_rel_en_a_y = [ "Yes, there is an association between the words {word1} and {word2}.", @@ -138,6 +228,28 @@ "C'è un legame associativo diretto tra le parole '{word1}' e '{word2}'.", ] +# 75%; +random_list_rel_en_a_75 = [ + "There is a significant association between {word1} and {word2}, but the level of relatedness is not really high, about 75%.", + 'While "{word1}" and "{word2}" are related to some extent, their conceptual overlap is not very strong.', + "There is a moderate association between '{word1}' and '{word2}', indicating that they are related a lot, but not completely.", +] +random_list_rel_ru_a_75 = [ + "Между словами {word1} и {word2} существует значительная связь, но уровень связанности не превышает 75%.", + 'Хотя слова "{word1}" и "{word2}" имеют некоторую связь, их концептуальное сходство не так сильно высоко, чтобы их можно было назвать полностью ассоциативными.', + "Существует умеренная связь между словами '{word1}' и '{word2}', что указывает на то, что они сильно связаны между собой, но не полностью.", +] +random_list_rel_de_a_75 = [ + "Es besteht eine signifikante Assoziation zwischen {word1} und {word2}, aber das Maß der Verwandtschaft ist nicht sehr hoch.", + 'Obwohl "{word1}" und "{word2}" in gewisser Weise miteinander verbunden sind, ist ihre konzeptuelle Überlappung nicht sehr stark.', + "Es besteht eine mäßige Assoziation zwischen '{word1}' und '{word2}', was darauf hinweist, dass sie stark miteinander verbunden sind, aber nicht vollständig.", +] +random_list_rel_it_a_75 
= [ + "C'è una significativa associazione tra {word1} e {word2}, ma il livello di relazione non è molto alto.", + 'Anche se "{word1}" e "{word2}" sono in qualche modo correlati, il loro sovrapporsi concettuale non è molto forte.', + "C'è una moderata associazione tra '{word1}' e '{word2}', indicando che sono molto correlati, ma non completamente.", +] + # 50%; random_list_rel_en_a_50 = [ "There is a slight association between the words {word1} and {word2}.", @@ -160,6 +272,28 @@ "C'è una certa associazione tra le parole '{word1}' e '{word2}'.", ] +# 25%; +random_list_rel_en_a_25 = [ + "There is very little conceptual related meaning in common between {word1} and {word2}, with a low level of relatedness.", + "The association between {word1} and {word2} is weak, suggesting that they are not very related.", + "While there is some association between {word1} and {word2}, the level of relatedness is quite low.", +] +random_list_rel_ru_a_25 = [ + "Между словами {word1} и {word2} очень мало общего в плане концептуальной связи, уровень связанности низкий.", + "Связь между словами {word1} и {word2} слабая, что указывает на то, что они не очень связаны между собой.", + "Хотя между словами {word1} и {word2} есть некоторая связь, уровень связанности довольно низкий.", +] +random_list_rel_de_a_25 = [ + "Es gibt sehr wenig konzeptuell verwandte Bedeutung zwischen den Wörtern {word1} und {word2}, mit einem niedrigen Verwandtheitsgrad.", + "Die Assoziation zwischen {word1} und {word2} ist schwach, was darauf hindeutet, dass sie nicht sehr verwandt sind.", + "Obwohl es eine gewisse Assoziation zwischen {word1} und {word2} gibt, ist das Maß der Verwandtschaft recht gering.", +] +random_list_rel_it_a_25 = [ + "C'è molto poco significato concettualmente correlato tra {word1} e {word2}, con un basso livello di correlazione.", + "L'associazione tra {word1} e {word2} è debole, suggerendo che non sono molto correlati.", + "Anche se c'è una certa associazione tra {word1} e {word2}, il 
livello di correlazione è piuttosto basso.", +] + # no; random_list_rel_en_a_n = [ "No, there is no associative relationship between the words {word1} and {word2}", @@ -201,49 +335,108 @@ } # dicts for a -# sim - random_dict_sim_a["ru"][0] # returns the list of "yes" answers for Russian +# sim - random_dict_sim_a["ru"][0 random_dict_sim_a = { "en": { - 0: random_list_sim_en_a_y, - 1: random_list_sim_en_a_50, - 2: random_list_sim_en_a_n, + 0: random_list_sim_en_a_same, + 1: random_list_sim_en_a_y, + 2: random_list_sim_en_a_75, + 3: random_list_sim_en_a_50, + 4: random_list_sim_en_a_25, + 5: random_list_sim_en_a_n, }, "ru": { - 0: random_list_sim_ru_a_y, - 1: random_list_sim_ru_a_50, - 2: random_list_sim_ru_a_n, + 0: random_list_sim_ru_a_same, + 1: random_list_sim_ru_a_y, + 2: random_list_sim_ru_a_75, + 3: random_list_sim_ru_a_50, + 4: random_list_sim_ru_a_25, + 5: random_list_sim_ru_a_n, }, "de": { - 0: random_list_sim_de_a_y, - 1: random_list_sim_de_a_50, - 2: random_list_sim_de_a_n, + 0: random_list_sim_de_a_same, + 1: random_list_sim_de_a_y, + 2: random_list_sim_de_a_75, + 3: random_list_sim_de_a_50, + 4: random_list_sim_de_a_25, + 5: random_list_sim_de_a_n, }, "it": { - 0: random_list_sim_it_a_y, - 1: random_list_sim_it_a_50, - 2: random_list_sim_it_a_n, + 0: random_list_sim_it_a_same, + 1: random_list_sim_it_a_y, + 2: random_list_sim_it_a_75, + 3: random_list_sim_it_a_50, + 4: random_list_sim_it_a_25, + 5: random_list_sim_it_a_n, }, } # rel random_dict_rel_a = { "en": { - 0: random_list_rel_en_a_y, - 1: random_list_rel_en_a_50, - 2: random_list_rel_en_a_n, + 0: random_list_rel_en_a_same, + 1: random_list_rel_en_a_y, + 2: random_list_rel_en_a_75, + 3: random_list_rel_en_a_50, + 4: random_list_rel_en_a_25, + 5: random_list_rel_en_a_n, }, "ru": { - 0: random_list_rel_ru_a_y, - 1: random_list_rel_ru_a_50, - 2: random_list_rel_ru_a_n, + 0: random_list_rel_ru_a_same, + 1: random_list_rel_ru_a_y, + 2: random_list_rel_ru_a_75, + 3: random_list_rel_ru_a_50, + 4: 
# Answer buckets, as keyed in random_dict_sim_a / random_dict_rel_a:
# 0 - same, 1 - yes, 2 - 75, 3 - 50, 4 - 25, 5 - no
# Each entry is (exclusive upper bound on the score, bucket index); entries
# are ordered by ascending bound so the first match wins.
_SCORE_UPPER_BOUNDS = (
    (1.85, 5),
    (3.85, 4),
    (6.3, 3),
    (8.85, 2),
    (10, 1),
)


def _answer_bucket(score_percent):
    """Return the answer-bucket index (0-5) that corresponds to a score.

    Raises:
        ValueError: if the score is greater than 10 or is not comparable to
            the thresholds (e.g. NaN).
    """
    for upper_bound, bucket in _SCORE_UPPER_BOUNDS:
        if score_percent < upper_bound:
            return bucket
    # Exactly 10 selects the "same word" templates.
    if score_percent == 10:
        return 0
    raise ValueError(f"Unsupported score: {score_percent!r} (expected a value <= 10)")


def qna_random_magic(lang, word1, word2, con_type, score_percent, random_num, isQuestion):
    """Build either the question or the answer text for one word pair.

    Args:
        lang: language key used to index the template dictionaries.
        word1: first word, substituted for ``{word1}`` in the template.
        word2: second word, substituted for ``{word2}`` in the template.
        con_type: ``"sim"`` selects the similarity templates. For questions
            any other value selects the relatedness templates; for answers
            it must be exactly ``"sim"`` or ``"rel"``.
        score_percent: numeric score that selects the answer bucket
            (presumably the 0-10 WordSim353 score -- confirm against the
            caller). Ignored when ``isQuestion`` is true.
        random_num: index of the template variant inside the selected list.
        isQuestion: when true return the question text, otherwise the answer.

    Returns:
        The formatted question or answer string.

    Raises:
        ValueError: when an answer is requested and ``con_type`` is neither
            ``"sim"`` nor ``"rel"``, or the score does not map to a bucket.
        KeyError, IndexError: propagated from the template lookups when
            ``lang`` or ``random_num`` is not present.
    """
    if isQuestion:
        questions = random_dict_sim_q if con_type == "sim" else random_dict_rel_q
        template = questions[lang][random_num]
    else:
        if con_type == "sim":
            answers = random_dict_sim_a
        elif con_type == "rel":
            answers = random_dict_rel_a
        else:
            # The previous `assert "Error"` could never fire (a non-empty
            # string is truthy); fail loudly instead of hitting an unbound
            # local further down.
            raise ValueError(f"Unsupported connection type: {con_type!r}")
        template = answers[lang][_answer_bucket(score_percent)][random_num]
    return template.format(word1=word1, word2=word2)