In [1]:
import json
import os

In [2]:
import nltk.tokenize
import numpy as np
import pandas as pd

In [7]:
# NOTE(review): all paths below are absolute and machine-specific.

# Slice of the RACE corpus handled by this notebook; also used as the
# sub-directory name under both RACE_DIR and the output root.
RACE_PART = "test/middle"
# Root of the RACE corpus; articles are read from RACE_DIR/RACE_PART/<text_no>.txt.
RACE_DIR = "/Users/YK/mt/RACE"
# Excel workbook that is loaded into `df` further down.
ALTERNATIVES_XLSX_PATH = "/Users/YK/mt/project/statements_5/paraphrased_test-middle_462603ae.xlsx"
# Directory that receives the generated `<text_no>.rce.txt` files.
OUTPUT_DIR = os.path.join("/Users/YK/mt/project/generated_rce/", RACE_PART)

In [36]:
# The ranking file has no header row: column 0 is the reason label,
# column 1 its score.
reason_scores = pd.read_csv("../ranking.csv", header=None)
reason_scores = reason_scores.rename(columns={0: "reason", 1: "score"})
# Scores are read as strings with a decimal comma; swap it for a dot so the
# column can be cast to float.
reason_scores["score"] = reason_scores["score"].apply(
    lambda raw_score: raw_score.replace(",", ".")
).astype(float)

In [37]:
# Load the alternatives workbook. Later cells read its `reason`, `text_no`,
# `true_statement`, `nuclei_hash`, `position`,
# `synonym_paraphrased_alternative_statement` and
# `synonym_paraphrased_true_statement` columns.
df = pd.read_excel(ALTERNATIVES_XLSX_PATH)

In [39]:
# Sanity check: reasons that have a score in the ranking file but never occur
# in the alternatives workbook.
set(reason_scores.reason) - set(df["reason"])

{'Common pattern ( -Explanation->Condition).',
 'Common pattern (Elaboration-JOINT).'}

In [40]:
# Sanity check (reverse direction): reasons used in the workbook that have no
# score in the ranking file.
set(df["reason"]) - set(reason_scores.reason)

{'Common pattern (Condition-Explanation).', 'Common pattern (Nucleus=NN- ).'}

In [41]:
# Attach the score of each row's reason. pd.merge defaults to an inner join,
# so rows whose reason has no entry in `reason_scores` (see the set
# difference computed just before) are dropped here.
df = pd.merge(df, reason_scores, on="reason")

In [42]:
def load_race(text_no):
    """Read and parse the JSON file of one RACE text.

    The file is expected at RACE_DIR/RACE_PART/<text_no>.txt.

    Args:
        text_no: identifier of the text; used as the file stem.

    Returns:
        The parsed JSON content of the file.
    """
    race_path = os.path.join(RACE_DIR, RACE_PART, f"{text_no}.txt")
    with open(race_path, "rt") as race_file:
        return json.load(race_file)

In [43]:
def swap(array, i_1, i_2):
    """Exchange, in place, the elements of `array` at indices `i_1` and `i_2`.

    Returns nothing; equal indices leave the sequence unchanged.
    """
    array[i_1], array[i_2] = array[i_2], array[i_1]
    
    
def coalesce(value, default_value):
    """Return `value`, or `default_value` when `pd.isnull(value)` is true.

    Args:
        value: the candidate value (e.g. None / NaN for "missing").
        default_value: what to return instead of a null `value`.
    """
    return default_value if pd.isnull(value) else value
    
    
def generate_rce(
    true_statement,
    before_alternatives,
    nested_alternatives,
    after_alternatives,
    swap_first_two,
    human_input_pos,
    true_statement_pos
):
    """Assemble one four-option exercise around a true statement.

    The option list starts as: the first two alternatives taken from
    before + nested + after (in that order), then the literal placeholder
    "HUMAN_INPUT", then `true_statement`. It is then rearranged in place.

    Args:
        true_statement: text of the correct option.
        before_alternatives, nested_alternatives, after_alternatives:
            tuples of alternative statements, or a null value (None / NaN)
            when a group has none.
        swap_first_two: when truthy, the two alternatives trade places.
        human_input_pos: index the placeholder is swapped to (from index 2).
        true_statement_pos: index the true statement is swapped to (from
            index 3). If it equals the placeholder's current index, the
            placeholder ends up at index 3.

    Returns:
        None when fewer than two alternatives are available; otherwise a dict
        with "answer" (the letter A-D at `true_statement_pos`) and "options"
        (the list of four option strings).
    """
    distractor_pool = (
        coalesce(before_alternatives, tuple())
        + coalesce(nested_alternatives, tuple())
        + coalesce(after_alternatives, tuple())
    )
    options = list(distractor_pool)[:2]
    options.extend(["HUMAN_INPUT", true_statement])

    # Not enough alternatives to fill four slots.
    if len(options) < 4:
        return None

    if swap_first_two:
        swap(options, 0, 1)
    # Order matters: the placeholder is positioned first, then the true
    # statement, which may displace the placeholder.
    swap(options, human_input_pos, 2)
    swap(options, true_statement_pos, 3)

    answer_letter = ["A", "B", "C", "D"][true_statement_pos]
    return {"answer": answer_letter, "options": options}

In [66]:
# Only the columns needed for the pivot.
pivot_source = df[
    [
        "text_no",
        "true_statement",
        "nuclei_hash",
        "synonym_paraphrased_alternative_statement",
        "position",
    ]
]
# One row per (text_no, true_statement, nuclei_hash) and one column per
# distinct `position` value; each cell collects that group's alternative
# statements into a tuple.
grouped = pd.pivot_table(
    pivot_source,
    index=["text_no", "true_statement", "nuclei_hash"],
    columns="position",
    values="synonym_paraphrased_alternative_statement",
    aggfunc=tuple,
)
# Drop the "position" label from the column axis and turn the index levels
# back into ordinary columns.
grouped.columns.name = None
grouped = grouped.reset_index()

In [68]:
# Draw the per-exercise layout parameters consumed by generate_rce().
# A dedicated, seeded generator is used so that re-running the notebook
# reproduces the same option order and answer keys (the global, unseeded
# np.random state gave a different layout on every run).
RANDOM_SEED = 0
layout_rng = np.random.RandomState(RANDOM_SEED)
# Coin flip: whether the two alternatives trade places.
grouped["swap_first_two"] = layout_rng.rand(len(grouped)) < 0.5
# Target index for the "HUMAN_INPUT" placeholder, drawn from 0..2.
grouped["human_input_pos"] = layout_rng.choice(range(3), len(grouped))
# Target index for the true statement, drawn from 0..3.
grouped["true_statement_pos"] = layout_rng.choice(range(4), len(grouped))

In [69]:
# For every true statement keep the first row of `df` that mentions it, which
# yields one paraphrase per statement.
paraphrased_true = df[
    ["true_statement", "synonym_paraphrased_true_statement"]
].groupby("true_statement").head(1)
# Add the paraphrase as a new column of `grouped`.
grouped = pd.merge(grouped, paraphrased_true, on="true_statement")

In [71]:
# Build one exercise per row; rows with too few alternatives get None
# (see generate_rce).
grouped["rce"] = grouped.apply(
    lambda row: generate_rce(
        true_statement=row.synonym_paraphrased_true_statement,
        before_alternatives=row.before,
        nested_alternatives=row.nested,
        after_alternatives=row.after,
        swap_first_two=row.swap_first_two,
        human_input_pos=row.human_input_pos,
        true_statement_pos=row.true_statement_pos,
    ),
    axis=1,
)

In [72]:
# Inspect which columns `grouped` carries after the pivot, the merge and the
# `rce` computation.
grouped.columns

Index(['text_no', 'true_statement', 'nuclei_hash', 'after', 'before', 'nested',
       'swap_first_two', 'human_input_pos', 'true_statement_pos',
       'synonym_paraphrased_true_statement', 'rce'],
      dtype='object')

In [74]:
# First reason recorded for each true statement ...
first_reason = df.groupby(["true_statement"]).reason.first().reset_index()
# ... paired with that reason's score.
scored_reason = pd.merge(first_reason, reason_scores, on="reason")
# Keep only rows for which an exercise could be built, then attach
# reason and score.
merged = pd.merge(
    grouped.loc[grouped.rce.notnull()],
    scored_reason,
    on=["true_statement"],
)

In [75]:
# Inspect the columns of `merged` (the `grouped` columns plus `reason` and `score`).
merged.columns

Index(['text_no', 'true_statement', 'nuclei_hash', 'after', 'before', 'nested',
       'swap_first_two', 'human_input_pos', 'true_statement_pos',
       'synonym_paraphrased_true_statement', 'rce', 'reason', 'score'],
      dtype='object')

In [76]:
# Order candidate exercises from highest to lowest score.
rce_ranked = merged[["text_no", "nuclei_hash", "rce", "score"]].sort_values(
    by="score", ascending=False
)
# head(1) keeps the first (top-scored after the sort) row of every
# (text_no, nuclei_hash) pair. reset_index() without drop=True also keeps the
# previous row labels as an extra "index" column.
rce_df = rce_ranked.groupby(["text_no", "nuclei_hash"]).head(1).reset_index()

In [77]:
def collect_rce(rce_group):
    """Split a sequence of exercises into parallel answer and option lists.

    Args:
        rce_group: iterable of dicts, each with an "answer" and an "options"
            entry (as produced by generate_rce).

    Returns:
        A dict with "generated_answers" and "generated_options"; the i-th
        entries of both lists come from the i-th exercise.
    """
    collected = {"generated_answers": [], "generated_options": []}
    for exercise in rce_group:
        collected["generated_answers"].append(exercise["answer"])
        collected["generated_options"].append(exercise["options"])
    return collected

In [79]:
# Create the output directory (and any missing parents); an existing
# directory is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [80]:
# Write one `<text_no>.rce.txt` JSON file per text. Each file holds the
# original RACE entry and the generated exercises; exercises are taken in
# descending score order and capped at the number of original RACE answers.
rce_by_score = rce_df.sort_values(by="score", ascending=False)
for text_no, group in rce_by_score.groupby(by="text_no"):
    race = load_race(text_no)
    race_n_exercises = len(race["answers"])
    # Minimum number of generated exercises required for a text to be
    # exported. With a threshold of 1 every group passes;
    # min(2, race_n_exercises) was the alternative threshold considered.
    if len(group) < 1:
        continue
    selected_rce = list(group.iloc[:race_n_exercises].rce)
    output = {"race": race, "generated": collect_rce(selected_rce)}
    output_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    with open(output_path, "wt") as f:
        json.dump(output, f)

In [81]:
def rce_to_str(text_no):
    """Render the exported exercises of one text as human-readable text.

    Reads OUTPUT_DIR/<text_no>.rce.txt and produces the article followed by
    every generated question with its options lettered A-D and its answer.

    Args:
        text_no: identifier of the text whose `.rce.txt` file is rendered.

    Returns:
        The rendered text as a single string.
    """
    rce_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    with open(rce_path, "rt") as rce_file:
        rce = json.load(rce_file)

    pieces = [
        f"TEXT No.{text_no}:\n\n{rce['race']['article']}\n\n",
        "GENERATED QUESTIONS:\n\n",
    ]
    generated = rce["generated"]
    for number, options in enumerate(generated["generated_options"], start=1):
        answer = generated["generated_answers"][number - 1]
        lettered = []
        for position, option in enumerate(options):
            letter = ["A", "B", "C", "D"][position]
            lettered.append(f"{letter}. {option}")
        # The empty strings become blank lines once everything is joined.
        pieces.extend(
            [
                f"{number}. Which of the following is true?\n",
                "\n".join(lettered),
                "",
                f"Answer: {answer}",
                "",
            ]
        )
    return "\n".join(pieces)

In [82]:
# Count how many questions ended up in each exported file.
result = []
for text_no in rce_df.text_no.unique():
    file_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    # Texts without an exported file are left out of the statistics.
    if not os.path.exists(file_path):
        continue
    with open(file_path, "rt") as f:
        rce = json.load(f)
    n_questions = len(rce["generated"]["generated_answers"])
    result.append((text_no, n_questions))
rce_stats_df = pd.DataFrame(result, columns=["text_no", "n_questions"])

In [85]:
# Distribution of the number of generated questions per text.
rce_stats_df.n_questions.value_counts()

1    53
2     8
4     1
3     1
Name: n_questions, dtype: int64

In [83]:
# Render only the texts that received at least two generated questions.
multi_question_texts = rce_stats_df.loc[rce_stats_df.n_questions >= 2, "text_no"]
rce_strings = list(map(rce_to_str, multi_question_texts))

In [84]:
# Save all rendered texts into a single file, separated by three newlines.
with open(os.path.join(OUTPUT_DIR, "rce-2+q.txt"), "wt") as f:
    f.write("\n\n\n".join(rce_strings))

In [91]:
# Preview the rendered output for text 785.
print(rce_to_str(785))

TEXT No.785:

Today is the fifth day of August. It is Judy's birthday. When she comes back home from school, she sees a card on the table. It says, "There's a present   for you, Judy. Look for it in your bedroom." Judy runs into her bedroom. Her parents are looking at her and _ . On the chair she sees a red box. She thinks her present must be in it. She opens it, and there is a piece of paper in it. She reads it, "Dear Judy, I'm your present. My first letter is in the word 'bag', but not in 'age'. My second letter is in 'like', but not in 'lake'. My third letter is in "know", but not in 'now'. And you can find my last letter in both 'desk' and 'get'. What am I?" Judy thinks for a while and says, "Aha, I know. But where is it?" Her father tells her it is in her study.
What is it? Do you know?
. (5)


GENERATED QUESTIONS:


1. Which of the following is true?

A. She sees a red box, and she thinks her present must be in it.
B. She sees a red box, and Judy runs into her bedroom.
C. HUMAN_I