In [22]:
import json
import os

In [83]:
import nltk.tokenize
import numpy as np
import pandas as pd

In [147]:
# Run configuration: the alternatives workbook to read, the RACE split to
# process, the RACE root directory, and the directory the generated files go to.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from one configurable base directory.
ALTERNATIVES_XLSX_PATH = (
    "/Users/YK/mt/project/statements_3/prefiltered_alternatives_train-middle_3bb56a8a.xlsx"
)
RACE_PART = "train/middle"
RACE_DIR = "/Users/YK/mt/RACE"
OUTPUT_DIR = os.path.join("/Users/YK/mt/project/generated_rce/", RACE_PART)

In [40]:
# Read the header-less ranking table, name its first two positional columns,
# and convert the score strings to floats after turning any "," into ".".
reason_scores = (
    pd.read_csv("../ranking.csv", header=None)
    .rename(columns={0: "reason", 1: "score"})
    .assign(
        score=lambda table: table["score"]
        .map(lambda raw: raw.replace(",", "."))
        .astype(float)
    )
)

In [41]:
# Load the alternatives workbook into a DataFrame.
df = pd.read_excel(ALTERNATIVES_XLSX_PATH)

In [42]:
# Sanity check: reasons that have a score but never occur in the workbook.
set(reason_scores.reason) - set(df["reason"])

{"Nucleus starts with 'if' and its left subrelation is not 'Condition'.",
 "Satellite starts with 'so'.",
 "Satellite's nucleus starts with 'but'."}

In [43]:
# Sanity check: reasons that occur in the workbook but have no score entry.
set(df["reason"]) - set(reason_scores.reason)

{'Common pattern (Condition-Explanation).',
 'Common pattern (Elaboration-Joint).'}

In [44]:
# Attach each reason's score. pd.merge defaults to an inner join, so rows whose
# reason has no entry in reason_scores are dropped from df.
# NOTE(review): this rebinds `df`, so the cell is not idempotent; re-running it
# merges the scores onto the already-merged frame. Re-run from read_excel.
df = pd.merge(df, reason_scores, on="reason")

In [131]:
def load_race(text_no):
    """Load one RACE text file and return its parsed JSON content.

    The file is looked up as ``<RACE_DIR>/<RACE_PART>/<text_no>.txt``.

    Args:
        text_no: Identifier used as the file stem.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    race_path = os.path.join(RACE_DIR, RACE_PART, f"{text_no}.txt")
    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # platform's locale encoding (assumes the RACE files are UTF-8 — confirm).
    with open(race_path, "rt", encoding="utf-8") as f:
        return json.load(f)

In [118]:
def swap(array, i_1, i_2):
    """Exchange, in place, the items stored at indices ``i_1`` and ``i_2``.

    Nothing is returned; ``array`` itself is modified.
    """
    array[i_1], array[i_2] = array[i_2], array[i_1]
    
    
def coalesce(value, default_value):
    """Return ``value`` unless it is a missing scalar, else ``default_value``.

    Only scalars are tested for missingness (``None``, ``NaN``, ``NaT``,
    ``pd.NA``). Containers such as tuples, lists or arrays are returned
    unchanged: ``pd.isnull`` answers element-wise for lists and arrays, which
    cannot be used as a single truth value and would raise ``ValueError``.

    Args:
        value: The candidate value.
        default_value: Returned when ``value`` is a missing scalar.

    Returns:
        ``default_value`` if ``value`` is a missing scalar, otherwise ``value``.
    """
    if pd.api.types.is_scalar(value) and pd.isnull(value):
        return default_value
    return value
    
    
def generate_rce(
    true_statement,
    before_alternatives,
    nested_alternatives,
    after_alternatives,
    swap_first_two,
    human_input_pos,
    true_statement_pos
):
    """Assemble one four-option exercise around ``true_statement``.

    The three alternative groups (each a tuple, or a missing value that is
    treated as empty) are concatenated in the order before, nested, after, and
    the first two entries are kept. The placeholder ``"HUMAN_INPUT"`` and the
    true statement complete the four options.

    Args:
        true_statement: The correct option.
        before_alternatives, nested_alternatives, after_alternatives:
            Candidate alternatives; missing values count as no candidates.
        swap_first_two: When truthy, the two kept alternatives trade places.
        human_input_pos: Index the placeholder is swapped into (from index 2).
        true_statement_pos: Index the true statement is swapped into (from
            index 3). Whatever occupied that index moves to index 3, so the
            placeholder ends up last when both positions coincide.

    Returns:
        ``None`` when fewer than two alternatives are available; otherwise a
        dict with ``"answer"`` (letter of the true statement) and ``"options"``.
    """
    distractors = (
        coalesce(before_alternatives, tuple())
        + coalesce(nested_alternatives, tuple())
        + coalesce(after_alternatives, tuple())
    )
    options = [*distractors[:2], "HUMAN_INPUT", true_statement]

    # Not enough alternatives to fill all four slots.
    if len(options) < 4:
        return None

    if swap_first_two:
        swap(options, 0, 1)
    swap(options, human_input_pos, 2)
    swap(options, true_statement_pos, 3)

    answer_letters = ("A", "B", "C", "D")
    return {
        "answer": answer_letters[true_statement_pos],
        "options": options
    }

In [119]:
# One row per (text_no, true_statement); one column per distinct `position`
# value, each cell holding a tuple of the alternative statements at that
# position. The columns-axis name is cleared and the index turned back into
# ordinary columns.
grouped = (
    pd.pivot_table(
        df.loc[:, ["text_no", "true_statement", "alternative_statement", "position"]],
        index=["text_no", "true_statement"],
        columns="position",
        values="alternative_statement",
        aggfunc=tuple,
    )
    .rename_axis(None, axis="columns")
    .reset_index()
)

In [120]:
# Draw the per-exercise layout from a seeded generator so that
# Restart & Run All reproduces the same option order and answer letters.
LAYOUT_SEED = 42
layout_rng = np.random.RandomState(LAYOUT_SEED)
n_groups = len(grouped)

# Whether the two alternative statements trade places (probability 0.5).
grouped["swap_first_two"] = layout_rng.rand(n_groups) < 0.5
# Index in 0..2 that the "HUMAN_INPUT" placeholder is swapped into.
grouped["human_input_pos"] = layout_rng.randint(0, 3, size=n_groups)
# Index in 0..3 that the true statement is swapped into.
grouped["true_statement_pos"] = layout_rng.randint(0, 4, size=n_groups)

In [121]:
# Build one exercise per row; rows without enough alternatives yield None.
grouped["rce"] = grouped.apply(
    lambda row: generate_rce(
        true_statement=row.true_statement,
        before_alternatives=row.before,
        nested_alternatives=row.nested,
        after_alternatives=row.after,
        swap_first_two=row.swap_first_two,
        human_input_pos=row.human_input_pos,
        true_statement_pos=row.true_statement_pos,
    ),
    axis=1,
)

In [140]:
# Keep only rows for which an exercise was generated and attach the score of
# the first reason recorded for each true statement.
merged = grouped.loc[grouped.rce.notnull()].merge(
    df.groupby("true_statement")
    .reason.first()
    .reset_index()
    .merge(reason_scores, on="reason"),
    on="true_statement",
)

In [141]:
def collect_rce(rce_group):
    """Split a sequence of exercises into parallel answer and option lists.

    Args:
        rce_group: Iterable of dicts, each with ``"answer"`` and ``"options"``.

    Returns:
        A dict with ``"generated_answers"`` and ``"generated_options"``, both
        lists in the iteration order of ``rce_group``.
    """
    # Single pass so that one-shot iterators are consumed only once.
    pairs = [(item["answer"], item["options"]) for item in rce_group]
    return {
        "generated_answers": [answer for answer, _ in pairs],
        "generated_options": [options for _, options in pairs]
    }

In [148]:
# Create the output directory (including parents); no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [151]:
# For every text, write the original RACE record together with its
# highest-scoring generated exercises, capped at the number of RACE answers.
ranked = merged.sort_values(by="score", ascending=False)
for text_no, group in ranked.groupby(by="text_no"):
    race = load_race(text_no)
    race_n_exercises = len(race["answers"])
    # Groups from groupby are never empty, so this guard never triggers; the
    # original carried a commented-out threshold: min(2, race_n_exercises).
    if len(group) < 1:
        continue
    top_rces = group.iloc[:race_n_exercises].rce.tolist()
    # Build the payload before opening the file so that a failure here does
    # not leave an empty output file behind.
    output = {
        "race": race,
        "generated": collect_rce(top_rces)
    }
    output_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    with open(output_path, "wt") as f:
        json.dump(output, f)