In [1]:
import json
import os

In [2]:
import nltk.tokenize
import numpy as np
import pandas as pd

In [3]:
# RACE corpus root and the split handled by this notebook; load_race() joins
# both with "<text_no>.txt".
RACE_DIR = "/Users/YK/mt/RACE"
RACE_PART = "test/middle"

# Workbook that is read into `df` (paraphrased true/alternative statements).
ALTERNATIVES_XLSX_PATH = "/Users/YK/mt/project/statements_7/paraphrased_test-middle_1021ac63.xlsx"

# Every generated file is written below this per-split directory.
OUTPUT_DIR = os.path.join("/Users/YK/mt/project/generated_rce/", RACE_PART)

In [4]:
# Reason -> score table. The CSV is read without a header row, so the first two
# positional columns are named explicitly.
reason_scores = pd.read_csv(
    "../ranking.csv", header=None
).rename(
    columns={0: "reason", 1: "score"}
)
# Scores may be written with a decimal comma ("0,75"). Converting through `str`
# first keeps this working when pandas has already parsed the column as numeric;
# calling `s.replace` on each raw value raised AttributeError in that case.
reason_scores["score"] = (
    reason_scores["score"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

In [5]:
# Load the workbook referenced by ALTERNATIVES_XLSX_PATH into the working frame.
df = pd.read_excel(ALTERNATIVES_XLSX_PATH)

In [6]:
# Reasons that have a score but never occur in the workbook.
set(reason_scores.reason) - set(df["reason"])

{'Common pattern ( -Explanation->Condition).',
 'Common pattern (Elaboration-JOINT).'}

In [7]:
# Reasons that occur in the workbook but have no score.
set(df["reason"]) - set(reason_scores.reason)

{'Common pattern (Condition-Explanation).', 'Common pattern (Nucleus=NN- ).'}

In [8]:
# Attach the score of each row's reason. The default inner join keeps only rows
# whose "reason" also appears in `reason_scores`.
df = df.merge(reason_scores, on="reason")

In [9]:
def load_race(text_no):
    """Load one RACE entry as parsed JSON.

    Args:
        text_no: Number of the text; the file read is
            ``<RACE_DIR>/<RACE_PART>/<text_no>.txt``.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    race_path = os.path.join(RACE_DIR, RACE_PART, f"{text_no}.txt")
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding when decoding the file.
    with open(race_path, "rt", encoding="utf-8") as f:
        return json.load(f)

In [10]:
def swap(array, i_1, i_2):
    """Exchange, in place, the items stored at indices `i_1` and `i_2`.

    Nothing is returned; `array` itself is modified.
    """
    array[i_1], array[i_2] = array[i_2], array[i_1]
    
    
def coalesce(value, default_value):
    """Return `default_value` when `pd.isnull(value)` is true, else `value`."""
    return default_value if pd.isnull(value) else value
    
    
def generate_rce(
    true_statement,
    before_alternatives,
    nested_alternatives,
    after_alternatives,
    swap_first_two,
    human_input_pos,
    true_statement_pos
):
    """Assemble one four-option exercise around `true_statement`.

    The three alternative groups are concatenated in the order before, nested,
    after (a null group counts as empty) and the first two entries are used as
    options, followed by the "HUMAN_INPUT" placeholder and the true statement.

    Args:
        true_statement: Text of the correct option.
        before_alternatives: Tuple of alternatives, or a null value.
        nested_alternatives: Tuple of alternatives, or a null value.
        after_alternatives: Tuple of alternatives, or a null value.
        swap_first_two: When truthy, the first two options trade places.
        human_input_pos: Index the placeholder is swapped to (from index 2).
        true_statement_pos: Index the true statement is swapped to (from
            index 3). If it equals `human_input_pos`, the placeholder ends up
            at index 3.

    Returns:
        None when fewer than two alternatives are available; otherwise a dict
        with "answer" (the letter A-D of the true statement) and "options"
        (the four option strings).
    """
    alternatives = (
        coalesce(before_alternatives, tuple())
        + coalesce(nested_alternatives, tuple())
        + coalesce(after_alternatives, tuple())
    )
    options = list(alternatives)[:2]
    options.append("HUMAN_INPUT")
    options.append(true_statement)

    # Fewer than two alternatives: a four-option exercise cannot be built.
    if len(options) < 4:
        return None

    if swap_first_two:
        swap(options, 0, 1)
    swap(options, human_input_pos, 2)
    swap(options, true_statement_pos, 3)
    letters = ["A", "B", "C", "D"]
    return {
        "answer": letters[true_statement_pos],
        "options": options
    }

In [11]:
# One row per (text_no, true_statement, nuclei_hash); each distinct "position"
# value becomes a column holding a tuple of the alternative statements.
pivot_columns = [
    "text_no",
    "true_statement",
    "nuclei_hash",
    "synonym_paraphrased_alternative_statement",
    "position",
]
grouped = pd.pivot_table(
    df[pivot_columns],
    values="synonym_paraphrased_alternative_statement",
    index=["text_no", "true_statement", "nuclei_hash"],
    columns="position",
    aggfunc=tuple,
)
# Drop the "position" label from the column axis, then turn the index levels
# back into ordinary columns.
grouped.columns.name = None
grouped = grouped.reset_index()

In [12]:
# Fixed seed so that "Restart & Run All" reproduces the same option layout;
# change RANDOM_SEED to draw a different one.
RANDOM_SEED = 42
layout_rng = np.random.RandomState(RANDOM_SEED)

# Per-row layout decisions that are later passed to generate_rce().
grouped["swap_first_two"] = layout_rng.rand(len(grouped)) < 0.5
grouped["human_input_pos"] = layout_rng.choice(range(3), len(grouped))
grouped["true_statement_pos"] = layout_rng.choice(range(4), len(grouped))

In [13]:
# First paraphrase found for every true statement.
true_paraphrases = (
    df[["true_statement", "synonym_paraphrased_true_statement"]]
    .groupby("true_statement")
    .head(1)
)
grouped = grouped.merge(true_paraphrases, on="true_statement")

In [14]:
# Build the exercise for every row; rows for which generate_rce() returns None
# are filtered out when `merged` is built.
grouped["rce"] = grouped.apply(
    lambda row: generate_rce(
        true_statement=row.synonym_paraphrased_true_statement,
        before_alternatives=row.before,
        nested_alternatives=row.nested,
        after_alternatives=row.after,
        swap_first_two=row.swap_first_two,
        human_input_pos=row.human_input_pos,
        true_statement_pos=row.true_statement_pos,
    ),
    axis=1,
)

In [15]:
# Inspect the columns available after generating the exercises.
grouped.columns

Index(['text_no', 'true_statement', 'nuclei_hash', 'after', 'before', 'nested',
       'swap_first_two', 'human_input_pos', 'true_statement_pos',
       'synonym_paraphrased_true_statement', 'rce'],
      dtype='object')

In [16]:
# Score of the first reason recorded for every true statement.
first_reasons = df.groupby(["true_statement"]).reason.first().reset_index()
statement_scores = pd.merge(first_reasons, reason_scores, on="reason")

# Keep only the rows for which an exercise was generated and attach the score.
rows_with_exercise = grouped.loc[~grouped.rce.isnull()]
merged = pd.merge(rows_with_exercise, statement_scores, on=["true_statement"])

In [17]:
# Inspect the columns of the merged frame ("reason" and "score" were added).
merged.columns

Index(['text_no', 'true_statement', 'nuclei_hash', 'after', 'before', 'nested',
       'swap_first_two', 'human_input_pos', 'true_statement_pos',
       'synonym_paraphrased_true_statement', 'rce', 'reason', 'score'],
      dtype='object')

In [18]:
# Best-scoring exercise per (text_no, nuclei_hash): order by descending score,
# then keep the first row of every group.
score_ranked = merged[["text_no", "nuclei_hash", "rce", "score"]].sort_values(
    by="score", ascending=False
)
rce_df = score_ranked.groupby(["text_no", "nuclei_hash"]).head(1).reset_index()

In [19]:
def collect_rce(rce_group):
    """Split exercise dicts into parallel answer and option lists.

    Args:
        rce_group: Iterable of dicts with the keys "answer" and "options".

    Returns:
        A dict with "generated_answers" and "generated_options", each a list
        in the same order as `rce_group`.
    """
    # Single pass, so one-shot iterables are handled as well.
    pairs = [(rce["answer"], rce["options"]) for rce in rce_group]
    return {
        "generated_answers": [answer for answer, _ in pairs],
        "generated_options": [options for _, options in pairs]
    }

In [20]:
# Make sure the output directory exists before any file is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [21]:
# Write one "<text_no>.rce.txt" JSON file per text, holding the original RACE
# entry together with the generated exercises (best score first).
score_ordered = rce_df.sort_values(by="score", ascending=False)
for text_no, group in score_ordered.groupby(by="text_no"):
    race = load_race(text_no)
    race_n_exercises = len(race["answers"])
    if len(group) < 1: # min(2, race_n_exercises):
        continue
    # Never keep more generated exercises than the RACE text has answers.
    selected_rce = list(group.iloc[:race_n_exercises].rce)
    output = {
        "race": race,
        "generated": collect_rce(selected_rce)
    }
    output_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    with open(output_path, "wt") as f:
        json.dump(output, f)

In [22]:
def rce_to_str(text_no):
    """Render the saved exercises of one text as plain text.

    Reads ``<OUTPUT_DIR>/<text_no>.rce.txt`` and returns a string with the
    article, followed by every generated question: its options lettered A-D
    and the letter of the correct answer.
    """
    rce_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    with open(rce_path, "rt") as f:
        rce = json.load(f)

    letters = ['A', 'B', 'C', 'D']
    pieces = [
        f"TEXT No.{text_no}:\n\n{rce['race']['article']}\n\n",
        "GENERATED QUESTIONS:\n\n",
    ]
    for i, options in enumerate(rce["generated"]["generated_options"]):
        answer = rce['generated']['generated_answers'][i]
        lettered_options = [
            f"{letters[j]}. {option}" for j, option in enumerate(options)
        ]
        # The empty strings become blank lines once everything is joined.
        pieces.extend([
            f"{i + 1}. Which of the following is true?\n",
            "\n".join(lettered_options),
            "",
            f"Answer: {answer}",
            "",
        ])
    return "\n".join(pieces)

In [23]:
# Count the generated questions stored in each text's .rce.txt file.
result = []
for text_no in rce_df.text_no.unique():
    file_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    if not os.path.exists(file_path):
        continue
    with open(file_path, "rt") as f:
        rce = json.load(f)
    n_generated = len(rce["generated"]["generated_answers"])
    result.append((text_no, n_generated))
rce_stats_df = pd.DataFrame(result, columns=["text_no", "n_questions"])

In [24]:
# How many texts ended up with 1, 2, 3, ... generated questions.
rce_stats_df.n_questions.value_counts()

1    53
2    10
3     2
Name: n_questions, dtype: int64

#### Generating xlsx files

In [26]:
# Texts ordered by number of generated questions (most first); the first 12
# form the first tier, the remainder the second tier.
by_question_count = rce_stats_df.sort_values(by="n_questions", ascending=False)
ordered_text_numbers = list(by_question_count.text_no)
first_tier_text_numbers, second_tier_text_number = (
    ordered_text_numbers[:12],
    ordered_text_numbers[12:],
)

In [27]:
# Inspect the first-tier text numbers.
first_tier_text_numbers

[2518, 3797, 7312, 1675, 1597, 6673, 3010, 2084, 7085, 4227, 2229, 432]

In [36]:
def create_xlsx_rce(text_no):
    """Build the first-column cells describing one text and its questions.

    Reads ``<OUTPUT_DIR>/<text_no>.rce.txt`` and returns a list of strings:
    the text number, the article, a blank cell, and then for every generated
    question a "Question No." header, its lettered options and a blank cell.
    The correct answers are not included.
    """
    rce_path = os.path.join(OUTPUT_DIR, f"{text_no}.rce.txt")
    with open(rce_path, "rt") as f:
        rce = json.load(f)

    letters = ['A', 'B', 'C', 'D']
    rows = [f"Text No.{text_no}", rce["race"]["article"], ""]
    for question_no, options in enumerate(
        rce["generated"]["generated_options"], start=1
    ):
        rows.append(f"Question No.{question_no}")
        rows.extend(
            f"({letters[j]}) {option}" for j, option in enumerate(options)
        )
        rows.append("")
    return rows

In [49]:
def save_xlsx(text_numbers, output_dir, file_no):
    """Write the workbook ``<output_dir>/<file_no>.xlsx``.

    The first column ("-") holds the cells produced by create_xlsx_rce() for
    each entry of `text_numbers`, in order; the remaining columns are empty
    and named after the rating questions.
    """
    rating_columns = [
        "Is it true?",
        "Is it grammatically correct?",
        "How sure were you with the answer?",
        "How clear and concise is the statement?",
        "How sensible was the statement?",
        "How easy was it to answer the question?",
        "How important is the information asked in the statement?",
    ]
    rows = [
        row
        for text_no in text_numbers
        for row in create_xlsx_rce(text_no)
    ]
    sheet = pd.DataFrame(rows, columns=["-"])
    for column in rating_columns:
        sheet[column] = None
    sheet.to_excel(os.path.join(output_dir, f"{file_no}.xlsx"), index=False)

In [50]:
# The workbooks go into an "excel_files" sub-directory of OUTPUT_DIR.
xlsx_output_dir = os.path.join(OUTPUT_DIR, "excel_files")
os.makedirs(xlsx_output_dir, exist_ok=True)

In [53]:
# Files 1 and 2 use fixed lists of text numbers.
save_xlsx([2518, 3436, 2512], xlsx_output_dir, 1)
save_xlsx([3797, 2299, 357], xlsx_output_dir, 2)

# Files 3-12: one first-tier text followed by a run of three consecutive
# second-tier texts; the runs start at index 4 and advance by 3 per file.
for file_no in range(3, 13):
    second_tier_start = 4 + (file_no - 3) * 3
    file_text_numbers = (
        [first_tier_text_numbers[file_no - 1]]
        + second_tier_text_number[second_tier_start:second_tier_start + 3]
    )
    save_xlsx(file_text_numbers, xlsx_output_dir, file_no)

In [29]:
# Plain-text rendering of every text that has at least two generated questions.
rce_strings = list(
    map(
        rce_to_str,
        rce_stats_df.loc[rce_stats_df.n_questions >= 2].text_no,
    )
)

In [30]:
# Save all rendered texts in one file, separated by three newlines.
with open(os.path.join(OUTPUT_DIR, "rce-2+q.txt"), "wt") as f:
    f.write("\n\n\n".join(rce_strings))

In [25]:
# Example: show the rendered exercises of text 3797.
print(rce_to_str(3797))

TEXT No.3797:

Once a tiger was in a cage . Soon a good man went by. As soon as the tiger saw the man, the tiger began to cry. "Please! Please!" the tiger called. "Please, let me out." "No," said the good man. "If I do, you will eat me." "I will not eat you," the tiger said. "Please let me out."
The good man believed the tiger. He opened the door of the cage. The tiger jumped out. "How silly you are," the tiger laughed. "Now I am going to eat you." "Wait!" the man cried. "You ought not to eat me. Let us ask others what they think." "You may ask three others. " the tiger said.
The good man asked a tree. The tree said, "I give shade  . And yet I am cut down. Let the tiger eat you."
Next, the good man asked a bird. The bird said, "I hurt no one. Yet people hunt and kill me. Let the tiger eat you."
The last one that the good man asked was a road. The road said, "I don't care if the tiger eats you. People could not get along too well without me. Yet all day and all night people step on me w

In [None]:
# TODO: unpaired quotes need to be removed
# TODO: punctuation must be part of the nucleus, not of a connective