In [1]:
import collections
import enum
import json
import os
import random

In [24]:
import nltk.tokenize

import pandas as pd

In [27]:
import import_ipynb
import aux.defs
import aux.relation_extraction
import aux.utils
import aux.nlp
import preparation

%run explanation_04.ipynb

In [4]:
STATEMENTS_DIR = "/Users/YK/mt/project/statements_2/"
RACE_PART = "train/middle"
RACE_DIR = "/Users/YK/mt/RACE"
PARSED_RACE_DIR = "/Users/YK/mt/parsed/race"

In [52]:
class Position(enum.Enum):
    BEFORE = "before",
    NESTED = "nested",
    AFTER = "after"

    
Alternative = collections.namedtuple(
    "Alternative",
    [
        "true_statement", 
        "alternative_statement",
        "position", 
        "distance_words",
        "distance_sentences",
        "sn_length",
        "sn_length_relative_difference"
    ]
)

In [6]:
def load_statements(directory, subdirectories, text_no):
    statements = {}
    for subdirectory in subdirectories:
        file_path = os.path.join(
            directory, subdirectory, RACE_PART, f"{text_no}.txt.tree"
        )
        if os.path.exists(file_path): 
            with open(file_path, "rt") as f:
                statements[subdirectory] = json.load(f)
    return statements
    

def load_relations(text_no, directory):
    text, relations, _ = aux.relation_extraction.load_relations(
        os.path.join(directory, f"{text_no}.txt.tree")
    )
    return text, relations # {t: relations[t] for t in types if t in relations}

In [7]:
statements_subdirectories = [
    f for f in os.listdir(STATEMENTS_DIR) if os.path.isdir(os.path.join(STATEMENTS_DIR, f))
]

In [90]:
connective_map = {
    "explanation_01": " because ",
    "explanation_02": ". That is why ",
    "explanation_03": ". Moreover ",
    "explanation_05": " but ",
    "explanation_06": ". Also, ",
    "explanation_07": " and ",
    "explanation_08": " then ",
}


def get_n_words(text_span):
    return len(nltk.tokenize.word_tokenize(text_span))


def get_n_sentences(text_span):
    cnt = 0
    for c in text_span:
        if c in {'.', ',', '!'}:
            cnt += 1
    return cnt


def get_position_and_distance(statement, relation, text):
    if relation.right.end <= statement["left_boundary"]:
        return (
            Position.BEFORE,
            get_n_words(text[relation.right.end:statement["left_boundary"]]),
            get_n_sentences(text[relation.right.end:statement["left_boundary"]])
        )
    elif relation.left.start >= statement["right_boundary"]:
        return (
            Position.AFTER,
            get_n_words(text[statement["right_boundary"]:relation.left.start]),
            get_n_sentences(text[statement["right_boundary"]:relation.left.start])
        )
    else:
        return None, None, None

    
def create_alternative(statement, relation, true_sn_text_len, text, verbose=False):
    relation_info = preparation.get_info(relation, verbose)
    assert relation_info is not None
    if relation_info.satellite_info.relation is None:
        if verbose:
            print("Satellite is flat.")
        return None
    
    position, distance_words, distance_sentences = get_position_and_distance(
        statement, relation, text
    )
    if position is None:
        if verbose:
            print("The relation is nested within the relation of the true statement.")
        return None
    
    satellite_handling_result = preparation.Preprocessor.handle_satellite(
        text, relation_info.satellite_info, relation_info.nucleus_info.direction, verbose
    )
    if satellite_handling_result is None:
        if verbose:
            print("Satellite preprocessing was unsuccessful.")
        return None
    
    processed_sn_text = aux.nlp.take_first_sentence_and_remove_leading_words(
        satellite_handling_result.preparation_result.prepared_text, verbose
    )
    prepared_sn_text = utils.lowercase_first_letter(
        processed_sn_text if processed_sn_text is not None 
            else info.satellite_preparation_result.prepared_text
    )
    sn_text_len = get_n_words(prepared_sn_text)
    
    true_statement_nucleus = statement["nucleus"]
    
    if statement["rule"] == RuleExplanation04.name:
        alternative_text = RuleExplanation04.create_statement_text(
           true_statement_nucleus , prepared_sn_text
        )
    else:
        if statement["rule"] in connective_map:
            connective = connective_map[statement["rule"]]
            alternative_text = f"{true_statement_nucleus}{connective}{prepared_sn_text}"
        else:
            alternative_text = None
    
    if alternative_text is None:
        return None
    else:
        return Alternative(
            true_statement=statement["statement_text"],
            alternative_statement=alternative_text,
            position=position,
            distance_words=distance_words,
            distance_sentences=distance_sentences,
            sn_length=sn_text_len,
            sn_length_relative_difference=(sn_text_len / true_sn_text_len - 1)
        )

In [91]:
text_no = 2111

statements = load_statements(STATEMENTS_DIR, statements_subdirectories, text_no)
text, relation_map = load_relations(
    text_no, os.path.join(PARSED_RACE_DIR, RACE_PART)
)

In [92]:
tmp_relation = relation_map["Elaboration"][8]
tmp_statement = statements["explanation_01"][0]
true_sn_text_len = get_n_words(tmp_statement["satellite_nucleus"])

In [93]:
create_alternative(tmp_statement, tmp_relation, true_sn_text_len, text, True)

Nucleus is on the left.
Satellite's nucleus is on the left.
Nuclei proximity is NucleusProximity.NEAR
Satellite's (left) nucleus contains '.', '!', '?', or ';'.
Will use the whole segment.
Text extracted from the satellite:
The Japanese called it " kaizen ", which means " improvement ". Maurer studied the idea and did some experiments with it.  " Kaizen " could possibly help people succeed in doing everything.
Taking the first sentence and removing leading words:
-- syntactic parsing result
 (ROOT
  (S
    (VP
      (NP
        (NP
          (NP (DT The) (JJ Japanese))
          (VP (VBN called)
            (NP (PRP it) ('' ''))))
        (PRN (FW kaizen) ('' '')
          (S (, ,)
            (NP (WDT which))
            (VP (VBZ means))
            ('' '')))
        (NP (NN improvement) ('' ''))))
    (. .)))
The Japanese called it " kaizen ", which means " improvement ". Maurer studied the idea and did some experiments with it.  " Kaizen " could possibly help people succeed in doing

Alternative(true_statement='But as you start to write down your hopes for the new year, you think about the last year because you excitedly write down all the changes you are going to make, but by the end of January those ideas get lost in your busy life.', alternative_statement="But as you start to write down your hopes for the new year, you think about the last year because which means'' improvement''.", position=<Position.AFTER: 'after'>, distance_words=81, distance_sentences=9, sn_length=6, sn_length_relative_difference=-0.7857142857142857)

In [None]:
for relation_type, relations in relation_map.items():
    for relation in relations:
        position = 

In [None]:
def generate_alternatives(text_no):
    pass

In [4]:
rule_names = [
    file_name for file_name in os.listdir(STATEMENTS_DIR) 
        if file_name[0] != '.' and os.path.isdir(os.path.join(STATEMENTS_DIR, file_name))
]

In [5]:
all_statements = []

for rule_name in rule_names:
    directory = os.path.join(STATEMENTS_DIR, rule_name, RACE_PART)
    for file_name in os.listdir(directory):
        with open(os.path.join(directory, file_name), "rt") as f:
            file_no_str = file_name.split(".", 1)[0]
#             text_file = LINK_TEMPLATE.format(
#                 link=f'file://{os.path.join(RACE_DIR, RACE_PART, file_no_str + ".txt")}',
#                 text=file_no_str + ".txt"
#             )
#             tree_file = LINK_TEMPLATE.format(
#                 link=f'file://{os.path.join(PARSED_RACE_DIR, RACE_PART, file_no_str + ".txt.tree")}',
#                 text=file_no_str + ".txt.tree"
#             )
            statements = json.load(f)
            for statement in statements:
                statement.update(
#                     {"text_file": text_file, "tree_file": tree_file}
                    {"text_number": file_no_str}
                )
            all_statements.extend(statements)

In [6]:
df = pd.DataFrame(all_statements)
df["reason_number"] = df.reason.apply(lambda r: r[0])
df.reason = df.reason.apply(lambda r: r[1])

In [7]:
df[
    [
        "rule", 
        "reason", 
        "statement_text", 
        "nucleus", 
        "satellite_nucleus", 
        "text_number",
        "nucleus_proximity",
        "left_boundary",
        "right_boundary",
        "reason_number"
    ]
].sort_values(
    by=["rule", "reason_number"]
).drop(
    "reason_number", 1
).to_excel(
    os.path.join(
        STATEMENTS_DIR, f"{RACE_PART.replace('/', '-')}_{random.randint(0, 2**32):x}.xlsx"
    ),
    index=False
)