In [1]:
import collections
import enum
import hashlib
import json
import os
import random

In [2]:
import nltk.tokenize

import numpy as np
import pandas as pd

In [3]:
import import_ipynb
import aux.defs
import aux.relation_extraction
import aux.utils
import aux.nlp
import preparation

%run explanation_04.ipynb

importing Jupyter notebook from /Users/YK/mt/project/aux/defs.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/relation_extraction.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/utils.ipynb
importing Jupyter notebook from /Users/YK/mt/project/aux/nlp.ipynb
importing Jupyter notebook from preparation.ipynb
importing Jupyter notebook from rule_base.ipynb


In [4]:
# NOTE(review): all paths below are absolute paths on one machine; adjust them
# before running the notebook anywhere else.

# Root directory of the generated statements. Its subdirectories are scanned by
# load_statements(), and the Excel exports at the end are written here as well.
STATEMENTS_DIR = "/Users/YK/mt/project/statements_5/"
# Relative path of the RACE part being processed; it is appended to every
# per-text directory (statements, raw texts, parsed texts).
RACE_PART = "test/middle"
# Root of the raw RACE texts; "<RACE_DIR>/<RACE_PART>/*.txt" defines the text numbers.
RACE_DIR = "/Users/YK/mt/RACE"
# Root of the parsed texts; "<PARSED_RACE_DIR>/<RACE_PART>/<text_no>.txt.tree"
# is what load_relations() reads.
PARSED_RACE_DIR = "/Users/YK/mt/parsed/race"

In [5]:
class Position(enum.Enum):
    """Location of a candidate relation relative to a true statement's span.

    Assigned by ``get_position_and_distance``. The string values (not the enum
    members) are what ``create_alternative`` stores in the ``position`` field.
    """
    # The relation ends at or before the statement's left boundary.
    BEFORE = "before"
    # The relation is neither entirely before nor entirely after the statement
    # and does not straddle the statement's split point.
    NESTED = "nested"
    # The relation starts at or after the statement's right boundary.
    AFTER  = "after"

    
# One generated alternative (distractor candidate) for a true statement.
# Instances are built in create_alternative(); the field comments below describe
# what that function stores in each field.
Alternative = collections.namedtuple(
    "Alternative",
    [
        # statement["statement_text"] of the true statement.
        "true_statement",
        # MD5 hex digest of the true statement's nucleus + satellite nucleus.
        "nuclei_hash",
        # True nucleus + connective + the prepared text taken from another relation.
        "alternative_statement",
        # ``type`` attribute of the relation the alternative text was taken from.
        "relation_type",
        # String value of the Position enum member ("before" / "nested" / "after").
        "position", 
        # Word count of the text between the relation and the true statement.
        "distance_words",
        # Sentence-count estimate for that same stretch of text.
        "distance_sentences",
        # Word count of the alternative satellite text.
        "sn_length",
        # sn_length divided by the true satellite's word count, minus 1.
        "sn_length_relative_difference",
        # Jaccard distance between the token sets of true and alternative satellite.
        "jaccard_distance",
        # Edit distance between the true and alternative satellite strings.
        "edit_distance"
    ]
)

In [6]:
def load_statements(directory, subdirectories, text_no):
    """Read the statement files of one text, keyed by subdirectory name.

    For every name in ``subdirectories`` the file
    ``<directory>/<name>/<RACE_PART>/<text_no>.txt.tree`` is parsed as JSON if
    it exists. Subdirectories that have no file for this text are simply
    absent from the returned dict.
    """
    loaded = {}
    for name in subdirectories:
        candidate = os.path.join(directory, name, RACE_PART, f"{text_no}.txt.tree")
        if not os.path.exists(candidate):
            continue
        with open(candidate, "rt") as handle:
            loaded[name] = json.load(handle)
    return loaded
    

def load_relations(text_no, directory):
    """Load the text and relations stored in ``<directory>/<text_no>.txt.tree``.

    Returns a ``(text, relations)`` pair as produced by
    ``aux.relation_extraction.load_relations`` (its third return value is
    discarded). If the file does not exist, ``("", {})`` is returned instead.
    """
    tree_path = os.path.join(directory, f"{text_no}.txt.tree")
    if not os.path.exists(tree_path):
        return "", {}

    text, relations, _ = aux.relation_extraction.load_relations(tree_path)
    return text, relations

In [7]:
# Names of all directories directly inside STATEMENTS_DIR (plain files are
# skipped). Each one is later searched by load_statements() for per-text files.
statements_subdirectories = [
    f for f in os.listdir(STATEMENTS_DIR) if os.path.isdir(os.path.join(STATEMENTS_DIR, f))
]

In [12]:
def get_n_words(text_span):
    """Return how many tokens NLTK's ``word_tokenize`` produces for ``text_span``."""
    tokens = nltk.tokenize.word_tokenize(text_span)
    return len(tokens)


def get_n_sentences(text_span):
    """Estimate how many sentences end inside ``text_span``.

    The estimate is the number of sentence-terminating characters
    (``.``, ``?``, ``!``) in the span, so abbreviations or ellipses are
    over-counted. An empty span yields 0.

    The previous implementation counted commas instead of question marks,
    which inflated the count for every clause boundary and ignored questions.
    """
    sentence_terminators = {'.', '?', '!'}
    return sum(1 for c in text_span if c in sentence_terminators)


def get_position_and_distance(statement, relation, text, verbose=False):
    """Locate ``relation`` relative to ``statement`` and measure the gap between them.

    ``statement`` is a mapping with the character offsets ``left_boundary``,
    ``split_point`` and ``right_boundary``; ``relation`` exposes
    ``left.start`` and ``right.end`` offsets into ``text``.

    Returns ``(position, n_words, n_sentences)`` where the two counts describe
    the text between the relation and the statement (for BEFORE / AFTER) or
    between the relation and the statement's split point (for NESTED).
    A relation that straddles the split point yields ``(None, None, None)``.
    """
    def measured(position, gap):
        # All non-degenerate outcomes share the same tuple layout.
        return position, get_n_words(gap), get_n_sentences(gap)

    relation_end = relation.right.end
    if relation_end <= statement["left_boundary"]:
        gap = text[relation_end:statement["left_boundary"]]
        return measured(Position.BEFORE, gap)

    relation_start = relation.left.start
    if relation_start >= statement["right_boundary"]:
        gap = text[statement["right_boundary"]:relation_start]
        return measured(Position.AFTER, gap)

    split_point = statement["split_point"]
    if relation_start < split_point < relation_end:
        if verbose:
            print("The relation overlaps with the relation of the true statement.")
        return None, None, None

    # The relation sits on one side of the split point: measure towards it.
    if relation_end <= split_point:
        gap = text[relation_end:split_point]
    else:
        gap = text[split_point:relation_start]
    return measured(Position.NESTED, gap)

    
def get_jaccard_distance(phrase_1, phrase_2):
    """Return the Jaccard distance between the token sets of two phrases.

    Both phrases are tokenized with NLTK's ``word_tokenize`` and reduced to
    sets before being handed to ``nltk.jaccard_distance``.
    """
    vocabularies = [
        set(nltk.tokenize.word_tokenize(phrase)) for phrase in (phrase_1, phrase_2)
    ]
    return nltk.jaccard_distance(*vocabularies)


def get_edit_distance(phrase_1, phrase_2):
    """Return ``nltk.edit_distance`` of the two phrases.

    The phrases are passed through untokenized, exactly as received.
    """
    distance = nltk.edit_distance(phrase_1, phrase_2)
    return distance
    

# A candidate relation together with its location relative to a true statement:
# the Position member and the word / sentence distances returned by
# get_position_and_distance(). Built in filter_relations().
RelationData = collections.namedtuple(
    "RelationData",
    ["relation", "position", "distance_words", "distance_sentences"]
)


def get_k(relation_data_list, closest, k):
    """Pick up to ``k`` entries with the smallest or largest word distance.

    Args:
        relation_data_list: iterable of objects with a ``distance_words``
            attribute; it is not modified.
        closest: if true, return the ``k`` entries with the smallest
            ``distance_words``; otherwise the ``k`` with the largest.
        k: maximum number of entries to return.

    Returns:
        A new list, ordered by ascending ``distance_words``. It is empty when
        ``k`` is not positive or the input is empty.
    """
    if k <= 0:
        # ``lst[-0:]`` is the whole list, so without this guard asking for the
        # zero farthest entries would return every entry.
        return []
    ordered = sorted(relation_data_list, key=lambda rd: rd.distance_words)
    return ordered[:k] if closest else ordered[-k:]
        
        
def filter_relations(statement, relations, text, k=2):
    """Select the candidate relations to turn into alternatives for ``statement``.

    Every relation is located with ``get_position_and_distance``; relations for
    which no position is reported are dropped. From the remainder the result
    contains, in this order: the ``k`` closest BEFORE relations, the ``k``
    closest AFTER relations and the ``k`` farthest NESTED relations, each as a
    ``RelationData`` tuple.
    """
    by_position = collections.defaultdict(list)
    for candidate in relations:
        located = get_position_and_distance(statement, candidate, text)
        position, n_words, n_sentences = located
        if position is None:
            continue
        by_position[position].append(
            RelationData(candidate, position, n_words, n_sentences)
        )

    # (position, take the closest ones?) in the order the groups are emitted.
    selection_plan = (
        (Position.BEFORE, True),
        (Position.AFTER, True),
        (Position.NESTED, False),
    )
    selected = []
    for position, take_closest in selection_plan:
        selected += get_k(by_position[position], closest=take_closest, k=k)
    return selected
    

def compute_hash(string):
    """Return the hexadecimal MD5 digest of ``string`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(string.encode("utf-8"))
    return digest.hexdigest()


def create_alternative(statement, relation_data, true_sn_text_len, text, verbose=False):
    """Build an ``Alternative`` for ``statement`` from one candidate relation.

    The satellite of ``relation_data.relation`` is preprocessed via the
    ``preparation`` and ``aux.nlp`` helpers and appended to the true
    statement's nucleus and connective.

    Args:
        statement: mapping with the keys ``statement_text``, ``nucleus``,
            ``connective`` and ``satellite_nucleus``.
        relation_data: ``RelationData`` describing the candidate relation.
        true_sn_text_len: word count of the true satellite nucleus; used as
            the denominator of the relative length difference.
        text: the full text the relation offsets refer to.
        verbose: forwarded to the helpers; also enables diagnostic prints.

    Returns:
        The ``Alternative``, or ``None`` if the relation's satellite has no
        nested relation or any preprocessing step yields ``None``.
    """
    candidate = relation_data.relation
    info = preparation.get_info(candidate, verbose)
    assert info is not None

    satellite_info = info.satellite_info
    if satellite_info.relation is None:
        if verbose:
            print("Satellite is flat.")
        return None

    handled = preparation.Preprocessor.handle_satellite(
        text, satellite_info, info.nucleus_info.direction, verbose
    )
    if handled is None:
        if verbose:
            print("Satellite preprocessing was unsuccessful.")
        return None

    prepared_sn_text = aux.nlp.take_first_sentence_and_remove_leading_words(
        handled.preparation_result.prepared_text,
        lowercase_first_letter=True,
        verbose=verbose
    )
    if prepared_sn_text is None:
        return None

    sn_text_len = get_n_words(prepared_sn_text)
    true_statement_nucleus = statement["nucleus"]
    connective = statement["connective"]
    true_sn_text = statement["satellite_nucleus"]

    return Alternative(
        true_statement=statement["statement_text"],
        nuclei_hash=compute_hash(true_statement_nucleus + true_sn_text),
        alternative_statement=f"{true_statement_nucleus}{connective}{prepared_sn_text}",
        position=relation_data.position.value,
        relation_type=candidate.type,
        distance_words=relation_data.distance_words,
        distance_sentences=relation_data.distance_sentences,
        sn_length=sn_text_len,
        sn_length_relative_difference=(sn_text_len / true_sn_text_len - 1),
        jaccard_distance=get_jaccard_distance(true_sn_text, prepared_sn_text),
        edit_distance=get_edit_distance(true_sn_text, prepared_sn_text)
    )

In [13]:
def generate_alternatives(text_no):
    """Generate the alternative rows for every statement of one text.

    Loads the parsed relations of ``text_no`` and the statements found for it
    in every statements subdirectory. For each statement the relations picked
    by ``filter_relations`` are turned into alternatives.

    Returns a list of dicts: the fields of each ``Alternative`` plus
    ``text_no``, the statement's ``rule`` and the second element of its
    ``reason``. The list is empty when the text has no relations.
    """
    text, relation_map = load_relations(
        text_no, os.path.join(PARSED_RACE_DIR, RACE_PART)
    )
    if len(relation_map) == 0:
        return []

    # Flatten the per-type relation lists into one candidate pool.
    all_relations = [
        relation
        for typed_relations in relation_map.values()
        for relation in typed_relations
    ]
    statement_map = load_statements(STATEMENTS_DIR, statements_subdirectories, text_no)

    rows = []
    for statements in statement_map.values():
        for statement in statements:
            true_sn_text_len = get_n_words(statement["satellite_nucleus"])
            for relation_data in filter_relations(statement, all_relations, text):
                alternative = create_alternative(
                    statement, relation_data, true_sn_text_len, text
                )
                if alternative is None:
                    continue
                row = alternative._asdict()
                row["text_no"] = text_no
                row["rule"] = statement["rule"]
                row["reason"] = statement["reason"][1]
                rows.append(row)
    return rows


def create_df(rows):
    """Turn the generated rows into a sorted DataFrame with a fixed column order.

    Args:
        rows: list of dicts as produced by ``generate_alternatives``.

    Returns:
        A DataFrame sorted by text, rule, true statement, position and word
        distance, or ``None`` if ``rows`` is empty. Within a position group,
        rows are ordered by ascending ``distance_words``, except for nested
        rows, which are ordered by descending ``distance_words``.
    """
    if len(rows) == 0:
        return None

    column_order = [
        "text_no",
        "true_statement",
        "nuclei_hash",
        "alternative_statement",
        "relation_type",
        "position",
        "distance_words",
        "distance_sentences",
        "sn_length",
        "sn_length_relative_difference",
        "jaccard_distance",
        "edit_distance",
        "rule",
        "reason"
    ]
    result_df = pd.DataFrame(rows)[column_order]

    # The ``position`` column stores the enum *values* (plain strings), so it
    # must be compared with ``Position.NESTED.value``; comparing with the enum
    # member itself is always False and the sign flip would never happen.
    is_nested = result_df["position"] == Position.NESTED.value
    # Temporary sort key: the word distance, negated for nested rows.
    sort_key = result_df["distance_words"] * (1 - 2 * is_nested.astype(int))

    # assign/sort/drop return new frames, which avoids writing into a selection
    # of another frame; ``columns=`` replaces the positional ``axis`` argument
    # that pandas 2.x no longer accepts.
    return (
        result_df.assign(d=sort_key)
        .sort_values(["text_no", "rule", "true_statement", "position", "d"])
        .drop(columns="d")
    )

In [14]:
# Numeric ids of all texts of the selected RACE part: the part of each
# "*.txt" file name before the first dot, converted to int.
text_numbers = [
    int(file_name.split('.')[0])
    for file_name in os.listdir(os.path.join(RACE_DIR, RACE_PART))
    if file_name.endswith(".txt")
]

In [15]:
# Collect the alternative rows of all texts in one flat list.
rows = []

for i, text_no in enumerate(text_numbers):
    # Progress line: 1-based running index, total number of texts, text id.
    print(f"[{i + 1:4d}/{len(text_numbers)}, text no. {text_no:4d}]")
    rows.extend(generate_alternatives(text_no))

# NOTE: create_df() returns None when no rows were generated; the cells below
# assume a DataFrame.
result_df = create_df(rows)

[   1/362, text no. 7344]
[   2/362, text no. 1047]
[   3/362, text no. 6700]
[   4/362, text no. 6927]
[   5/362, text no.  504]
[   6/362, text no. 8275]
[   7/362, text no. 3485]
[   8/362, text no.  869]
[   9/362, text no. 2638]
[  10/362, text no. 3297]
[  11/362, text no. 1127]
[  12/362, text no. 3056]
[  13/362, text no. 5433]
[  14/362, text no. 1133]
[  15/362, text no. 6853]
[  16/362, text no. 1696]
[  17/362, text no. 2375]
[  18/362, text no. 8102]
[  19/362, text no. 5195]
[  20/362, text no. 2163]
[  21/362, text no. 2177]
[  22/362, text no. 1522]
[  23/362, text no.  288]
[  24/362, text no. 2229]
[  25/362, text no. 6703]
[  26/362, text no. 2565]
[  27/362, text no. 3684]
[  28/362, text no. 1246]
[  29/362, text no. 3323]
[  30/362, text no. 6461]
[  31/362, text no. 7958]
[  32/362, text no. 5430]
[  33/362, text no. 1124]
[  34/362, text no. 3096]
[  35/362, text no. 3901]
[  36/362, text no. 6879]
[  37/362, text no.  499]
[  38/362, text no. 8101]
[  39/362, t

[ 318/362, text no. 6535]
[ 319/362, text no. 4322]
[ 320/362, text no. 5772]
[ 321/362, text no.  241]
[ 322/362, text no. 3868]
[ 323/362, text no. 6910]
[ 324/362, text no. 6723]
[ 325/362, text no. 7429]
[ 326/362, text no. 3111]
[ 327/362, text no.  251]
[ 328/362, text no. 3878]
[ 329/362, text no. 5038]
[ 330/362, text no.  912]
[ 331/362, text no. 2970]
[ 332/362, text no. 1470]
[ 333/362, text no. 2409]
[ 334/362, text no. 5399]
[ 335/362, text no. 3919]
[ 336/362, text no. 3702]
[ 337/362, text no.  867]
[ 338/362, text no. 4535]
[ 339/362, text no. 3474]
[ 340/362, text no. 5005]
[ 341/362, text no.  278]
[ 342/362, text no. 5207]
[ 343/362, text no. 7438]
[ 344/362, text no. 1049]
[ 345/362, text no. 3886]
[ 346/362, text no. 1711]
[ 347/362, text no.  118]
[ 348/362, text no. 2436]
[ 349/362, text no. 4735]
[ 350/362, text no. 2387]
[ 351/362, text no. 7598]
[ 352/362, text no. 3503]
[ 353/362, text no. 7765]
[ 354/362, text no. 1314]
[ 355/362, text no. 3259]
[ 356/362, t

In [16]:
# Preview the first rows of the result table.
result_df.head()

Unnamed: 0,text_no,true_statement,nuclei_hash,alternative_statement,relation_type,position,distance_words,distance_sentences,sn_length,sn_length_relative_difference,jaccard_distance,edit_distance,rule,reason
854,1,Wang has got used to it because ''I used to sp...,14d3480a7999933580fc6ff07a26a7d6,Wang has got used to it because she used to ge...,Elaboration,after,0,0,10,0.666667,0.769231,27,explanation_01,Nucleus starts with 'but'.
853,1,Wang has got used to it because ''I used to sp...,14d3480a7999933580fc6ff07a26a7d6,Wang has got used to it because ''I just looke...,Elaboration,before,14,1,13,1.166667,0.882353,41,explanation_01,Nucleus starts with 'but'.
855,1,Zhang got a grade of more than 80 because ''I ...,96d4256e267d4ae7aba2f8b088cf5c7c,Zhang got a grade of more than 80 because ''I ...,Explanation,before,20,4,6,-0.538462,0.882353,47,explanation_01,Nucleus starts with 'but'.
852,1,"Wang has got used to it, and ''I used to speak...",14d3480a7999933580fc6ff07a26a7d6,"Wang has got used to it, and she used to get a...",Elaboration,after,0,0,10,0.666667,0.769231,27,explanation_04,Common pattern (Whatever-Contrast).
851,1,"Wang has got used to it, and ''I used to speak...",14d3480a7999933580fc6ff07a26a7d6,"Wang has got used to it, and ''I just looked a...",Elaboration,before,14,1,13,1.166667,0.882353,41,explanation_04,Common pattern (Whatever-Contrast).


In [17]:
# Write all alternatives to an Excel file inside STATEMENTS_DIR. The file name
# contains RACE_PART (with "/" replaced by "-") and a random hexadecimal suffix,
# presumably so that repeated runs do not overwrite earlier exports.
# NOTE(review): the random module is not seeded, so the name differs per run.
result_df.to_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"alternatives_{RACE_PART.replace('/', '-')}_{random.randint(0, 2**32):x}.xlsx"
    ),
    index=False
)

In [18]:
# Show the column names of the result table.
result_df.columns

Index(['text_no', 'true_statement', 'nuclei_hash', 'alternative_statement',
       'relation_type', 'position', 'distance_words', 'distance_sentences',
       'sn_length', 'sn_length_relative_difference', 'jaccard_distance',
       'edit_distance', 'rule', 'reason'],
      dtype='object')

In [19]:
# Count the alternatives that satisfy all of:
#   - Jaccard distance to the true satellite of at least 0.3,
#   - satellite length within +/-50% of the true satellite's length,
#   - not a nested relation at word distance 0.
sum(
    (result_df.jaccard_distance >= 0.3)
        & (np.abs(result_df.sn_length_relative_difference) <= 0.5)
        & ((result_df.position != "nested") | (result_df.distance_words > 0))
)

672

In [20]:
# Same count with the Jaccard threshold lowered to 0.0, leaving only the length
# and nested-distance conditions. In the recorded run the result equals the
# count obtained with the 0.3 threshold.
sum(
    (result_df.jaccard_distance >= 0.0)
        & (np.abs(result_df.sn_length_relative_difference) <= 0.5)
        & ((result_df.position != "nested") | (result_df.distance_words > 0))
)

672

In [21]:
# Pre-filtered alternatives: the conditions counted above (Jaccard distance
# >= 0.3, relative length difference within +/-0.5, no nested relation at word
# distance 0) plus an upper bound of 20 words on the alternative satellite.
reduced_df = result_df.loc[
    (result_df.sn_length <= 20)
        & (result_df.jaccard_distance >= 0.3)
        & (np.abs(result_df.sn_length_relative_difference) <= 0.5)
        & ((result_df.position != "nested") | (result_df.distance_words > 0))
]

# selected_true_statements = np.random.choice(
#     list(set(reduced_df.true_statement)), 200, replace=False
# )

# reduced_df = reduced_df.loc[
#     reduced_df.true_statement.isin(set(selected_true_statements))
# ]

In [22]:
# Fraction of all alternatives that survive the pre-filtering.
len(reduced_df) / len(result_df)

0.5423580786026201

In [23]:
# Number of alternatives that survive the pre-filtering.
len(reduced_df)

621

In [24]:
# Write the pre-filtered alternatives to their own Excel file inside
# STATEMENTS_DIR, using the same naming scheme as the full export (RACE_PART
# with "/" replaced by "-", followed by a random hexadecimal suffix).
reduced_df.to_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"prefiltered_alternatives_{RACE_PART.replace('/', '-')}_{random.randint(0, 2**32):x}.xlsx"
    ),
    index=False
)