In [10]:
import collections
import enum
import json
import os
import random

In [168]:
import nltk.tokenize

import numpy as np
import pandas as pd

In [12]:
import import_ipynb
import aux.defs
import aux.relation_extraction
import aux.utils
import aux.nlp
import preparation

%run explanation_04.ipynb

In [47]:
STATEMENTS_DIR = "/Users/YK/mt/project/statements_3/"
RACE_PART = "train/middle"
RACE_DIR = "/Users/YK/mt/RACE"
PARSED_RACE_DIR = "/Users/YK/mt/parsed/race"

In [119]:
class Position(enum.Enum):
    BEFORE = "before"
    NESTED = "nested"
    AFTER  = "after"

    
Alternative = collections.namedtuple(
    "Alternative",
    [
        "true_statement", 
        "alternative_statement",
        "relation_type",
        "position", 
        "distance_words",
        "distance_sentences",
        "sn_length",
        "sn_length_relative_difference",
        "jaccard_distance",
        "edit_distance"
    ]
)

In [161]:
def load_statements(directory, subdirectories, text_no):
    statements = {}
    for subdirectory in subdirectories:
        file_path = os.path.join(
            directory, subdirectory, RACE_PART, f"{text_no}.txt.tree"
        )
        if os.path.exists(file_path): 
            with open(file_path, "rt") as f:
                statements[subdirectory] = json.load(f)
    return statements
    

def load_relations(text_no, directory):
    file_path = os.path.join(directory, f"{text_no}.txt.tree")
    if os.path.exists(file_path):
        text, relations, _ = aux.relation_extraction.load_relations(
            os.path.join(directory, f"{text_no}.txt.tree")
        )
        return text, relations # {t: relations[t] for t in types if t in relations}
    else:
        return "", {}

In [154]:
statements_subdirectories = [
    f for f in os.listdir(STATEMENTS_DIR) if os.path.isdir(os.path.join(STATEMENTS_DIR, f))
]

In [162]:
def get_n_words(text_span):
    return len(nltk.tokenize.word_tokenize(text_span))


def get_n_sentences(text_span):
    cnt = 0
    for c in text_span:
        if c in {'.', ',', '!'}:
            cnt += 1
    return cnt


def get_position_and_distance(statement, relation, text, verbose=False):
    if relation.right.end <= statement["left_boundary"]:
        span = text[relation.right.end:statement["left_boundary"]]
        return (
            Position.BEFORE,
            get_n_words(span),
            get_n_sentences(span)
        )
    elif relation.left.start >= statement["right_boundary"]:
        span = text[statement["right_boundary"]:relation.left.start]
        return (
            Position.AFTER,
            get_n_words(span),
            get_n_sentences(span)
        )
    else:
        if (
            relation.left.start < statement["split_point"]
                and relation.right.end > statement["split_point"]
        ):
            if verbose:
                print("The relation overlaps with the relation of the true statement.")
            return None, None, None
        else:
            if relation.right.end <= statement["split_point"]:
                span = text[relation.right.end:statement["split_point"]]
                return Position.NESTED, get_n_words(span), get_n_sentences(span)
            else:
                span = (text[statement["split_point"]:relation.left.start])
                return Position.NESTED, get_n_words(span), get_n_sentences(span)

    
def get_jaccard_distance(phrase_1, phrase_2):
    tokens_1 = set(nltk.tokenize.word_tokenize(phrase_1))
    tokens_2 = set(nltk.tokenize.word_tokenize(phrase_2))
    return nltk.jaccard_distance(tokens_1, tokens_2)


def get_edit_distance(phrase_1, phrase_2):
    return nltk.edit_distance(phrase_1, phrase_2)
    

RelationData = collections.namedtuple(
    "RelationData",
    ["relation", "position", "distance_words", "distance_sentences"]
)


def get_k(relation_data_list, closest, k):
    sorted_relation_data_list = sorted(
        relation_data_list, key=lambda rd: rd.distance_words
    )
    if closest:
        return sorted_relation_data_list[:k]
    else:
        return sorted_relation_data_list[-k:]
        
        
def filter_relations(statement, relations, text, k=2):
    relation_data_lists = collections.defaultdict(list)
    for relation in relations:
        position, distance_words, distance_sentences = get_position_and_distance(
            statement, relation, text
        )
        if position is not None:
            relation_data_lists[position].append(
                RelationData(
                    relation=relation,
                    position=position,
                    distance_words=distance_words,
                    distance_sentences=distance_sentences
                )
            )
    
    result = []
    result += get_k(relation_data_lists[Position.BEFORE], closest=True, k=k)
    result += get_k(relation_data_lists[Position.AFTER], closest=True, k=k)
    result += get_k(relation_data_lists[Position.NESTED], closest=False, k=k)
    return result
    
    
def create_alternative(statement, relation_data, true_sn_text_len, text, verbose=False):
    relation_info = preparation.get_info(relation_data.relation, verbose)
    assert relation_info is not None
    if relation_info.satellite_info.relation is None:
        if verbose:
            print("Satellite is flat.")
        return None

    satellite_handling_result = preparation.Preprocessor.handle_satellite(
        text, relation_info.satellite_info, relation_info.nucleus_info.direction, verbose
    )
    if satellite_handling_result is None:
        if verbose:
            print("Satellite preprocessing was unsuccessful.")
        return None
    
    processed_sn_text = aux.nlp.take_first_sentence_and_remove_leading_words(
        satellite_handling_result.preparation_result.prepared_text, verbose
    )
    prepared_sn_text = utils.lowercase_first_letter(
        processed_sn_text if processed_sn_text is not None 
            else info.satellite_preparation_result.prepared_text
    )
    sn_text_len = get_n_words(prepared_sn_text)
    
    true_statement_nucleus = statement["nucleus"]
    connective = statement["connective"]
    alternative_text = f"{true_statement_nucleus}{connective}{prepared_sn_text}"
    return Alternative(
        true_statement=statement["statement_text"],
        alternative_statement=alternative_text,
        position=relation_data.position.value,
        relation_type=relation_data.relation.type,
        distance_words=relation_data.distance_words,
        distance_sentences=relation_data.distance_sentences,
        sn_length=sn_text_len,
        sn_length_relative_difference=(sn_text_len / true_sn_text_len - 1),
        jaccard_distance=get_jaccard_distance(
            statement["satellite_nucleus"], prepared_sn_text
        ),
        edit_distance=get_edit_distance(
            statement["satellite_nucleus"], prepared_sn_text
        )
    )

In [163]:
def generate_alternatives(text_no):
    rows = []
    
    text, relation_map = load_relations(
        text_no, os.path.join(PARSED_RACE_DIR, RACE_PART)
    )
    if len(relation_map) > 0:
        relations = [
            relation for _, relations in relation_map.items() for relation in relations
        ]
        statement_map = load_statements(STATEMENTS_DIR, statements_subdirectories, text_no)
        for _, statements in statement_map.items():
            for statement in statements:
                true_sn_text_len = get_n_words(statement["satellite_nucleus"])
                filtered_relation_data = filter_relations(statement, relations, text)
                for relation_data in filtered_relation_data:
                    alternative = create_alternative(
                        statement, relation_data, true_sn_text_len, text
                    )
                    if alternative is not None:
                        row_dict = alternative._asdict()
                        row_dict.update(
                            {
                                "text_no": text_no,
                                "rule": statement["rule"],
                                "reason": statement["reason"][1]
                            }
                        )
                        rows.append(
                            row_dict
                        )
    return rows


def create_df(rows):
    if len(rows) > 0:
        result_df = pd.DataFrame(rows)[
            [
                "text_no",
                "true_statement", 
                "alternative_statement",
                "relation_type",
                "position",        
                "distance_words",
                "distance_sentences",
                "sn_length",
                "sn_length_relative_difference",
                "jaccard_distance",
                "edit_distance",
                "rule",
                "reason"
            ]
        ]
        result_df["d"] = (
            result_df.distance_words 
                * (1 - 2 * (result_df.position == Position.NESTED).astype(int))
        )
        result_df.sort_values(
            ["text_no", "rule", "true_statement", "position", "d"], inplace=True
        )
        result_df.drop("d", 1, inplace=True)

        return result_df
    else:
        return None

In [144]:
text_numbers = [
    int(fn.split('.')[0]) for fn in os.listdir(os.path.join(RACE_DIR, RACE_PART))
        if fn[-4:] == ".txt"
]

In [164]:
# rows = []

for i, text_no in enumerate(text_numbers):
    if i < 90:
        continue
    print(f"[{i + 1:4d}/{len(text_numbers)}, text no. {text_no:4d}]")
    rows.extend(generate_alternatives(text_no))

result_df = create_df(rows)

[  91/6409, text no. 5786]
[  92/6409, text no. 7191]
[  93/6409, text no. 5976]
[  94/6409, text no. 7807]
[  95/6409, text no. 7813]
[  96/6409, text no. 3254]
[  97/6409, text no. 7032]
[  98/6409, text no. 1443]
[  99/6409, text no. 7754]
[ 100/6409, text no. 5143]
[ 101/6409, text no. 4249]
[ 102/6409, text no. 3526]
[ 103/6409, text no. 5157]
[ 104/6409, text no. 1331]
[ 105/6409, text no. 7026]
[ 106/6409, text no. 1457]
[ 107/6409, text no. 5631]
[ 108/6409, text no. 3240]
[ 109/6409, text no. 6310]
[ 110/6409, text no. 5619]
[ 111/6409, text no. 2176]
[ 112/6409, text no. 3268]
[ 113/6409, text no. 4507]
[ 114/6409, text no. 4261]
[ 115/6409, text no. 2610]
[ 116/6409, text no. 6476]
[ 117/6409, text no.  855]
[ 118/6409, text no.  699]
[ 119/6409, text no.  841]
[ 120/6409, text no. 4275]
[ 121/6409, text no. 2604]
[ 122/6409, text no. 2162]
[ 123/6409, text no. 4513]
[ 124/6409, text no. 6304]
[ 125/6409, text no. 1480]
[ 126/6409, text no.  114]
[ 127/6409, text no. 2189]
[

[ 396/6409, text no. 5236]
[ 397/6409, text no. 1050]
[ 398/6409, text no. 7421]
[ 399/6409, text no. 2571]
[ 400/6409, text no. 6717]
[ 401/6409, text no. 7409]
[ 402/6409, text no. 6071]
[ 403/6409, text no. 5578]
[ 404/6409, text no. 2217]
[ 405/6409, text no. 4666]
[ 406/6409, text no. 2203]
[ 407/6409, text no. 4672]
[ 408/6409, text no. 8048]
[ 409/6409, text no. 4114]
[ 410/6409, text no. 3690]
[ 411/6409, text no. 1087]
[ 412/6409, text no.  513]
[ 413/6409, text no. 1939]
[ 414/6409, text no. 4699]
[ 415/6409, text no. 4841]
[ 416/6409, text no. 5593]
[ 417/6409, text no. 4855]
[ 418/6409, text no.  261]
[ 419/6409, text no. 7384]
[ 420/6409, text no. 6924]
[ 421/6409, text no.  507]
[ 422/6409, text no. 1093]
[ 423/6409, text no. 3874]
[ 424/6409, text no.  249]
[ 425/6409, text no. 1905]
[ 426/6409, text no. 1911]
[ 427/6409, text no. 4869]
[ 428/6409, text no. 3860]
[ 429/6409, text no. 6918]
[ 430/6409, text no. 1520]
[ 431/6409, text no. 8262]
[ 432/6409, text no. 7151]
[

[ 700/6409, text no. 5948]
[ 701/6409, text no. 5790]
[ 702/6409, text no. 6299]
[ 703/6409, text no. 7187]
[ 704/6409, text no. 5784]
[ 705/6409, text no. 3493]
[ 706/6409, text no. 1284]
[ 707/6409, text no. 6500]
[ 708/6409, text no.  923]
[ 709/6409, text no. 4317]
[ 710/6409, text no. 3478]
[ 711/6409, text no. 2000]
[ 712/6409, text no. 4471]
[ 713/6409, text no. 7178]
[ 714/6409, text no. 1509]
[ 715/6409, text no. 6266]
[ 716/6409, text no. 6272]
[ 717/6409, text no. 4465]
[ 718/6409, text no. 2772]
[ 719/6409, text no. 6514]
[ 720/6409, text no.  937]
[ 721/6409, text no. 2982]
[ 722/6409, text no. 3444]
[ 723/6409, text no. 5035]
[ 724/6409, text no. 1253]
[ 725/6409, text no. 7144]
[ 726/6409, text no. 8277]
[ 727/6409, text no. 1535]
[ 728/6409, text no. 5753]
[ 729/6409, text no. 2028]
[ 730/6409, text no. 3336]
[ 731/6409, text no. 4459]
[ 732/6409, text no. 7150]
[ 733/6409, text no. 1521]
[ 734/6409, text no. 6528]
[ 735/6409, text no. 1247]
[ 736/6409, text no. 7636]
[

[1004/6409, text no. 3938]
[1005/6409, text no. 3910]
[1006/6409, text no. 4919]
[1007/6409, text no. 1861]
[1008/6409, text no.  339]
[1009/6409, text no. 1120]
[1010/6409, text no. 7551]
[1011/6409, text no. 3737]
[1012/6409, text no. 4058]
[1013/6409, text no. 5346]
[1014/6409, text no. 2429]
[1015/6409, text no. 5420]
[1016/6409, text no. 3051]
[1017/6409, text no. 8104]
[1018/6409, text no. 7237]
[1019/6409, text no. 6129]
[1020/6409, text no. 8110]
[1021/6409, text no. 7223]
[1022/6409, text no. 1652]
[1023/6409, text no. 5434]
[1024/6409, text no. 3045]
[1025/6409, text no. 3723]
[1026/6409, text no. 5352]
[1027/6409, text no. 1134]
[1028/6409, text no. 7545]
[1029/6409, text no. 6883]
[1030/6409, text no. 4064]
[1031/6409, text no. 2415]
[1032/6409, text no.  488]
[1033/6409, text no. 2373]
[1034/6409, text no. 4702]
[1035/6409, text no. 2367]
[1036/6409, text no. 4716]
[1037/6409, text no. 3079]
[1038/6409, text no. 6101]
[1039/6409, text no. 1108]
[1040/6409, text no. 7579]
[

[1309/6409, text no. 5620]
[1310/6409, text no. 3251]
[1311/6409, text no. 7037]
[1312/6409, text no. 6329]
[1313/6409, text no. 1446]
[1314/6409, text no. 1452]
[1315/6409, text no. 5634]
[1316/6409, text no. 3245]
[1317/6409, text no. 5152]
[1318/6409, text no. 1334]
[1319/6409, text no.   38]
[1320/6409, text no. 7745]
[1321/6409, text no.  878]
[1322/6409, text no. 7976]
[1323/6409, text no. 5807]
[1324/6409, text no. 5813]
[1325/6409, text no.  887]
[1326/6409, text no. 7962]
[1327/6409, text no. 5185]
[1328/6409, text no. 2832]
[1329/6409, text no. 7792]
[1330/6409, text no. 3292]
[1331/6409, text no. 3286]
[1332/6409, text no. 2198]
[1333/6409, text no.  105]
[1334/6409, text no. 1491]
[1335/6409, text no. 7786]
[1336/6409, text no.  663]
[1337/6409, text no. 6498]
[1338/6409, text no. 5191]
[1339/6409, text no. 2826]
[1340/6409, text no. 6117]
[1341/6409, text no. 6671]
[1342/6409, text no. 2417]
[1343/6409, text no. 5378]
[1344/6409, text no. 4066]
[1345/6409, text no. 3709]
[

[1613/6409, text no. 2263]
[1614/6409, text no. 4612]
[1615/6409, text no. 4174]
[1616/6409, text no. 2505]
[1617/6409, text no. 6763]
[1618/6409, text no. 6777]
[1619/6409, text no. 1018]
[1620/6409, text no. 7469]
[1621/6409, text no. 4160]
[1622/6409, text no. 2511]
[1623/6409, text no. 5518]
[1624/6409, text no. 3169]
[1625/6409, text no. 4606]
[1626/6409, text no. 6011]
[1627/6409, text no.  201]
[1628/6409, text no. 1795]
[1629/6409, text no. 3182]
[1630/6409, text no. 4835]
[1631/6409, text no. 5295]
[1632/6409, text no. 7482]
[1633/6409, text no.  567]
[1634/6409, text no. 6950]
[1635/6409, text no. 6788]
[1636/6409, text no.  573]
[1637/6409, text no. 5281]
[1638/6409, text no. 3196]
[1639/6409, text no. 2288]
[1640/6409, text no. 4821]
[1641/6409, text no. 1781]
[1642/6409, text no.  215]
[1643/6409, text no. 1959]
[1644/6409, text no. 4809]
[1645/6409, text no. 1971]
[1646/6409, text no. 6978]
[1647/6409, text no. 3800]
[1648/6409, text no. 3814]
[1649/6409, text no.  229]
[

[1917/6409, text no. 3381]
[1918/6409, text no. 5928]
[1919/6409, text no. 4405]
[1920/6409, text no. 6212]
[1921/6409, text no. 6574]
[1922/6409, text no.  957]
[1923/6409, text no. 4377]
[1924/6409, text no. 6560]
[1925/6409, text no.  943]
[1926/6409, text no. 7118]
[1927/6409, text no. 2060]
[1928/6409, text no. 4411]
[1929/6409, text no. 7130]
[1930/6409, text no. 8203]
[1931/6409, text no. 1541]
[1932/6409, text no. 5727]
[1933/6409, text no. 2048]
[1934/6409, text no. 3356]
[1935/6409, text no. 3430]
[1936/6409, text no. 5041]
[1937/6409, text no. 6548]
[1938/6409, text no. 1227]
[1939/6409, text no. 7642]
[1940/6409, text no. 3424]
[1941/6409, text no. 5055]
[1942/6409, text no. 5733]
[1943/6409, text no. 3342]
[1944/6409, text no. 7124]
[1945/6409, text no. 8217]
[1946/6409, text no.  228]
[1947/6409, text no. 1970]
[1948/6409, text no. 4808]
[1949/6409, text no. 3801]
[1950/6409, text no. 5280]
[1951/6409, text no. 3829]
[1952/6409, text no.  572]
[1953/6409, text no. 6789]
[

[2221/6409, text no. 1608]
[2222/6409, text no. 6167]
[2223/6409, text no. 2301]
[2224/6409, text no. 4770]
[2225/6409, text no. 4016]
[2226/6409, text no. 5308]
[2227/6409, text no. 6601]
[2228/6409, text no. 3989]
[2229/6409, text no. 5320]
[2230/6409, text no. 6629]
[2231/6409, text no. 1146]
[2232/6409, text no. 7537]
[2233/6409, text no. 7251]
[2234/6409, text no. 8162]
[2235/6409, text no. 1620]
[2236/6409, text no. 5446]
[2237/6409, text no. 2329]
[2238/6409, text no. 4980]
[2239/6409, text no. 3037]
[2240/6409, text no. 4758]
[2241/6409, text no. 5452]
[2242/6409, text no. 4994]
[2243/6409, text no. 3023]
[2244/6409, text no. 7245]
[2245/6409, text no. 1634]
[2246/6409, text no. 1152]
[2247/6409, text no. 7523]
[2248/6409, text no. 3745]
[2249/6409, text no. 5334]
[2250/6409, text no. 5863]
[2251/6409, text no. 7912]
[2252/6409, text no. 7906]
[2253/6409, text no. 5877]
[2254/6409, text no. 7090]
[2255/6409, text no. 5687]
[2256/6409, text no. 3590]
[2257/6409, text no. 2856]
[

[2530/6409, text no. 2846]
[2531/6409, text no.  603]
[2532/6409, text no. 1383]
[2533/6409, text no. 2852]
[2534/6409, text no.  171]
[2535/6409, text no. 6605]
[2536/6409, text no. 2463]
[2537/6409, text no. 4012]
[2538/6409, text no. 4774]
[2539/6409, text no.  398]
[2540/6409, text no. 6163]
[2541/6409, text no. 6177]
[2542/6409, text no. 7269]
[2543/6409, text no. 4760]
[2544/6409, text no. 2477]
[2545/6409, text no. 4006]
[2546/6409, text no. 3769]
[2547/6409, text no. 5330]
[2548/6409, text no. 1156]
[2549/6409, text no. 1630]
[2550/6409, text no. 8172]
[2551/6409, text no. 7241]
[2552/6409, text no. 3027]
[2553/6409, text no. 4748]
[2554/6409, text no. 5456]
[2555/6409, text no. 4990]
[2556/6409, text no. 2339]
[2557/6409, text no. 3033]
[2558/6409, text no. 5442]
[2559/6409, text no. 4984]
[2560/6409, text no. 1624]
[2561/6409, text no. 8166]
[2562/6409, text no. 7255]
[2563/6409, text no. 7533]
[2564/6409, text no. 3755]
[2565/6409, text no. 3966]
[2566/6409, text no. 1817]
[

[2837/6409, text no. 6202]
[2838/6409, text no.  947]
[2839/6409, text no. 6564]
[2840/6409, text no. 2702]
[2841/6409, text no. 4373]
[2842/6409, text no. 2716]
[2843/6409, text no. 5079]
[2844/6409, text no. 4367]
[2845/6409, text no. 3408]
[2846/6409, text no.  953]
[2847/6409, text no. 6570]
[2848/6409, text no. 1579]
[2849/6409, text no. 6216]
[2850/6409, text no. 7108]
[2851/6409, text no. 4401]
[2852/6409, text no. 2070]
[2853/6409, text no. 1551]
[2854/6409, text no. 7120]
[2855/6409, text no. 3346]
[2856/6409, text no. 4429]
[2857/6409, text no. 5737]
[2858/6409, text no. 5051]
[2859/6409, text no. 3420]
[2860/6409, text no. 1237]
[2861/6409, text no. 7652]
[2862/6409, text no. 1223]
[2863/6409, text no. 5045]
[2864/6409, text no. 3434]
[2865/6409, text no. 3352]
[2866/6409, text no. 1545]
[2867/6409, text no. 8207]
[2868/6409, text no. 7134]
[2869/6409, text no. 5910]
[2870/6409, text no. 2919]
[2871/6409, text no. 7861]
[2872/6409, text no.  984]
[2873/6409, text no.  748]
[

[3142/6409, text no. 1949]
[3143/6409, text no. 1791]
[3144/6409, text no. 4819]
[3145/6409, text no. 1961]
[3146/6409, text no. 6968]
[3147/6409, text no. 3810]
[3148/6409, text no. 3804]
[3149/6409, text no. 1975]
[3150/6409, text no.  239]
[3151/6409, text no. 3151]
[3152/6409, text no. 5520]
[3153/6409, text no. 1746]
[3154/6409, text no. 8004]
[3155/6409, text no. 7337]
[3156/6409, text no. 6997]
[3157/6409, text no. 7451]
[3158/6409, text no. 1020]
[3159/6409, text no. 2529]
[3160/6409, text no. 5246]
[3161/6409, text no. 3637]
[3162/6409, text no. 5252]
[3163/6409, text no. 3623]
[3164/6409, text no. 7445]
[3165/6409, text no. 1034]
[3166/6409, text no. 1752]
[3167/6409, text no. 8010]
[3168/6409, text no. 7323]
[3169/6409, text no. 5534]
[3170/6409, text no. 6015]
[3171/6409, text no. 2273]
[3172/6409, text no. 2515]
[3173/6409, text no. 6773]
[3174/6409, text no.  588]
[3175/6409, text no. 7479]
[3176/6409, text no. 6767]
[3177/6409, text no. 1008]
[3178/6409, text no. 2501]
[

[3446/6409, text no. 1399]
[3447/6409, text no. 7930]
[3448/6409, text no.   95]
[3449/6409, text no. 6390]
[3450/6409, text no. 5699]
[3451/6409, text no. 4587]
[3452/6409, text no. 5841]
[3453/6409, text no.  143]
[3454/6409, text no. 2860]
[3455/6409, text no. 2874]
[3456/6409, text no. 6347]
[3457/6409, text no. 1428]
[3458/6409, text no. 4550]
[3459/6409, text no. 5896]
[3460/6409, text no. 2121]
[3461/6409, text no. 5128]
[3462/6409, text no. 2647]
[3463/6409, text no. 3559]
[3464/6409, text no. 4236]
[3465/6409, text no.  802]
[3466/6409, text no.   42]
[3467/6409, text no.   56]
[3468/6409, text no. 2653]
[3469/6409, text no. 4222]
[3470/6409, text no. 4544]
[3471/6409, text no.    4]
[3472/6409, text no. 5882]
[3473/6409, text no. 6353]
[3474/6409, text no. 3203]
[3475/6409, text no. 5672]
[3476/6409, text no.  180]
[3477/6409, text no. 1414]
[3478/6409, text no. 7065]
[3479/6409, text no. 1372]
[3480/6409, text no. 5114]
[3481/6409, text no. 3565]
[3482/6409, text no. 5100]
[

[3751/6409, text no. 7098]
[3752/6409, text no. 5857]
[3753/6409, text no. 4585]
[3754/6409, text no.  169]
[3755/6409, text no. 6392]
[3756/6409, text no. 7932]
[3757/6409, text no. 2692]
[3758/6409, text no. 5472]
[3759/6409, text no.  380]
[3760/6409, text no. 1614]
[3761/6409, text no. 8156]
[3762/6409, text no. 7265]
[3763/6409, text no. 1172]
[3764/6409, text no. 5314]
[3765/6409, text no. 3765]
[3766/6409, text no. 7517]
[3767/6409, text no. 1166]
[3768/6409, text no. 1600]
[3769/6409, text no.  394]
[3770/6409, text no. 8142]
[3771/6409, text no. 4778]
[3772/6409, text no. 3017]
[3773/6409, text no. 2309]
[3774/6409, text no. 5466]
[3775/6409, text no. 6147]
[3776/6409, text no. 1628]
[3777/6409, text no. 7259]
[3778/6409, text no. 4750]
[3779/6409, text no. 2321]
[3780/6409, text no. 3981]
[3781/6409, text no. 5328]
[3782/6409, text no. 3759]
[3783/6409, text no. 4036]
[3784/6409, text no. 6621]
[3785/6409, text no. 6635]
[3786/6409, text no. 3995]
[3787/6409, text no. 2453]
[

[4057/6409, text no.  971]
[4058/6409, text no. 7894]
[4059/6409, text no. 2734]
[4060/6409, text no. 4345]
[4061/6409, text no. 2720]
[4062/6409, text no. 4351]
[4063/6409, text no.  965]
[4064/6409, text no. 7658]
[4065/6409, text no. 6546]
[4066/6409, text no. 1229]
[4067/6409, text no. 6220]
[4068/6409, text no. 3358]
[4069/6409, text no. 4437]
[4070/6409, text no. 5729]
[4071/6409, text no. 2046]
[4072/6409, text no. 1567]
[4073/6409, text no. 7116]
[4074/6409, text no. 8225]
[4075/6409, text no. 3370]
[4076/6409, text no. 5701]
[4077/6409, text no. 2708]
[4078/6409, text no. 5067]
[4079/6409, text no. 4379]
[4080/6409, text no. 7670]
[4081/6409, text no.  795]
[4082/6409, text no. 7664]
[4083/6409, text no. 1215]
[4084/6409, text no. 5073]
[4085/6409, text no. 3364]
[4086/6409, text no. 5715]
[4087/6409, text no. 1573]
[4088/6409, text no. 7102]
[4089/6409, text no. 8231]
[4090/6409, text no. 2091]
[4091/6409, text no. 5926]
[4092/6409, text no. 5098]
[4093/6409, text no. 6591]
[

[4363/6409, text no. 4191]
[4364/6409, text no. 3826]
[4365/6409, text no. 4185]
[4366/6409, text no. 3832]
[4367/6409, text no. 6792]
[4368/6409, text no.  569]
[4369/6409, text no. 1943]
[4370/6409, text no. 3167]
[4371/6409, text no. 4608]
[4372/6409, text no. 5516]
[4373/6409, text no. 2279]
[4374/6409, text no. 1770]
[4375/6409, text no. 7467]
[4376/6409, text no. 6779]
[4377/6409, text no. 1016]
[4378/6409, text no. 3601]
[4379/6409, text no. 5264]
[4380/6409, text no. 3615]
[4381/6409, text no. 7473]
[4382/6409, text no. 1002]
[4383/6409, text no.  596]
[4384/6409, text no. 1764]
[4385/6409, text no. 3173]
[4386/6409, text no. 5502]
[4387/6409, text no. 4634]
[4388/6409, text no. 2245]
[4389/6409, text no. 4152]
[4390/6409, text no. 6745]
[4391/6409, text no. 6989]
[4392/6409, text no. 6751]
[4393/6409, text no. 2537]
[4394/6409, text no. 5258]
[4395/6409, text no. 4146]
[4396/6409, text no. 3629]
[4397/6409, text no. 4620]
[4398/6409, text no. 2251]
[4399/6409, text no. 1758]
[

[4669/6409, text no.  146]
[4670/6409, text no. 5878]
[4671/6409, text no.  152]
[4672/6409, text no. 7909]
[4673/6409, text no.  634]
[4674/6409, text no. 2871]
[4675/6409, text no. 7921]
[4676/6409, text no. 2681]
[4677/6409, text no. 5688]
[4678/6409, text no. 5850]
[4679/6409, text no. 4596]
[4680/6409, text no. 6381]
[4681/6409, text no. 5844]
[4682/6409, text no. 4582]
[4683/6409, text no. 2695]
[4684/6409, text no.  608]
[4685/6409, text no. 7935]
[4686/6409, text no. 7706]
[4687/6409, text no. 6418]
[4688/6409, text no. 1377]
[4689/6409, text no. 5111]
[4690/6409, text no. 3560]
[4691/6409, text no. 3206]
[4692/6409, text no. 4569]
[4693/6409, text no.  185]
[4694/6409, text no. 1411]
[4695/6409, text no. 7060]
[4696/6409, text no. 1405]
[4697/6409, text no.  191]
[4698/6409, text no. 7074]
[4699/6409, text no. 3212]
[4700/6409, text no. 5663]
[4701/6409, text no. 5105]
[4702/6409, text no. 3574]
[4703/6409, text no. 7712]
[4704/6409, text no. 1363]
[4705/6409, text no. 2642]
[

[4973/6409, text no. 7211]
[4974/6409, text no. 1674]
[4975/6409, text no. 7205]
[4976/6409, text no. 3063]
[4977/6409, text no. 5412]
[4978/6409, text no. 5374]
[4979/6409, text no. 3705]
[4980/6409, text no.  486]
[4981/6409, text no. 2433]
[4982/6409, text no. 4042]
[4983/6409, text no. 6655]
[4984/6409, text no. 6133]
[4985/6409, text no. 1884]
[4986/6409, text no. 4724]
[4987/6409, text no. 2355]
[4988/6409, text no. 4730]
[4989/6409, text no. 2341]
[4990/6409, text no. 6127]
[4991/6409, text no. 7239]
[4992/6409, text no. 1890]
[4993/6409, text no. 6899]
[4994/6409, text no. 6641]
[4995/6409, text no. 5348]
[4996/6409, text no. 3739]
[4997/6409, text no. 4056]
[4998/6409, text no.  451]
[4999/6409, text no.  337]
[5000/6409, text no. 4903]
[5001/6409, text no.  323]
[5002/6409, text no.  445]
[5003/6409, text no. 6696]
[5004/6409, text no. 7588]
[5005/6409, text no. 4081]
[5006/6409, text no. 3936]
[5007/6409, text no. 2396]
[5008/6409, text no. 1847]
[5009/6409, text no. 2382]
[

[5278/6409, text no. 6520]
[5279/6409, text no. 5029]
[5280/6409, text no. 2746]
[5281/6409, text no. 4337]
[5282/6409, text no. 4451]
[5283/6409, text no. 6246]
[5284/6409, text no. 1529]
[5285/6409, text no. 7158]
[5286/6409, text no. 2975]
[5287/6409, text no. 7819]
[5288/6409, text no. 2961]
[5289/6409, text no. 5968]
[5290/6409, text no. 5798]
[5291/6409, text no. 4486]
[5292/6409, text no. 5940]
[5293/6409, text no. 6291]
[5294/6409, text no. 7831]
[5295/6409, text no. 1298]
[5296/6409, text no. 2949]
[5297/6409, text no. 2791]
[5298/6409, text no. 2785]
[5299/6409, text no. 7825]
[5300/6409, text no.  718]
[5301/6409, text no. 6285]
[5302/6409, text no. 5613]
[5303/6409, text no. 3262]
[5304/6409, text no. 7004]
[5305/6409, text no. 1475]
[5306/6409, text no. 1313]
[5307/6409, text no.  687]
[5308/6409, text no. 7762]
[5309/6409, text no. 3504]
[5310/6409, text no. 3510]
[5311/6409, text no. 5161]
[5312/6409, text no. 6468]
[5313/6409, text no.  693]
[5314/6409, text no. 1307]
[

[5582/6409, text no. 2545]
[5583/6409, text no. 4134]
[5584/6409, text no. 3897]
[5585/6409, text no. 2551]
[5586/6409, text no. 4120]
[5587/6409, text no. 1058]
[5588/6409, text no. 6737]
[5589/6409, text no. 6051]
[5590/6409, text no. 3129]
[5591/6409, text no. 2237]
[5592/6409, text no. 6079]
[5593/6409, text no.  282]
[5594/6409, text no. 1716]
[5595/6409, text no. 8054]
[5596/6409, text no. 7367]
[5597/6409, text no. 3101]
[5598/6409, text no. 5570]
[5599/6409, text no. 5216]
[5600/6409, text no. 2579]
[5601/6409, text no. 7415]
[5602/6409, text no. 1064]
[5603/6409, text no. 5202]
[5604/6409, text no. 3115]
[5605/6409, text no. 5564]
[5606/6409, text no. 1702]
[5607/6409, text no.  296]
[5608/6409, text no. 7373]
[5609/6409, text no. 2233]
[5610/6409, text no. 4642]
[5611/6409, text no. 8078]
[5612/6409, text no. 6055]
[5613/6409, text no. 6733]
[5614/6409, text no. 4124]
[5615/6409, text no. 3893]
[5616/6409, text no. 2555]
[5617/6409, text no. 4130]
[5618/6409, text no. 2541]
[

[5886/6409, text no. 5830]
[5887/6409, text no. 3299]
[5888/6409, text no. 3272]
[5889/6409, text no. 5603]
[5890/6409, text no. 7014]
[5891/6409, text no. 7772]
[5892/6409, text no. 1303]
[5893/6409, text no.  697]
[5894/6409, text no. 5165]
[5895/6409, text no. 5171]
[5896/6409, text no. 3500]
[5897/6409, text no. 7766]
[5898/6409, text no. 6478]
[5899/6409, text no. 1317]
[5900/6409, text no. 1471]
[5901/6409, text no. 7000]
[5902/6409, text no. 4509]
[5903/6409, text no. 5617]
[5904/6409, text no. 2178]
[5905/6409, text no. 1459]
[5906/6409, text no. 6336]
[5907/6409, text no. 7028]
[5908/6409, text no. 4521]
[5909/6409, text no. 2150]
[5910/6409, text no. 2636]
[5911/6409, text no. 5159]
[5912/6409, text no. 4247]
[5913/6409, text no. 3528]
[5914/6409, text no.  873]
[5915/6409, text no. 6450]
[5916/6409, text no.   33]
[5917/6409, text no. 7996]
[5918/6409, text no.   27]
[5919/6409, text no. 6444]
[5920/6409, text no. 7982]
[5921/6409, text no. 2622]
[5922/6409, text no. 2144]
[

[6197/6409, text no. 3066]
[6198/6409, text no. 4709]
[6199/6409, text no. 5417]
[6200/6409, text no. 2378]
[6201/6409, text no. 1842]
[6202/6409, text no. 6693]
[6203/6409, text no.  468]
[6204/6409, text no. 3933]
[6205/6409, text no. 4090]
[6206/6409, text no. 3927]
[6207/6409, text no. 6687]
[6208/6409, text no. 1856]
[6209/6409, text no. 3099]
[6210/6409, text no.  332]
[6211/6409, text no.  454]
[6212/6409, text no. 6877]
[6213/6409, text no.  440]
[6214/6409, text no. 4912]
[6215/6409, text no.  326]
[6216/6409, text no. 6862]
[6217/6409, text no.  441]
[6218/6409, text no.  327]
[6219/6409, text no. 4913]
[6220/6409, text no. 4907]
[6221/6409, text no.  333]
[6222/6409, text no. 6876]
[6223/6409, text no. 6686]
[6224/6409, text no. 3926]
[6225/6409, text no. 4091]
[6226/6409, text no. 3098]
[6227/6409, text no. 2386]
[6228/6409, text no. 1857]
[6229/6409, text no. 1843]
[6230/6409, text no. 2392]
[6231/6409, text no. 3932]
[6232/6409, text no.  469]
[6233/6409, text no. 6692]
[

In [165]:
result_df.head()

Unnamed: 0,text_no,true_statement,alternative_statement,relation_type,position,distance_words,distance_sentences,sn_length,sn_length_relative_difference,jaccard_distance,edit_distance,rule,reason
12392,2,Drink the water that has not been boiled becau...,Drink the water that has not been boiled becau...,Evaluation,before,0,0,15,0.071429,0.826087,57,explanation_01,Satellite's nucleus contains 'because'.
12390,2,Drink the water that has not been boiled. That...,Drink the water that has not been boiled. That...,Evaluation,before,0,0,15,0.071429,0.826087,57,explanation_02,Common pattern; nucleus has nested relations.
12391,2,Drink the water that has not been boiled. More...,Drink the water that has not been boiled. More...,Evaluation,before,0,0,15,0.071429,0.826087,57,explanation_03,Satellite's nucleus contains 'in fact' / 'as a...
11786,3,There is not enough oil in the world now. That...,There is not enough oil in the world now. That...,Elaboration,nested,0,0,4,-0.826087,0.8,75,explanation_02,Common pattern; nucleus without nested relations.
11787,3,There is not enough oil in the world now. In f...,There is not enough oil in the world now. In f...,Elaboration,nested,0,0,4,-0.826087,0.8,75,explanation_04,Common pattern ( -Elaboration).


In [166]:
result_df.to_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"alternatives_{RACE_PART.replace('/', '-')}_{random.randint(0, 2**32):x}.xlsx"
    ),
    index=False
)

In [180]:
result_df.columns

Index(['text_no', 'true_statement', 'alternative_statement', 'relation_type',
       'position', 'distance_words', 'distance_sentences', 'sn_length',
       'sn_length_relative_difference', 'jaccard_distance', 'edit_distance',
       'rule', 'reason'],
      dtype='object')

In [205]:
sum(
    (result_df.jaccard_distance >= 0.3)
        & (np.abs(result_df.sn_length_relative_difference) <= 0.5)
        & ((result_df.position != "nested") | (result_df.distance_words > 0))
)

10586

In [206]:
sum(
    (result_df.jaccard_distance >= 0.0)
        & (np.abs(result_df.sn_length_relative_difference) <= 0.5)
        & ((result_df.position != "nested") | (result_df.distance_words > 0))
)

10597

In [200]:
reduced_df = result_df.loc[
    (result_df.sn_length <= 20)
        & (result_df.jaccard_distance >= 0.3)
        & (np.abs(result_df.sn_length_relative_difference) <= 0.5)
        & ((result_df.position != "nested") | (result_df.distance_words > 0))
]

selected_true_statements = np.random.choice(
    list(set(reduced_df.true_statement)), 200, replace=False
)

reduced_df = reduced_df.loc[
    reduced_df.true_statement.isin(set(selected_true_statements))
]

In [201]:
len(reduced_df) / len(result_df)

0.013650232856913441

In [202]:
len(reduced_df)

255

In [203]:
reduced_df.to_excel(
    os.path.join(
        STATEMENTS_DIR, 
        f"prefiltered_alternatives_{RACE_PART.replace('/', '-')}_{random.randint(0, 2**32):x}.xlsx"
    ),
    index=False
)