# 1) Same Gateway Classification Analysis

In [2]:
# add parent dir to sys path for import of modules
import os
import sys

# Find the project root by walking upwards from the current working directory
# until a directory containing README.md is found, then make it importable.
# os.getcwd() returns a str path; os.getcwdb() returns bytes, and str() of bytes
# yields the literal "b'...'" representation instead of a usable path.
parent_dir = os.getcwd()
while not os.path.exists(os.path.join(parent_dir, "README.md")):
    next_dir = os.path.abspath(os.path.join(parent_dir, os.pardir))
    if next_dir == parent_dir:
        # Reached the filesystem root without finding the marker file;
        # without this guard the loop would never terminate.
        raise FileNotFoundError("Could not locate the project root: no README.md found in "
                                "the current working directory or any of its parents")
    parent_dir = next_dir
# Avoid stacking duplicate entries when the cell is re-run in the same kernel
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)


In [3]:
import json

import numpy as np
import pandas as pd

from utils import ROOT_DIR
from PetReader import pet_reader
from petreader.labels import *
from labels import *

# Raise pandas' display limits so the wide result tables below are not truncated
for option_name, option_value in (('display.max_columns', 500),
                                  ('display.max_rows', 500),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

INFO:Utilities:Loaded config: {'general-seed': 42, 'keywords-filtered-approach': {'bert-model-name': 'distilbert-base-uncased', 'label-set': 'all', 'label-number': 9, 'other-labels-weight': 0.1, 'num-labels': 9}, 'same-gateway-classifier': {'context_label_length': 350}, 'synonym-samples-start-number': 500}
INFO:PetReader:Reload pet_reader from C:\Users\janek\Development\Git\master-thesis\data/other/pet_reader.pkl


## Load Data

In [5]:
# Read the classifier output: a JSON object mapping each document name to its list of gateway-pair predictions
with open("../../data/results/same_gateway_classified/key_words_custom_sg_classified_[e5_context_text_labels_ngram_c1_n0_syn]_fixed/sg_classifications.json", 'r') as file:
    predictions = json.load(file)

# Flatten to one record per gateway pair, remembering the document and the
# absolute difference between the first entries of both gateways (sentence distance)
all_predictions = []
for doc_name, doc_predictions in predictions.items():
    for prediction in doc_predictions:
        record = dict(prediction)
        record["doc_name"] = doc_name
        record["sentence_distance"] = abs(prediction["gateway_1"][0] - prediction["gateway_2"][0])
        all_predictions.append(record)

df_all = pd.DataFrame(all_predictions)
df_all.info()
df_all.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gateway_1             146 non-null    object 
 1   gateway_2             146 non-null    object 
 2   label                 146 non-null    int64  
 3   predictions_averaged  146 non-null    float64
 4   predictions           146 non-null    object 
 5   comment               146 non-null    object 
 6   doc_name              146 non-null    object 
 7   sentence_distance     146 non-null    int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 9.2+ KB


Unnamed: 0,gateway_1,gateway_2,label,predictions_averaged,predictions,comment,doc_name,sentence_distance
0,"[0, 13, [if], [if]]","[2, 7, [whereas], [whereas]]",1,0.553544,"[0.39190834760665894, 0.4896750748157501, 0.45...",normal,doc-9.5,2
1,"[2, 7, [whereas], [whereas]]","[2, 11, [or], [or]]",1,0.598277,"[0.4482057988643646, 0.5356715321540833, 0.456...",normal,doc-9.5,0
2,"[2, 11, [or], [or]]","[3, 0, [In, case, of], [in, case, of]]",1,0.565167,"[0.4484378397464752, 0.500709056854248, 0.4589...",normal,doc-9.5,1
3,"[3, 0, [In, case, of], [in, case, of]]","[4, 0, [Otherwise], [otherwise]]",1,0.543442,"[0.41631197929382324, 0.46312791109085083, 0.4...",normal,doc-9.5,1
4,"[4, 0, [Otherwise], [otherwise]]","[5, 0, [If], [if]]",1,0.540609,"[0.37106990814208984, 0.4367409646511078, 0.45...",normal,doc-9.5,1


In [6]:
# Same-gateway relations per document as returned by pet_reader, keyed by document name
same_gateway_relations = {}
for doc_name in pet_reader.document_names:
    same_gateway_relations[doc_name] = pet_reader.get_doc_relations(doc_name)[SAME_GATEWAY]

In [7]:
def enrich_sg_relations(df):
    """Add gold labels and readable gateway strings to a frame of same-gateway predictions.

    Args:
        df: one row per classified gateway pair with the columns ``doc_name``,
            ``gateway_1``, ``gateway_2``, ``label``, ``predictions_averaged``,
            ``predictions`` and ``sentence_distance``; ``comment`` is optional.
            Each gateway is indexed as ``[0]``/``[1]`` for its position,
            ``[2]`` for the tokens compared against the annotated relations and
            ``[3]`` for the tokens used to build the display strings.

    Returns:
        A new DataFrame (``df`` itself is not modified) with the added columns
        ``real_label`` (1 if the pair matches a relation in the module-level
        ``same_gateway_relations`` of its document, else 0), ``match``
        (``label == real_label``), ``gateway_element_pair``, ``gateway1_string``
        and ``gateway2_string``, in a fixed column order.

    Raises:
        KeyError: if a ``doc_name`` is missing from ``same_gateway_relations``
            or a required column is missing from ``df``.
    """
    real_labels = []
    label_matches = []
    gateway_element_pairs = []
    gateway1_strings = []
    gateway2_strings = []

    for doc_name, g1, g2, predicted_label in zip(df["doc_name"], df["gateway_1"], df["gateway_2"], df["label"]):
        # real label info
        # NOTE(review): only the token lists are compared, not the positions, so two pairs with
        # identical tokens in one document are both marked as related - confirm this is intended.
        is_related = any(sg_relation[SOURCE_ENTITY] == g1[2] and sg_relation[TARGET_ENTITY] == g2[2]
                         for sg_relation in same_gateway_relations[doc_name])
        real_label = int(is_related)
        real_labels.append(real_label)
        label_matches.append(predicted_label == real_label)

        # gateway strings
        g1_string = ' '.join(g1[3])
        g2_string = ' '.join(g2[3])
        gateway_element_pairs.append(f"{g1_string} ({g1[0]},{g1[1]}) "
                                     f"- {g2_string} ({g2[0]},{g2[1]})")
        gateway1_strings.append(g1_string)
        gateway2_strings.append(g2_string)

    # assign() attaches the lists positionally, so rows stay aligned for any index of df
    # (concatenating against a fresh 0..n-1 index mis-aligns filtered or re-indexed frames),
    # and it overwrites columns that already exist, so enriching twice does not duplicate them.
    df_new = df.assign(real_label=real_labels, match=label_matches, gateway_element_pair=gateway_element_pairs,
                       gateway1_string=gateway1_strings, gateway2_string=gateway2_strings)

    column_order = ["doc_name", "gateway_1", "gateway_2", "label", "real_label", "match"]
    if 'comment' in df.columns:
        column_order.append("comment")
    column_order += ["predictions_averaged", "predictions", "sentence_distance",
                     "gateway_element_pair", "gateway1_string", "gateway2_string"]
    return df_new[column_order]

# Rebind df_all to the enriched frame; the analysis cells below rely on the added
# columns (real_label, match, gateway_element_pair, gateway1_string, gateway2_string)
df_all = enrich_sg_relations(df_all)
df_all.head(5)

Unnamed: 0,doc_name,gateway_1,gateway_2,label,real_label,match,comment,predictions_averaged,predictions,sentence_distance,gateway_element_pair,gateway1_string,gateway2_string
0,doc-9.5,"[0, 13, [if], [if]]","[2, 7, [whereas], [whereas]]",1,0,False,normal,0.553544,"[0.39190834760665894, 0.4896750748157501, 0.45...",2,"if (0,13) - whereas (2,7)",if,whereas
1,doc-9.5,"[2, 7, [whereas], [whereas]]","[2, 11, [or], [or]]",1,0,False,normal,0.598277,"[0.4482057988643646, 0.5356715321540833, 0.456...",0,"whereas (2,7) - or (2,11)",whereas,or
2,doc-9.5,"[2, 11, [or], [or]]","[3, 0, [In, case, of], [in, case, of]]",1,0,False,normal,0.565167,"[0.4484378397464752, 0.500709056854248, 0.4589...",1,"or (2,11) - in case of (3,0)",or,in case of
3,doc-9.5,"[3, 0, [In, case, of], [in, case, of]]","[4, 0, [Otherwise], [otherwise]]",1,1,True,normal,0.543442,"[0.41631197929382324, 0.46312791109085083, 0.4...",1,"in case of (3,0) - otherwise (4,0)",in case of,otherwise
4,doc-9.5,"[4, 0, [Otherwise], [otherwise]]","[5, 0, [If], [if]]",1,1,True,normal,0.540609,"[0.37106990814208984, 0.4367409646511078, 0.45...",1,"otherwise (4,0) - if (5,0)",otherwise,if


## Analysis

### 1) How often does each label occur?

In [8]:
# Number of gateway pairs per predicted label (0 = non-related, 1 = related)
df_all.groupby("label")["label"].count()

label
0     43
1    103
Name: label, dtype: int64

In [99]:
# Number of gateway pairs per gold label (real_label is derived from the annotated same-gateway relations)
df_all.groupby("real_label")["real_label"].count()

real_label
0    107
1     39
Name: real_label, dtype: int64

USE -> The model predicts a same gateway relation far more often than not (103 related vs. 43 non-related), whereas in reality most pairs are non-related (107 non-related vs. 39 related).

In [98]:
# Confusion matrix as counts per (predicted label, gold label); every remaining column repeats the same group size
df_all.groupby(["label", "real_label"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_name,gateway_1,gateway_2,match,predictions_averaged,predictions,sentence_distance
label,real_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,31,31,31,31,31,31,31
0,1,12,12,12,12,12,12,12
1,0,76,76,76,76,76,76,76
1,1,27,27,27,27,27,27,27


USE -> Due to this, the precision is much better for pairs classified as non-related (31 of 43 correct, 0.72) than for pairs classified as related (27 of 103 correct, 0.26).

### 2) Predictions Variance

#### of predictions_averaged

In [9]:
# Distribution of the averaged prediction score within each predicted label
df_all.groupby("label")["predictions_averaged"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,43.0,0.420243,0.062941,0.259559,0.378139,0.432495,0.474202,0.499536
1,103.0,0.566864,0.034454,0.500124,0.54383,0.564449,0.583598,0.650096


#### of var in single predictions

In [15]:
# Spread (standard deviation and variance) of the individual scores stored under
# "predictions" for every gateway pair, summarised over all pairs
prediction_arrays = [np.asarray(p["predictions"]) for p in all_predictions]
df_predictions_var = pd.DataFrame({'std': [scores.std() for scores in prediction_arrays],
                                   'var': [scores.var() for scores in prediction_arrays]})
df_predictions_var.describe()

Unnamed: 0,std,var
count,146.0,146.0
mean,0.135812,0.019083
std,0.025343,0.006797
min,0.07208,0.005195
25%,0.117471,0.0138
50%,0.140563,0.019758
75%,0.153867,0.023675
max,0.189206,0.035799


In [16]:
# Split the flat prediction records by predicted label in a single pass
predictions1, predictions0 = [], []
for prediction in all_predictions:
    if prediction["label"] == 1:
        predictions1.append(prediction)
    elif prediction["label"] == 0:
        predictions0.append(prediction)

In [17]:
# Spread of the individual scores, restricted to pairs predicted as related (label 1)
prediction1_arrays = [np.asarray(p["predictions"]) for p in predictions1]
df_predictions1_var = pd.DataFrame({'std': [scores.std() for scores in prediction1_arrays],
                                    'var': [scores.var() for scores in prediction1_arrays]})
df_predictions1_var.describe()

Unnamed: 0,std,var
count,103.0,103.0
mean,0.144719,0.021303
std,0.019046,0.005434
min,0.093616,0.008764
25%,0.127437,0.01624
50%,0.147877,0.021868
75%,0.158021,0.024971
max,0.18803,0.035355


In [18]:
# Spread of the individual scores, restricted to pairs predicted as non-related (label 0)
prediction0_arrays = [np.asarray(p["predictions"]) for p in predictions0]
df_predictions0_var = pd.DataFrame({'std': [scores.std() for scores in prediction0_arrays],
                                    'var': [scores.var() for scores in prediction0_arrays]})
df_predictions0_var.describe()

Unnamed: 0,std,var
count,43.0,43.0
mean,0.114476,0.013765
std,0.025993,0.006839
min,0.07208,0.005195
25%,0.095734,0.009165
50%,0.111943,0.012531
75%,0.124039,0.015386
max,0.189206,0.035799


## 3) Positional Structure

### a) Sentence Distance

In [21]:
# Sentence-distance statistics per predicted label.
# Side effect: the table is also written to an Excel file; the target directory must already
# exist and pandas needs an Excel writer engine (e.g. openpyxl) to be installed.
df_sentence_distance_stats = df_all.groupby(["label"])["sentence_distance"].describe()
df_sentence_distance_stats.to_excel("../../data/paper_stats/same_gateway_cls/sgc_sentence_distance_stats.xlsx", index=True)
df_sentence_distance_stats

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,43.0,1.581395,1.828772,0.0,0.0,1.0,2.0,7.0
1,103.0,1.087379,1.103531,0.0,0.0,1.0,1.5,6.0


USE -> Pairs with a higher sentence distance tend to be classified as non-related by the model (mean sentence distance of 1.58 for non-related vs. 1.09 for related predictions).

In [22]:
# Sentence-distance statistics for wrong (match == False) vs. correct (match == True) predictions.
# Side effect: the table is also written to an Excel file in an existing target directory.
df_sentence_distance_match_quality = df_all.groupby(["match"])["sentence_distance"].describe()
df_sentence_distance_match_quality.to_excel("../../data/paper_stats/same_gateway_cls/sgc_sentence_distance_match_quality.xlsx", index=True)
df_sentence_distance_match_quality

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,88.0,1.125,1.16276,0.0,0.0,1.0,2.0,6.0
True,58.0,1.396552,1.632345,0.0,0.0,1.0,2.0,7.0


-> Prediction quality is better for higher sentence distances (mean sentence distance of 1.40 for correct vs. 1.13 for wrong predictions). In these cases it is easier for the model to distinguish related from non-related gateway pairs.

In [24]:
# Sentence-distance statistics broken down by prediction correctness and predicted label.
# Side effect: the table is also written to an Excel file in an existing target directory.
df_sentence_distance_label_stats = df_all.groupby(["match", "label"])["sentence_distance"].describe()
df_sentence_distance_label_stats.to_excel("../../data/paper_stats/same_gateway_cls/sgc_sentence_distance_label_stats.xlsx", index=True)
df_sentence_distance_label_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
match,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,0,12.0,1.083333,0.792961,0.0,1.0,1.0,1.0,3.0
False,1,76.0,1.131579,1.214821,0.0,0.0,1.0,2.0,6.0
True,0,31.0,1.774194,2.077012,0.0,0.0,1.0,2.5,7.0
True,1,27.0,0.962963,0.706099,0.0,1.0,1.0,1.0,3.0


USE -> Correctly classified gateway pairs tend to be closer together in the case of related pairs (mean sentence distance of 0.96) and further apart in the case of non-related pairs (mean sentence distance of 1.77). In comparison, the difference in the mean sentence distance of wrong predictions between both classes is much smaller (1.13 for related and 1.08 for non-related). This demonstrates that the distance between the gateways being evaluated is an important characteristic that can be used by the model.

In [91]:
# Number of pairs predicted as related (label 1) for each sentence distance
df_all[df_all["label"] == 1].groupby("sentence_distance")["label"].count()

sentence_distance
0    31
1    46
2    19
3     4
4     1
6     2
Name: label, dtype: int64

USE -> Furthermore, analyzing the sentence distances in the predictions revealed that three gateway pairs that were classified as related have a sentence distance of > 3 (1x4, 2x6). This is an obvious error, because the maximum sentence distance of related gateways in the dataset is 3. Hence, a simple filtering rule applied after the classification to all pairs with a sentence distance > 3 would improve the classification results.

### b) Position in sentence

In [38]:
# Position features per pair: the second entry of each gateway (its index inside the
# sentence) and whether the gateway opens its sentence (index 0).
df_all_positions = df_all.copy()
# Series.map reads the index once per column instead of iterating the whole frame
# row by row with iterrows() for every new column
df_all_positions["gateway_1_idx"] = df_all_positions["gateway_1"].map(lambda gateway: gateway[1])
df_all_positions["gateway_2_idx"] = df_all_positions["gateway_2"].map(lambda gateway: gateway[1])
df_all_positions["gateway1_sentence_start"] = df_all_positions["gateway_1_idx"] == 0
df_all_positions["gateway2_sentence_start"] = df_all_positions["gateway_2_idx"] == 0
# Only the count column is informative here, because label is constant within each group
df_all_positions.groupby(["label", "gateway1_sentence_start", "gateway2_sentence_start"])[["label"]].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label,label,label,label,label,label,label,label
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max
label,gateway1_sentence_start,gateway2_sentence_start,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,False,False,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,False,True,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,True,False,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,True,True,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,False,36.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,False,True,21.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,True,False,22.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,True,True,24.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [39]:
# Grouped by gold label and sentence-start flags; since label is 0/1, its mean is the
# share of pairs in each group that the model predicted as related
df_all_positions.groupby(["real_label", "gateway1_sentence_start", "gateway2_sentence_start"])[["label"]].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label,label,label,label,label,label,label,label
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max
real_label,gateway1_sentence_start,gateway2_sentence_start,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,False,False,46.0,0.76087,0.431266,0.0,1.0,1.0,1.0,1.0
0,False,True,29.0,0.689655,0.470824,0.0,0.0,1.0,1.0,1.0
0,True,False,26.0,0.653846,0.485165,0.0,0.0,1.0,1.0,1.0
0,True,True,6.0,0.666667,0.516398,0.0,0.25,1.0,1.0,1.0
1,False,False,3.0,0.333333,0.57735,0.0,0.0,0.0,0.5,1.0
1,False,True,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
1,True,False,6.0,0.833333,0.408248,0.0,1.0,1.0,1.0,1.0
1,True,True,29.0,0.689655,0.470824,0.0,0.0,1.0,1.0,1.0


### 4) Concrete gateway pairs

In [140]:
# Counts per gateway pair string, predicted label and prediction correctness
df_all.groupby(["gateway_element_pair", "label", "match"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,doc_name,gateway_1,gateway_2,real_label,predictions_averaged,predictions,sentence_distance,gateway1_string,gateway2_string
gateway_element_pair,label,match,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"((either,), (if,))",1,False,1,1,1,1,1,1,1,1,1
"((either,), (in, case, of))",1,False,1,1,1,1,1,1,1,1,1
"((either,), (or,))",0,True,1,1,1,1,1,1,1,1,1
"((either,), (or,))",1,False,1,1,1,1,1,1,1,1,1
"((either,), (or,))",1,True,1,1,1,1,1,1,1,1,1
"((for, each, patient, for, which), (only,))",0,True,1,1,1,1,1,1,1,1,1
"((for, the, case), (for, the, case))",1,True,2,2,2,2,2,2,2,2,2
"((for, the, case), (otherwise,))",1,False,1,1,1,1,1,1,1,1,1
"((if,), (either,))",0,True,1,1,1,1,1,1,1,1,1
"((if,), (either,))",1,False,2,2,2,2,2,2,2,2,2


USE -> Another hint that the model does not rely only on the n-gram pairs as a feature when making its decision is that, e.g., the widely used combination 'if' -> 'if' is not always assigned to the same class. In 15 cases a pair ('if' -> 'if') is classified with a same gateway relation, in 12 cases as non-related.

In [156]:
# For each keyword, print how many pairs have it as first or second gateway string
for keyword in ("or", "if"):
    involves_keyword = (df_all["gateway1_string"] == keyword) | (df_all["gateway2_string"] == keyword)
    print(keyword, df_all[involves_keyword].count()["doc_name"])

or 70
if 85


USE -> The most involved gateway phrases are 'if' and 'or' which are involved in 85 and 70 pairs.

In [152]:
# Pairs where both gateways are 'or', counted per (gold label, predicted label)
df_all[(df_all["gateway1_string"] == 'or') & (df_all["gateway2_string"] == 'or')].groupby(["real_label", "label"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_name,gateway_1,gateway_2,match,predictions_averaged,predictions,sentence_distance,gateway_element_pair,gateway1_string,gateway2_string
real_label,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,2,2,2,2,2,2,2,2,2,2
0,1,18,18,18,18,18,18,18,18,18,18


USE -> As introduced in section 4.2.1 (xor rules), 'or' usually works as a stand-alone phrase that can indicate a gateway. Only in one document is it used in a same gateway relation with another keyword. But in 52 out of 70 cases a gateway pair involving at least one 'or' token is classified with a same gateway relation even though the gateways are not related. Neglecting this one sample in favor of overall performance, a post-classification rule could be established that filters out all same gateway relations involving an 'or' token. This is especially important if the extraction of gateways does not work perfectly and may contain false positive 'or' gateway tokens.

In [151]:
# Pairs where at least one gateway is 'or', counted per (gold label, predicted label)
df_all[(df_all["gateway1_string"] == 'or') | (df_all["gateway2_string"] == 'or')].groupby(["real_label", "label"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_name,gateway_1,gateway_2,match,predictions_averaged,predictions,sentence_distance,gateway_element_pair,gateway1_string,gateway2_string
real_label,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,17,17,17,17,17,17,17,17,17,17
0,1,52,52,52,52,52,52,52,52,52,52
1,1,1,1,1,1,1,1,1,1,1,1


### 5) Which kinds of samples work best?

#### label 1

In [84]:
# Pairs predicted as related, highest averaged score first; show the top 20
df_sorted_label1 = df_all[df_all["label"] == 1].sort_values("predictions_averaged", ascending=False)
df_sorted_label1.head(20)

Unnamed: 0,doc_name,gateway_1,gateway_2,label,real_label,match,predictions_averaged,predictions,sentence_distance
9,doc-3.3,"[3, 0, [If], [if]]","[4, 0, [If], [if]]",1,1,True,0.650096,"[0.6054072976112366, 0.5772814750671387, 0.459...",1
8,doc-3.3,"[2, 17, [or], [or]]","[3, 0, [If], [if]]",1,0,False,0.648205,"[0.600063145160675, 0.6088016033172607, 0.4579...",1
78,doc-3.6,"[2, 0, [Otherwise], [otherwise]]","[3, 6, [or], [or]]",1,0,False,0.641953,"[0.6258020997047424, 0.587813138961792, 0.4588...",1
77,doc-3.6,"[1, 0, [If, not], [if, not]]","[2, 0, [Otherwise], [otherwise]]",1,0,False,0.638831,"[0.619398832321167, 0.6104929447174072, 0.4571...",1
79,doc-3.6,"[3, 6, [or], [or]]","[5, 0, [If], [if]]",1,0,False,0.633505,"[0.6322157382965088, 0.581949770450592, 0.4608...",2
80,doc-3.6,"[5, 0, [If], [if]]","[6, 0, [Otherwise], [otherwise]]",1,1,True,0.632723,"[0.5925711393356323, 0.5660861730575562, 0.456...",1
37,doc-10.2,"[8, 0, [If], [if]]","[9, 0, [If], [if]]",1,1,True,0.631204,"[0.6962703466415405, 0.5464240908622742, 0.466...",1
36,doc-10.2,"[7, 14, [if], [if]]","[8, 0, [If], [if]]",1,1,True,0.631025,"[0.7189033031463623, 0.5765438079833984, 0.464...",1
125,doc-5.1,"[4, 13, [or], [or]]","[5, 7, [or], [or]]",1,0,False,0.62436,"[0.46113142371177673, 0.6281872391700745, 0.45...",1
123,doc-5.1,"[4, 0, [If], [if]]","[4, 6, [or], [or]]",1,0,False,0.623473,"[0.4537869691848755, 0.6076458692550659, 0.452...",0


#### label 0

In [85]:
# Pairs predicted as non-related, lowest averaged score first; show the top 20
df_sorted_label0 = df_all[df_all["label"] == 0].sort_values("predictions_averaged", ascending=True)
df_sorted_label0.head(20)

Unnamed: 0,doc_name,gateway_1,gateway_2,label,real_label,match,predictions_averaged,predictions,sentence_distance
13,doc-6.1,"[7, 24, [if], [if]]","[9, 2, [if], [if]]",0,1,False,0.259559,"[0.24245518445968628, 0.16857124865055084, 0.4...",2
17,doc-6.1,"[10, 33, [or], [or]]","[14, 0, [If], [if]]",0,0,True,0.301928,"[0.24395005404949188, 0.17441482841968536, 0.4...",4
15,doc-6.1,"[9, 18, [or], [or]]","[10, 22, [if], [if]]",0,0,True,0.311987,"[0.26269295811653137, 0.1765115112066269, 0.44...",1
10,doc-6.1,"[0, 11, [or], [or]]","[6, 0, [Should], [should]]",0,0,True,0.324713,"[0.3848884105682373, 0.21421490609645844, 0.45...",6
14,doc-6.1,"[9, 2, [if], [if]]","[9, 18, [or], [or]]",0,0,True,0.328826,"[0.2923762798309326, 0.2161490023136139, 0.464...",0
16,doc-6.1,"[10, 22, [if], [if]]","[10, 33, [or], [or]]",0,0,True,0.33758,"[0.28437498211860657, 0.21744626760482788, 0.4...",0
44,doc-5.3,"[5, 0, [If], [if]]","[6, 0, [If], [if]]",0,1,False,0.340614,"[0.2700202465057373, 0.19460277259349823, 0.45...",1
12,doc-6.1,"[7, 23, [or], [or]]","[7, 24, [if], [if]]",0,0,True,0.350301,"[0.2906705439090729, 0.29827961325645447, 0.45...",0
45,doc-5.3,"[6, 0, [If], [if]]","[9, 9, [or], [or]]",0,0,True,0.352425,"[0.2749823331832886, 0.22853326797485352, 0.45...",3
11,doc-6.1,"[6, 0, [Should], [should]]","[7, 23, [or], [or]]",0,0,True,0.359389,"[0.31587454676628113, 0.30705299973487854, 0.4...",1


# 2) Applying Easy Rules additionally

## Load Data

In [17]:
# Load the classifications produced with the additional post-classification rules.
# The path is relative to the notebook directory, like the first "Load Data" cell,
# instead of an absolute path that only exists on one machine.
with open("../../data/results/same_gateway_classified/key_words_custom_sg_classified_rules_[e5_context_text_labels_ngram_c1_n0_syn]/sg_classifications.json",
          'r') as file:
    rules_predictions = json.load(file)

# Flatten to one record per gateway pair, adding the document name and the sentence distance
rules_all_predictions = []
for doc_name, doc_predictions in rules_predictions.items():
    for p in doc_predictions:
        # Unwrap both gateways when they are stored under the ELEMENT key
        # (presumably a dict-style export of the gateway - TODO confirm);
        # the loaded records are normalised in place.
        if ELEMENT in p["gateway_1"]:
            p["gateway_1"] = p["gateway_1"][ELEMENT]
            p["gateway_2"] = p["gateway_2"][ELEMENT]
        rules_all_predictions.append({**p,
                                      "doc_name": doc_name,
                                      "sentence_distance": abs(p["gateway_1"][0] - p["gateway_2"][0])})
df_rules_all = pd.DataFrame(rules_all_predictions)
df_rules_all = enrich_sg_relations(df_rules_all)
df_rules_all.info()
df_rules_all.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   doc_name              146 non-null    object 
 1   gateway_1             146 non-null    object 
 2   gateway_2             146 non-null    object 
 3   label                 146 non-null    int64  
 4   real_label            146 non-null    int64  
 5   match                 146 non-null    bool   
 6   comment               146 non-null    object 
 7   predictions_averaged  146 non-null    float64
 8   predictions           146 non-null    object 
 9   sentence_distance     146 non-null    int64  
 10  gateway_element_pair  146 non-null    object 
 11  gateway1_string       146 non-null    object 
 12  gateway2_string       146 non-null    object 
dtypes: bool(1), float64(1), int64(3), object(8)
memory usage: 14.0+ KB


Unnamed: 0,doc_name,gateway_1,gateway_2,label,real_label,match,comment,predictions_averaged,predictions,sentence_distance,gateway_element_pair,gateway1_string,gateway2_string
0,doc-6.1,"[0, 11, [or], [or]]","[6, 0, [Should], [should]]",0,0,True,rule: sentence distance > 3,0.0,[0],6,"or (0,11) - should (6,0)",or,should
1,doc-6.1,"[6, 0, [Should], [should]]","[7, 23, [or], [or]]",0,0,True,rule: involves 'or',0.0,[0],1,"should (6,0) - or (7,23)",should,or
2,doc-6.1,"[7, 23, [or], [or]]","[7, 24, [if], [if]]",0,0,True,rule: involves 'or',0.0,[0],0,"or (7,23) - if (7,24)",or,if
3,doc-6.1,"[7, 24, [if], [if]]","[9, 2, [if], [if]]",0,1,False,normal,0.259559,"[0.24245518445968628, 0.16857124865055084, 0.4...",2,"if (7,24) - if (9,2)",if,if
4,doc-6.1,"[9, 2, [if], [if]]","[9, 18, [or], [or]]",0,0,True,rule: involves 'or',0.0,[0],0,"if (9,2) - or (9,18)",if,or


## Analysis

### A) Count new rules

In [192]:
# Pairs per predicted label and comment; the comment shows whether a rule or the normal classification set the label
df_rules_all.groupby(["label", "comment"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_name,gateway_1,gateway_2,real_label,match,predictions_averaged,predictions,sentence_distance,gateway_element_pair,gateway1_string,gateway2_string
label,comment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,normal,23,23,23,23,23,23,23,23,23,23,23
0,rule: involves 'or',65,65,65,65,65,65,65,65,65,65,65
0,rule: sentence distance > 3,10,10,10,10,10,10,10,10,10,10,10
1,normal,48,48,48,48,48,48,48,48,48,48,48


### B) Check label distribution

In [26]:
# Predicted-label distribution without rules (df_all, printed) and with rules (df_rules_all, cell output)
print("Old\n")
print(df_all.groupby("label")["label"].count())

print("\nNew")
df_rules_all.groupby("label")["label"].count()

Old

label
0     43
1    103
Name: label, dtype: int64

New


label
0    98
1    48
Name: label, dtype: int64

In [27]:
# Confusion counts per (predicted label, gold label) without the rules
df_all.groupby(["label", "real_label"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_name,gateway_1,gateway_2,match,predictions_averaged,predictions,sentence_distance,gateway_element_pair,gateway1_string,gateway2_string
label,real_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,31,31,31,31,31,31,31,31,31,31
0,1,12,12,12,12,12,12,12,12,12,12
1,0,76,76,76,76,76,76,76,76,76,76
1,1,27,27,27,27,27,27,27,27,27,27


In [28]:
# Confusion counts per (predicted label, gold label) with the rules applied
df_rules_all.groupby(["label", "real_label"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_name,gateway_1,gateway_2,match,comment,predictions_averaged,predictions,sentence_distance,gateway_element_pair,gateway1_string,gateway2_string
label,real_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,85,85,85,85,85,85,85,85,85,85,85
0,1,13,13,13,13,13,13,13,13,13,13,13
1,0,22,22,22,22,22,22,22,22,22,22,22
1,1,26,26,26,26,26,26,26,26,26,26,26


### C) Check differences to old one -> Validate rules

In [18]:
# Join the predictions without rules (suffix _x) and with rules (suffix _y) on document and gateway pair.
# 'comment' is dropped from the left frame first: when both frames carry it, the merge would
# produce comment_x / comment_y and the selection of 'comment' below would raise a KeyError.
# The remaining 'comment' column therefore always comes from df_rules_all.
df_merged = pd.merge(left=df_all.drop(columns=['comment'], errors='ignore'), right=df_rules_all,
                     on=['doc_name', "gateway_element_pair"])
df_merged = df_merged[['doc_name', 'gateway_element_pair', 'real_label_x', 'label_x', 'match_x', 'label_y', 'match_y', 'comment']]
df_merged.head(5)

Unnamed: 0,doc_name,gateway_element_pair,real_label_x,label_x,match_x,label_y,match_y,comment
0,doc-3.5,"if (5,0) - otherwise (6,0)",1,1,True,1,True,normal
1,doc-3.8,"if (4,0) - otherwise (5,0)",1,1,True,1,True,normal
2,doc-3.8,"otherwise (5,0) - or (6,9)",0,1,False,0,True,rule: involves 'or'
3,doc-4.1,"for each patient for which (26,0) - only (33,7)",0,0,True,0,True,rule: sentence distance > 3
4,doc-4.1,"only (33,7) - if (33,15)",0,1,False,1,False,normal


In [30]:
# Pairs whose predicted label differs between the run without rules (label_x) and with rules (label_y)
df_diffs = df_merged[df_merged['label_x'] != df_merged['label_y']]
print(len(df_diffs))
df_diffs.head(10)

55


Unnamed: 0,doc_name,gateway_element_pair,real_label_x,label_x,match_x,label_y,match_y,comment
2,doc-3.8,"otherwise (5,0) - or (6,9)",0,1,False,0,True,rule: involves 'or'
7,doc-1.3,"if (10,8) - or (10,15)",0,1,False,0,True,rule: involves 'or'
8,doc-3.3,"or (2,17) - if (3,0)",0,1,False,0,True,rule: involves 'or'
23,doc-2.2,"in case of (11,0) - or (11,15)",0,1,False,0,True,rule: involves 'or'
24,doc-2.2,"or (11,15) - in case of (13,0)",0,1,False,0,True,rule: involves 'or'
25,doc-2.2,"in case of (13,0) - if (19,0)",0,1,False,0,True,rule: sentence distance > 3
27,doc-2.2,"either (20,19) - or (20,25)",1,1,True,0,False,rule: involves 'or'
28,doc-2.2,"or (20,25) - either (21,13)",0,1,False,0,True,rule: involves 'or'
30,doc-2.2,"in case of (21,16) - or (21,21)",0,1,False,0,True,rule: involves 'or'
31,doc-2.2,"or (21,21) - in case of (21,24)",0,1,False,0,True,rule: involves 'or'
