# Objective

- Propose a tutorial on how to use our library to mine differential causal rules on Knowledge Graphs

# 1 : Imports and Parameters Definition

## Libraries Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import math
import random
import copy
import sys
import itertools

In [2]:
import ampligraph
import tensorflow as tf

# The notebook was written against these exact versions (it relies on the
# tf.logging API further down). Report a mismatch instead of staying silent.
if ampligraph.__version__ == '1.4.0':
    print("AmpliGraph version OK")
else:
    print(f"WARNING: AmpliGraph {ampligraph.__version__} found, this notebook was written for 1.4.0")
if tf.__version__ == '1.15.2':
    print("TensorFlow version OK")
else:
    print(f"WARNING: TensorFlow {tf.__version__} found, this notebook was written for 1.15.2")

AmpliGraph version OK
TensorFlow version OK


In [3]:
from ampligraph.datasets import load_from_csv
from ampligraph.evaluation import train_test_split_no_unseen
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score
from ampligraph.evaluation import evaluate_performance
from ampligraph.latent_features.models import ConvE, DistMult, ComplEx, TransE, RandomBaseline

In [4]:
sys.path.append('../similarity_search')
from pairs_mining import *
from threshold_estimation import *
from distance_threshold_estimation import *
sys.path.append('../dcr_discovery')
from metrics import *

## Data Import

In [5]:
directory_path = '../datasets'
file_name = 'dbpedia_extract.csv'
X = load_from_csv(directory_path,file_name, sep=',')

In [6]:
# checking the import
print(f"The knowledge graph is composed of {len(X)} triples")

The knowledge graph is composed of 6908 triples


In [7]:
relations = list(np.unique([x[1] for x in X]))

In [8]:
relations_bug = []

In [9]:
X_try = [x for x in X if x[1] not in relations_bug]

Training works:
- with the old version
- without the 3 modifications
- with birthDate
- with arwuW

Training does NOT work:
- with all 3 modifications
- with endowment
- with arwuW AND birthDate

## Constant Values

In [10]:
# Predicate / class pair used to select the instances of the target class.
PATH_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
TYPE_TARGET_CLASS = 'http://dbpedia.org/ontology/Writer'

# Treatment and outcome property paths; PATH_OUTCOME[0] is later removed from dic_functionality.
PATH_TREATMENT = []
PATH_OUTCOME = ['http://dbpedia.org/ontology/releaseDate']

In [11]:
# Share of the training triples held out to evaluate the embedding model.
TEST_SET_PROPORTION = 0.1

# NOTE(review): the next four constants are not referenced in the part of the
# notebook reviewed here (the confidence-interval cell sets its own stat_param) -- confirm they are still needed.
SAMPLING_PARAMETER = 20
STAT_PARAM = 1.96
SUPPORT_THRESHOLD = 10
SIMILARITY_THRESHOLD = 0.75
# Number of epochs passed to the embedding model.
EPOCHS = 150

# 2 : Embedding Model - Training sets Definition, Training and Metrics

- For this example, we train the DistMult model
- However, we recommend that the reader visit the AmpliGraph library documentation to test other models (e.g. ConvE, ComplEx, TransE)

In [13]:
size_before_sub = len(X_try)

In [15]:
# Subsampling: randomly drop about 15% of the triples.
# A dedicated seeded generator makes the subsample reproducible on
# "Restart & Run All" without altering the global `random` state.
# NOTE: this cell rebinds X_try, so running it twice subsamples twice.
subsampling_rng = random.Random(555)
X_try = [x for x in X_try if subsampling_rng.random() > 0.15]
size_after_sub = len(X_try)

In [16]:
print((size_before_sub-size_after_sub)/size_before_sub)

0.150839606253619


In [17]:
X_training = np.array(X_try)

In [18]:
# defining the training sets
# X_training = np.array([x for x in X if x[1] not in PATHS_TO_REMOVE_IN_TRAINING])

test_size = round(TEST_SET_PROPORTION*len(X_training))
X_train, X_test = train_test_split_no_unseen(X_training, test_size=test_size)

In [19]:
positives_filter = X_training

In [20]:
model_name = 'DistMult'
model = DistMult(batches_count=100,
                 seed=555,
                 epochs=EPOCHS,
                 k=150,
                 eta=5,
                 loss='pairwise',
                 loss_params={'margin':5})

In [21]:
# training the model
tf.logging.set_verbosity(tf.logging.ERROR)
model.fit(X_train, early_stopping = False)

In [22]:
ranks = evaluate_performance(X_test, 
                             model=model, 
                             filter_triples=positives_filter,
                             filter_unseen=True,
                             use_default_protocol=True,
                             verbose=True)



In [23]:
# Ranking quality of the trained model: mean reciprocal rank and Hits@N.
mrr = mrr_score(ranks)
hits_10, hits_3, hits_1 = (hits_at_n_score(ranks, n=n_) for n_ in (10, 3, 1))

model_performances = {
    'mrr': mrr,
    'hits_10': hits_10,
    'hits_3': hits_3,
    'hits_1': hits_1,
}

In [24]:
# One-row summary table built straight from the scalar metrics.
# model_performances is left untouched, so the cell can be re-run safely
# (wrapping the values in lists in place nested them once more on every run).
df_model_performances = pd.DataFrame([model_performances])

In [25]:
df_model_performances

Unnamed: 0,mrr,hits_10,hits_3,hits_1
0,0.188698,0.260647,0.206985,0.143952


# 3 : Determining the distance threshold for matching 

- In this part, we sample a set of pairs and for each of them compute its (i) distance and (ii) similarity measure
- Then, we model the relation between the 2 measures and obtain a distance threshold based on the given parameter on the similarity

In [26]:
list_target_class_instances = [x[0] for x in X if x[1]==PATH_TYPE and x[2]==TYPE_TARGET_CLASS]

In [27]:
all_types = list(np.unique([x[2] for x in X if x[1] == PATH_TYPE]))
type_end = all_types

In [28]:
from parameters_DBPedia import dic_functionality
dic_functionality = {key:value for key,value in dic_functionality.items() if key not in [PATH_OUTCOME[0]]}

In [29]:
dic_functionality

{'http://dbpedia.org/ontology/arwuW': 1,
 'http://dbpedia.org/ontology/author': 1,
 'http://dbpedia.org/ontology/birthDate': 1,
 'http://dbpedia.org/ontology/countryName': 1,
 'http://dbpedia.org/ontology/endowment': 1,
 'http://dbpedia.org/ontology/genre': 1,
 'http://dbpedia.org/ontology/hasForStudent': 1,
 'http://dbpedia.org/ontology/isCountryOf': 1,
 'http://dbpedia.org/ontology/numberOfPages': 1,
 'http://xmlns.com/foaf/0.1/gender': 1}

## 3.1 : Compute similarity for two instances

In [30]:
def get_subjects_of_property(property_,X):
    """
    Returns the distinct subjects that occur with a property.

    property_ : predicate compared with the second element of every triple.
    X : iterable of (subject, predicate, object) triples.

    The result is a list in the sorted order produced by np.unique.
    """
    subjects_ = []
    for triple_ in X:
        if triple_[1] == property_:
            subjects_.append(triple_[0])
    return list(np.unique(subjects_))

In [31]:
def generate_array_triples_for_subject(object_,property_,subjects_):
    """
    Builds one candidate triple [subject, property_, object_] per entry of
    subjects_ and returns them as a numpy array, ready to be scored.
    """
    candidate_triples = []
    for subject_ in subjects_:
        candidate_triples.append([subject_,property_,object_])
    return np.array(candidate_triples)

In [32]:
def create_df_values_scores_for_subject(subjects_,property_,object_,scores_):
    """
    Tabulates candidate triples with their scores, best score first.

    subjects_ : sequence of candidate subjects.
    property_, object_ : predicate and object shared by every candidate.
    scores_ : sequence of scores, scores_[i] belonging to subjects_[i].

    Returns a DataFrame with columns 'subject', 'predicate', 'object' and
    'score', sorted by decreasing score. An empty subjects_ yields an empty
    frame with those columns.
    """
    n_rows = len(subjects_)
    # Build the frame in one go: growing it row by row with DataFrame.append
    # is quadratic, and that method no longer exists in pandas 2.0.
    df_ = pd.DataFrame(
        {
            'subject': list(subjects_),
            'predicate': [property_]*n_rows,
            'object': [object_]*n_rows,
            'score': [scores_[i] for i in range(n_rows)],
        },
        columns=['subject','predicate','object','score'],
    )
    return df_.sort_values(by=['score'],ascending=False)

In [33]:
def get_n_subjects_for_property_object(property_,object_,subjects_,model,func_):
    """
    Returns the func_ best-scored subjects for a (property_, object_) pair.

    Every entry of subjects_ is turned into a candidate triple, scored with
    model.predict, and the subjects are returned by decreasing score.
    """
    candidates_ = generate_array_triples_for_subject(object_,property_,subjects_)
    ranked_ = create_df_values_scores_for_subject(
        subjects_,property_,object_,model.predict(candidates_)
    )
    ranked_subjects = list(ranked_['subject'])
    return ranked_subjects[:func_]

In [34]:
def get_endowment(endowment_text):
    """
    Converts an endowment literal to a float.

    Text containing an upper-case 'E' is read as '<mantissa>E<exponent>' and
    rebuilt as mantissa * 10**exponent; any other text goes through float().
    """
    if 'E' not in endowment_text:
        return float(endowment_text)
    parts_ = endowment_text.split('E')
    mantissa_ = float(parts_[0])
    exponent_ = int(parts_[1])
    return mantissa_*10**exponent_

In [35]:
def get_DBPedia_similarity(pair_,model,X,dic_functionality):
    """
    Scores how alike two instances are, as the share of attribute checks passed.

    For both instances the model's top-ranked value is looked up, then:
      - birth dates must be at most 15 apart,
      - genres must be equal,
      - genders must be equal,
      - universities (hasForStudent) must be equal; when they differ, this
        check is replaced by three checks on the universities themselves:
        ARWU ranks at most 30 apart, endowments within a factor 2, same
        country (isCountryOf).

    pair_ : the two entities to compare (pair_[0] and pair_[1] are read).
    model : embedding model, handed to the ranking helpers.
    X : iterable of (subject, predicate, object) triples.
    dic_functionality : per-property value handed to the ranking helpers as
        the number of top-ranked candidates to return.

    Returns sum(checks)/len(checks). Best effort: if any lookup or conversion
    fails, the pair is treated as not similar and 0 is returned.

    NOTE(review): get_objects_of_property and get_n_objects_for_property_entity
    are not defined in this notebook; presumably they come from the star
    imports -- confirm.
    """
    def best_objects(p_,entities_):
        # Top-ranked object of p_ for each entity; candidates are gathered once.
        objects_for_p = get_objects_of_property(p_,X)
        return [
            get_n_objects_for_property_entity(e_,p_,objects_for_p,model,dic_functionality[p_])[0]
            for e_ in entities_
        ]

    def best_subjects(p_,objects_):
        # Top-ranked subject of p_ for each object; candidates are gathered once.
        subjects_for_p = get_subjects_of_property(p_,X)
        return [
            get_n_subjects_for_property_object(p_,o_,subjects_for_p,model,dic_functionality[p_])[0]
            for o_ in objects_
        ]

    entities_ = (pair_[0],pair_[1])
    similarity = []

    try:
        # birthdate
        birthdate_i0,birthdate_i1 = [
            int(v_) for v_ in best_objects('http://dbpedia.org/ontology/birthDate',entities_)
        ]
        similarity.append(1 if abs(birthdate_i0-birthdate_i1) <= 15 else 0)

        # genre
        genre_i0,genre_i1 = best_objects('http://dbpedia.org/ontology/genre',entities_)
        similarity.append(1 if genre_i0 == genre_i1 else 0)

        # gender
        gender_i0,gender_i1 = best_objects('http://xmlns.com/foaf/0.1/gender',entities_)
        similarity.append(1 if gender_i0 == gender_i1 else 0)

        # university
        uni_i0,uni_i1 = best_subjects('http://dbpedia.org/ontology/hasForStudent',entities_)
        if uni_i0 == uni_i1:
            similarity.append(1)
        else:
            universities_ = (uni_i0,uni_i1)

            # arwuW
            arwu_i0,arwu_i1 = [
                int(v_) for v_ in best_objects('http://dbpedia.org/ontology/arwuW',universities_)
            ]
            similarity.append(1 if abs(arwu_i0-arwu_i1) <= 30 else 0)

            # endowment (a zero endowment raises ZeroDivisionError, handled
            # below like any other failed lookup)
            endowment_i0,endowment_i1 = [
                get_endowment(v_) for v_ in best_objects('http://dbpedia.org/ontology/endowment',universities_)
            ]
            min_ = min([endowment_i0,endowment_i1])
            max_ = max([endowment_i0,endowment_i1])
            similarity.append(1 if max_/min_ < 2 else 0)

            # country: compare the two countries themselves (the second
            # lookup used to overwrite the first and the genders were
            # compared instead)
            country_i0,country_i1 = best_subjects('http://dbpedia.org/ontology/isCountryOf',universities_)
            similarity.append(1 if country_i0 == country_i1 else 0)

        return sum(similarity)/len(similarity)
    except Exception:
        # Deliberate best effort, but KeyboardInterrupt / SystemExit now propagate.
        return 0

In [36]:
list_pairs = list(itertools.combinations(list_target_class_instances,2))

In [37]:
def get_distributed_points(list_distance,list_similarity,number_intervals,number_per_interval):
    """
    Thins out (distance, similarity) points so that they are spread evenly
    over the distance range.

    The range [min, max] of list_distance is cut into number_intervals-1
    equal-width bins and at most number_per_interval points are kept per bin
    (drawn with random.sample when a bin holds more).

    list_distance, list_similarity : parallel sequences, one entry per point.
    number_intervals : number of bin boundaries, must be at least 2.
    number_per_interval : maximum number of points kept per bin.

    Returns (distances, similarities) of the kept points, ordered by bin.
    Raises ValueError when number_intervals is smaller than 2.
    """
    if number_intervals < 2:
        raise ValueError("number_intervals must be at least 2")
    if len(list_distance) == 0:
        return [], []

    lowest_ = min(list_distance)
    number_bins = number_intervals-1
    interval_length = (max(list_distance)-lowest_)/number_bins

    # Every point goes to exactly one bin. Testing both bounds inclusively
    # placed a point lying on a shared boundary into both neighbouring bins,
    # so it could be returned twice.
    bins_ = [[] for _ in range(number_bins)]
    for i, distance_point in enumerate(list_distance):
        if interval_length > 0:
            # The maximum falls on the last boundary: clamp it into the last bin.
            position_ = min(int((distance_point-lowest_)/interval_length),number_bins-1)
        else:
            # All distances are equal: a single bin holds everything.
            position_ = 0
        bins_[position_].append(i)

    # subsampling, then collecting the kept points bin by bin
    update_distance,update_similarity = [], []
    for list_points in bins_:
        if len(list_points) > number_per_interval:
            list_points = random.sample(list_points,number_per_interval)
        for i in list_points:
            update_distance.append(list_distance[i])
            update_similarity.append(list_similarity[i])

    return update_distance,update_similarity

In [38]:
def get_treatment_DBPedia(instance,X,treatment_path):
    """
    Reads the treatment value of an instance from the triples in X.

    - birthDate: first matching object, converted to int.
    - gender / genre: first matching object, as stored.
    - arwuW, countryName, endowment: read on the first university having the
      instance as hasForStudent object (int rank, subject of isCountryOf, or
      get_endowment() of the literal; 0 when the endowment cannot be read).

    Any other treatment_path yields None. A missing triple raises IndexError,
    except in the endowment lookup described above.
    """
    if treatment_path in ('http://xmlns.com/foaf/0.1/gender','http://dbpedia.org/ontology/genre'):
        return [t[2] for t in X if t[0] == instance and t[1] == treatment_path][0]

    if treatment_path == 'http://dbpedia.org/ontology/birthDate':
        birth_values = [t[2] for t in X if t[0] == instance and t[1] == treatment_path]
        return int(birth_values[0])

    university_paths = (
        'http://dbpedia.org/ontology/arwuW',
        'http://dbpedia.org/ontology/countryName',
        'http://dbpedia.org/ontology/endowment',
    )
    if treatment_path not in university_paths:
        return None

    universities_ = [t[0] for t in X if t[2] == instance and t[1] == 'http://dbpedia.org/ontology/hasForStudent']

    if treatment_path == 'http://dbpedia.org/ontology/arwuW':
        ranks_ = [t[2] for t in X if t[0] == universities_[0] and t[1] == 'http://dbpedia.org/ontology/arwuW']
        return int(ranks_[0])

    if treatment_path == 'http://dbpedia.org/ontology/countryName':
        countries_ = [t[0] for t in X if t[2] == universities_[0] and t[1] == 'http://dbpedia.org/ontology/isCountryOf']
        return countries_[0]

    endowments_ = [t[2] for t in X if t[0] == universities_[0] and t[1] == 'http://dbpedia.org/ontology/endowment']
    try:
        return get_endowment(endowments_[0])
    except:
        return 0

## 3.2 : Relation between distance and similarity metric

# 4 : Building the pairs of similar instances

- We mined the distance threshold, we can now create the pairs
- Showing the different strategies to obtain the pairs

In [39]:
print(f"There are {len(list_target_class_instances)} instances of the target class.")

There are 185 instances of the target class.


## 4.1 : Building the pairs of similar instances : No condition on the treatment

In [40]:
# building the similarity matrix
mode = 'mixed'
df_similarity,df_to_numpy = get_matrix_similarity_pairs(model,list_target_class_instances,mode=mode)

In [41]:
df_similarity.shape

(185, 185)

### 4.1.1 : Using the distance threshold

### 4.1.2 : Using the proportion of matched instances

In [42]:
proportion = 0.015
pairs_closer_instances = get_pairs_from_matrix_and_proportion(df_similarity,proportion=proportion)
print(f"{len(pairs_closer_instances)} pairs have been created.")

256 pairs have been created.


## 4.2 : Building the pairs of similar instances : Different treatment values

### 4.2.1 : Using the distance threshold

### 4.2.2 : Using the proportion of matched instances

# 5 : Computing the treatment effect

- Given the set of similar pairs, we aim to compute the treatment effect

### 5.1 : Getting treatment and outcome values

In [43]:
def get_outcome_DBPedia(instance,X):
    """
    Outcome of an instance: earliest releaseDate among the objects it is
    linked to through `author`, minus its own birthDate (both read as int).

    Raises IndexError when the instance has no birthDate triple and
    ValueError when no release date is found.
    """
    birth_ = [t[2] for t in X if t[0] == instance and t[1] == 'http://dbpedia.org/ontology/birthDate'][0]
    books_ = [t[2] for t in X if t[0] == instance and t[1] == 'http://dbpedia.org/ontology/author']

    release_dates = []
    for book_ in books_:
        for t in X:
            if t[0] == book_ and t[1]=='http://dbpedia.org/ontology/releaseDate':
                release_dates.append(t[2])

    first_release = min([int(date_) for date_ in release_dates])
    return first_release - int(birth_)

In [44]:
def get_endowment(endowment_text):
    """
    Parses an endowment literal into a float.

    '<mantissa>E<exponent>' (upper-case E) is evaluated as
    mantissa * 10**exponent; text without 'E' is passed to float().
    NOTE: this notebook defines get_endowment twice with the same logic; the
    definition executed last is the one in effect.
    """
    if 'E' in endowment_text:
        mantissa_text, exponent_text = endowment_text.split('E')[:2]
        return float(mantissa_text)*10**(int(exponent_text))
    return float(endowment_text)

In [45]:
def get_treatment_DBPedia(instance,X,treatment_path):
    """
    Reads the treatment value of an instance from the triples in X.

    instance : entity whose treatment is wanted.
    X : iterable of (subject, predicate, object) triples.
    treatment_path : property identifying the treatment.

    Returns
      - birthDate: the first matching object as int;
      - gender / genre: the first matching object, as stored;
      - arwuW: the int rank of the instance's university;
      - countryName: the subject of isCountryOf for that university;
      - endowment: get_endowment() of the university's endowment literal, or
        0 when it is missing or cannot be parsed;
      - None for any other treatment_path.
    The university is the first subject having the instance as hasForStudent
    object.

    Raises IndexError when a required triple is missing (all cases except the
    endowment literal itself).
    """
    def first_object(subject_,predicate_):
        # Object of the first (subject_, predicate_, ?) triple.
        return [x[2] for x in X if x[0] == subject_ and x[1] == predicate_][0]

    def university_of(student_):
        return [x[0] for x in X if x[2] == student_ and x[1] == 'http://dbpedia.org/ontology/hasForStudent'][0]

    if treatment_path == 'http://dbpedia.org/ontology/birthDate':
        return int(first_object(instance,treatment_path))
    if treatment_path in ('http://xmlns.com/foaf/0.1/gender','http://dbpedia.org/ontology/genre'):
        return first_object(instance,treatment_path)
    if treatment_path == 'http://dbpedia.org/ontology/arwuW':
        return int(first_object(university_of(instance),treatment_path))
    if treatment_path == 'http://dbpedia.org/ontology/countryName':
        uni = university_of(instance)
        return [x[0] for x in X if x[2] == uni and x[1] == 'http://dbpedia.org/ontology/isCountryOf'][0]
    if treatment_path == 'http://dbpedia.org/ontology/endowment':
        uni = university_of(instance)
        try:
            return get_endowment(first_object(uni,treatment_path))
        except Exception:
            # Missing or unparsable endowment. Catching Exception rather than
            # everything lets KeyboardInterrupt / SystemExit propagate.
            return 0
    return None

In [46]:
dic_treatment_path = {
    'birthDate':'http://dbpedia.org/ontology/birthDate',
    'gender':'http://xmlns.com/foaf/0.1/gender',
    'genre':'http://dbpedia.org/ontology/genre',
    'arwuW':'http://dbpedia.org/ontology/arwuW',
    'countryName':'http://dbpedia.org/ontology/countryName',
    'endowment':'http://dbpedia.org/ontology/endowment'
}

### 5.2 : Defining metrics : numerical and categorical

In [47]:
def get_numerical_metric(set_pairs,X,treatment_path):
    """
    Counts how the outcome moves with a numerical treatment over the pairs.

    Pairs in which either treatment value is not > 0 are ignored. For the
    others, the instance with the higher treatment is compared with the
    other one.

    Returns (m_m, m_e, m_l, e, ratio):
      m_m : higher treatment goes with higher outcome,
      m_e : different treatments, equal outcomes,
      m_l : higher treatment goes with lower outcome,
      e   : pairs with equal treatments,
      ratio : m_m/m_l, or 0 when m_l is 0.
    """
    m_m = m_e = m_l = 0
    same_treatment = 0

    for pair in set_pairs:
        value_t0 = get_treatment_DBPedia(pair[0],X,treatment_path)
        value_t1 = get_treatment_DBPedia(pair[1],X,treatment_path)

        if not (value_t0 > 0 and value_t1 > 0):
            continue

        value_o0 = get_outcome_DBPedia(pair[0],X)
        value_o1 = get_outcome_DBPedia(pair[1],X)

        if value_t0 == value_t1:
            same_treatment += 1
        elif value_o0 == value_o1:
            m_e += 1
        elif (value_t0 > value_t1) == (value_o0 > value_o1):
            # treatment and outcome differ in the same direction
            m_m += 1
        else:
            m_l += 1

    ratio_ = m_m/m_l if m_l > 0 else 0
    return m_m,m_e,m_l,same_treatment,ratio_

In [48]:
def get_categorical_metric(set_pairs,X,treatment_path,t0,t1):
    """
    Compares outcomes between instances with treatment t0 and instances with
    treatment t1, over the pairs.

    Only pairs made of one t0 instance and one t1 instance are compared; the
    outcome of the t0 instance is set against that of the t1 instance.

    Returns (T_m, T_e, T_l, others, ratio):
      T_m : the t0 instance has the higher outcome,
      T_e : both outcomes are equal,
      T_l : the t0 instance has the lower outcome,
      others : pairs that are not a (t0, t1) combination,
      ratio : T_m/T_l, or 0 when T_l is 0.
    """
    T_m = T_e = T_l = 0
    others_ = 0

    for pair in set_pairs:
        value_t0 = get_treatment_DBPedia(pair[0],X,treatment_path)
        value_t1 = get_treatment_DBPedia(pair[1],X,treatment_path)

        value_o0 = get_outcome_DBPedia(pair[0],X)
        value_o1 = get_outcome_DBPedia(pair[1],X)

        if value_t0 == t0 and value_t1 == t1:
            outcome_of_t0, outcome_of_t1 = value_o0, value_o1
        elif value_t0 == t1 and value_t1 == t0:
            outcome_of_t0, outcome_of_t1 = value_o1, value_o0
        else:
            others_ += 1
            continue

        if outcome_of_t0 > outcome_of_t1:
            T_m += 1
        elif outcome_of_t0 == outcome_of_t1:
            T_e += 1
        else:
            T_l += 1

    ratio_ = T_m/T_l if T_l > 0 else 0
    return T_m,T_e,T_l,others_,ratio_

### 5.3 : Getting treatment effects

In [49]:
columns = ['treatment','#_pairs','type','treatment_values','metric','#_rule','#_not_rule','#_same']
df_saving_rules = pd.DataFrame(columns=columns)

In [50]:
proportion = 0.02
pairs_closer_instances = get_pairs_from_matrix_and_proportion(df_similarity,proportion=proportion)
print(f"{len(pairs_closer_instances)} pairs have been created.")

341 pairs have been created.


#### 5.3.1 : Birth Date

In [51]:
treatment_path = 'http://dbpedia.org/ontology/birthDate'
m_m,m_e,m_l,e_,metric = get_numerical_metric(pairs_closer_instances,X,treatment_path)

In [52]:
# Record the gradual rule measured for the current treatment.
dic_rule = {
    'treatment': treatment_path.split('/')[-1],
    '#_pairs': len(pairs_closer_instances),
    'type': 'gradual',
    'treatment_values': 'higher_implies_higher',
    'metric': metric,
    '#_rule': m_m,
    '#_not_rule': m_l,
    '#_same': m_e,
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
df_saving_rules = pd.concat([df_saving_rules,pd.DataFrame([dic_rule])],ignore_index=True)

#### 5.3.2 : Gender

In [53]:
treatment_path = 'http://xmlns.com/foaf/0.1/gender'
t0 = 'male'
t1 = 'female'
T_m,T_e,T_l,not_T,metric = get_categorical_metric(pairs_closer_instances,X,treatment_path,t0,t1)

In [54]:
# Record the categorical rule measured for the current treatment (t0 vs t1).
dic_rule = {
    'treatment': treatment_path.split('/')[-1],
    '#_pairs': len(pairs_closer_instances),
    'type': 'categorical',
    'treatment_values': t0 + ' vs ' + t1,
    'metric': metric,
    '#_rule': T_m,
    '#_not_rule': T_l,
    '#_same': T_e,
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
df_saving_rules = pd.concat([df_saving_rules,pd.DataFrame([dic_rule])],ignore_index=True)

#### 5.3.3 : Genre

In [55]:
treatment_path = 'http://dbpedia.org/ontology/genre'
t0 = 'Fiction'
t1 = 'NonFiction'
T_m,T_e,T_l,not_T,metric = get_categorical_metric(pairs_closer_instances,X,treatment_path,t0,t1)

In [56]:
# Record the categorical rule measured for the current treatment (t0 vs t1).
dic_rule = {
    'treatment': treatment_path.split('/')[-1],
    '#_pairs': len(pairs_closer_instances),
    'type': 'categorical',
    'treatment_values': t0 + ' vs ' + t1,
    'metric': metric,
    '#_rule': T_m,
    '#_not_rule': T_l,
    '#_same': T_e,
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
df_saving_rules = pd.concat([df_saving_rules,pd.DataFrame([dic_rule])],ignore_index=True)

#### 5.3.4 : arwuW

In [57]:
treatment_path = 'http://dbpedia.org/ontology/arwuW'
m_m,m_e,m_l,e_,metric = get_numerical_metric(pairs_closer_instances,X,treatment_path)

In [58]:
# Record the gradual rule measured for the current treatment.
dic_rule = {
    'treatment': treatment_path.split('/')[-1],
    '#_pairs': len(pairs_closer_instances),
    'type': 'gradual',
    'treatment_values': 'higher_implies_higher',
    'metric': metric,
    '#_rule': m_m,
    '#_not_rule': m_l,
    '#_same': m_e,
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
df_saving_rules = pd.concat([df_saving_rules,pd.DataFrame([dic_rule])],ignore_index=True)

#### 5.3.5 : countryName

In [59]:
list(np.unique([x[2] for x in X if x[1]=='http://dbpedia.org/ontology/countryName']))

['Asia', 'Canada', 'England', 'Nan', 'U.S.']

In [60]:
treatment_path = 'http://dbpedia.org/ontology/countryName'
t0 = 'England'
t1 = 'U.S.'
T_m,T_e,T_l,not_T,metric = get_categorical_metric(pairs_closer_instances,X,treatment_path,t0,t1)

In [61]:
# Record the categorical rule measured for the current treatment (t0 vs t1).
dic_rule = {
    'treatment': treatment_path.split('/')[-1],
    '#_pairs': len(pairs_closer_instances),
    'type': 'categorical',
    'treatment_values': t0 + ' vs ' + t1,
    'metric': metric,
    '#_rule': T_m,
    '#_not_rule': T_l,
    '#_same': T_e,
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
df_saving_rules = pd.concat([df_saving_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
t0 = 'U.S.'
t1 = 'Canada'
T_m,T_e,T_l,not_T,metric = get_categorical_metric(pairs_closer_instances,X,treatment_path,t0,t1)

In [None]:
# Record the categorical rule measured for the current treatment (t0 vs t1).
dic_rule = {
    'treatment': treatment_path.split('/')[-1],
    '#_pairs': len(pairs_closer_instances),
    'type': 'categorical',
    'treatment_values': t0 + ' vs ' + t1,
    'metric': metric,
    '#_rule': T_m,
    '#_not_rule': T_l,
    '#_same': T_e,
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
df_saving_rules = pd.concat([df_saving_rules,pd.DataFrame([dic_rule])],ignore_index=True)

#### 5.3.6 : endowment

In [None]:
treatment_path = 'http://dbpedia.org/ontology/endowment'
m_m,m_e,m_l,e_,metric = get_numerical_metric(pairs_closer_instances,X,treatment_path)

In [None]:
# Record the gradual rule measured for the current treatment.
dic_rule = {
    'treatment': treatment_path.split('/')[-1],
    '#_pairs': len(pairs_closer_instances),
    'type': 'gradual',
    'treatment_values': 'higher_implies_higher',
    'metric': metric,
    '#_rule': m_m,
    '#_not_rule': m_l,
    '#_same': m_e,
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
df_saving_rules = pd.concat([df_saving_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
df_saving_rules

### 5.4 : Adding confidence interval

In [None]:
def compute_interval(value_rule,value_not_rule,stat_param=1.96):
    """
    Ratio value_rule/value_not_rule with an interval computed on the log scale.

    The bounds are exp(log(ratio) -/+ stat_param*sqrt(1/value_rule + 1/value_not_rule)).

    Returns (ratio, [low, high]), each rounded to 3 decimals, or (None, None)
    when either count is not > 0.
    """
    if not (value_rule > 0 and value_not_rule > 0):
        return None,None

    ratio_ = value_rule/value_not_rule
    log_ratio = math.log(ratio_)
    half_width = stat_param*math.sqrt((1/value_rule)+(1/value_not_rule))

    low_ = round(math.exp(log_ratio - half_width),3)
    high_ = round(math.exp(log_ratio + half_width),3)
    return round(ratio_,3), [low_,high_]

In [None]:
# Add the bounds of the interval around each rule's metric.
list_metric_low_value = []
list_metric_high_value = []
stat_param = 1.28

for index, row in df_saving_rules.iterrows():
    metric_,metric_IC = compute_interval(row['#_rule'],row['#_not_rule'],stat_param=stat_param)
    # compute_interval signals "no interval" with None. Test for that
    # explicitly: a ratio that rounds to 0.0 is falsy but has a valid interval.
    if metric_ is not None:
        list_metric_low_value.append(metric_IC[0])
        list_metric_high_value.append(metric_IC[1])
    else:
        # Neutral bounds: such a rule is selected neither as "higher" nor as "lower".
        list_metric_low_value.append(1)
        list_metric_high_value.append(1)
        
df_saving_rules['metric_IC_low'] = list_metric_low_value
df_saving_rules['metric_IC_high'] = list_metric_high_value

In [None]:
df_saving_rules

In [None]:
rules_treatment_higher = df_saving_rules[df_saving_rules['metric_IC_low']>1]

In [None]:
# For each treatment keep the rule obtained with the fewest pairs.
df_select = []
for treatment in list(np.unique(rules_treatment_higher['treatment'])):
    df_ = rules_treatment_higher[rules_treatment_higher['treatment']==treatment]
    df_ = df_.sort_values(["#_pairs"],ascending=True)
    df_select.append(df_[0:1][:])

# pd.concat raises ValueError on an empty list, which happens whenever no rule
# qualifies; fall back to an empty frame with the same columns.
df_rule_higher = pd.concat(df_select) if df_select else rules_treatment_higher.iloc[0:0]

In [None]:
rules_treatment_lower = df_saving_rules[df_saving_rules['metric_IC_high']<1]

In [None]:
# For each treatment keep the rule obtained with the fewest pairs.
df_select = []
for treatment in list(np.unique(rules_treatment_lower['treatment'])):
    df_ = rules_treatment_lower[rules_treatment_lower['treatment']==treatment]
    df_ = df_.sort_values(["#_pairs"],ascending=True)
    df_select.append(df_[0:1][:])

# pd.concat raises ValueError on an empty list, which happens whenever no rule
# qualifies; fall back to an empty frame with the same columns.
df_rule_lower = pd.concat(df_select) if df_select else rules_treatment_lower.iloc[0:0]

In [None]:
df_rule_higher

In [None]:
df_rule_lower

### 5.5 : Number of Pairs Explained

In [None]:
def get_if_explained(pair,df_rule_high,df_rule_low,dic_treatment_path):
    """
    Tells whether at least one selected rule accounts for the outcome
    difference observed in a pair.

    pair : the two instances (pair[0] and pair[1] are read).
    df_rule_high : rules whose metric interval lies above 1.
    df_rule_low : rules whose metric interval lies below 1.
    dic_treatment_path : maps the 'treatment' column to the property path.

    A 'gradual' rule explains the pair when treatment and outcome differ in
    the same direction (high rules) or in opposite directions (low rules).
    Any other rule is read as 't0 vs t1' from 'treatment_values': it explains
    a pair made of one t0 and one t1 instance when the t0 instance has the
    higher outcome (high rules) or the lower outcome (low rules).

    Returns 1 when some rule explains the pair, 0 otherwise.
    NOTE: the triples are read from the notebook-level variable X.
    """
    outcome_i0 = get_outcome_DBPedia(pair[0],X)
    outcome_i1 = get_outcome_DBPedia(pair[1],X)

    def rule_explains(row,t0_expected_higher):
        treatment_path = dic_treatment_path[row['treatment']]
        value_t0 = get_treatment_DBPedia(pair[0],X,treatment_path)
        value_t1 = get_treatment_DBPedia(pair[1],X,treatment_path)

        if row['type'] == 'gradual':
            if t0_expected_higher:
                return ((outcome_i0 > outcome_i1 and value_t0 > value_t1)
                        or (outcome_i0 < outcome_i1 and value_t0 < value_t1))
            return ((outcome_i0 > outcome_i1 and value_t0 < value_t1)
                    or (outcome_i0 < outcome_i1 and value_t0 > value_t1))

        rule_values = row['treatment_values'].split(' vs ')
        t0_rule = rule_values[0]
        t1_rule = rule_values[1]

        if value_t0 == t0_rule and value_t1 == t1_rule:
            outcome_of_t0, outcome_of_t1 = outcome_i0, outcome_i1
        elif value_t0 == t1_rule and value_t1 == t0_rule:
            outcome_of_t0, outcome_of_t1 = outcome_i1, outcome_i0
        else:
            return False

        if t0_expected_higher:
            return outcome_of_t0 > outcome_of_t1
        return outcome_of_t0 < outcome_of_t1

    # Categorical rules are evaluated in both tables; they used to be checked
    # for df_rule_low only, so a categorical rule above 1 never explained a pair.
    for item, row in df_rule_high.iterrows():
        if rule_explains(row,True):
            return 1

    for item, row in df_rule_low.iterrows():
        if rule_explains(row,False):
            return 1

    return 0

In [None]:
list_pairs = list(itertools.combinations(list_target_class_instances,2))

In [None]:
sampling_parameter = 1000
# random.sample raises ValueError when asked for more items than available, so cap the sample size.
sample_pairs = random.sample(list_pairs,min(sampling_parameter,len(list_pairs)))

In [None]:
explained = 0
for pair in sample_pairs:
    explained += get_if_explained(pair,df_rule_higher,df_rule_lower,dic_treatment_path)

In [None]:
print("Percentage of pairs explained : ",round(explained*100/len(sample_pairs),3))

Note:
- with too few instances, very few rules are found because the confidence intervals are wide
- with too many instances, rules are found, but from pairs that are only weakly similar

# 6 : Comparison to K-CAP

In [None]:
dic_treatment_path = {
    'birthDate':'http://dbpedia.org/ontology/birthDate',
    'gender':'http://xmlns.com/foaf/0.1/gender',
    'genre':'http://dbpedia.org/ontology/genre',
    'arwuW':'http://dbpedia.org/ontology/arwuW',
    'countryName':'http://dbpedia.org/ontology/countryName',
    'endowment':'http://dbpedia.org/ontology/endowment'
}

### 6.1 : Defining the set of K-CAP rules

In [None]:
columns = ['treatment','treatment_type','t0_value','t1_value','strata']
df_KCAP_rules = pd.DataFrame(columns=columns)

In [None]:
# K-CAP rule: gradual effect of arwuW within the stratum below.
dic_rule = {
    'treatment': 'arwuW',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_older',
    't1_value': 'lower_publish_younger',
    'strata': {
        'genre':['Fiction'],
        'gender':['male'],
        'birthDate':(1800,1934),
        'countryName':['England','U.S.']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of arwuW within the stratum below.
dic_rule = {
    'treatment': 'arwuW',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_older',
    't1_value': 'lower_publish_younger',
    'strata': {
        'genre':['Fiction'],
        'gender':['male'],
        'birthDate':(1800,2000),
        'countryName':['U.S.']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of arwuW within the stratum below.
dic_rule = {
    'treatment': 'arwuW',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_older',
    't1_value': 'lower_publish_younger',
    'strata': {
        'genre':['Fiction'],
        'birthDate':(1935,1959),
        'countryName':['U.S.']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of arwuW within the stratum below.
dic_rule = {
    'treatment': 'arwuW',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_younger',
    't1_value': 'lower_publish_older',
    'strata': {
        'genre':['NonFiction'],
        'gender':['male'],
        'birthDate':(1935,1959),
        'countryName':['U.S.']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of birthDate within the stratum below.
dic_rule = {
    'treatment': 'birthDate',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_older',
    't1_value': 'lower_publish_younger',
    'strata': {
        'arwuW':(1,100),
        'genre':['Fiction'],
        'gender':['male'],
        'countryName':['England']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of birthDate within the stratum below.
dic_rule = {
    'treatment': 'birthDate',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_younger',
    't1_value': 'lower_publish_older',
    'strata': {
        'arwuW':(1,100),
        'genre':['Fiction'],
        'countryName':['U.S.']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of birthDate within the stratum below.
dic_rule = {
    'treatment': 'birthDate',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_younger',
    't1_value': 'lower_publish_older',
    'strata': {
        'arwuW':(1,100),
        'gender':['male'],
        'countryName':['U.S.']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of birthDate within the stratum below.
dic_rule = {
    'treatment': 'birthDate',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_younger',
    't1_value': 'lower_publish_older',
    'strata': {
        'genre':['Fiction'],
        'countryName':['U.S.']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: gradual effect of birthDate within the stratum below.
dic_rule = {
    'treatment': 'birthDate',
    'treatment_type': 'gradual',
    't0_value': 'higher_publish_younger',
    't1_value': 'lower_publish_older',
    'strata': {
        'arwuW':(101,600),
        'genre':['Fiction'],
        'gender':['male'],
        'countryName':['U.S.','Canada','England']
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: categorical effect of genre within the stratum below.
dic_rule = {
    'treatment': 'genre',
    # Must be spelled 'categorical': that is the value the rule checker
    # compares against ('categorial' sent the rule down the gradual branch).
    'treatment_type': 'categorical',
    't0_value': 'Fiction',
    't1_value': 'NonFiction',
    'strata': {
        'arwuW':(1,100),
        'gender':['male'],
        'countryName':['U.S.'],
        'birthDate':[(1800,1934),(1960,2000)]
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: categorical effect of gender within the stratum below.
dic_rule = {
    'treatment': 'gender',
    # Must be spelled 'categorical': that is the value the rule checker
    # compares against ('categorial' sent the rule down the gradual branch).
    'treatment_type': 'categorical',
    't0_value': 'male',
    't1_value': 'female',
    'strata': {
        'arwuW':(1,100),
        'genre':['Fiction'],
        'countryName':['U.S.'],
        'birthDate':(1800,2000)
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: categorical effect of gender within the stratum below.
dic_rule = {
    'treatment': 'gender',
    # Must be spelled 'categorical': that is the value the rule checker
    # compares against ('categorial' sent the rule down the gradual branch).
    'treatment_type': 'categorical',
    't0_value': 'male',
    't1_value': 'female',
    'strata': {
        'genre':['Fiction'],
        'countryName':['U.S.'],
        'birthDate':(1935, 1959)
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
# K-CAP rule: categorical effect of gender within the stratum below.
dic_rule = {
    'treatment': 'gender',
    # Must be spelled 'categorical': that is the value the rule checker
    # compares against ('categorial' sent the rule down the gradual branch).
    'treatment_type': 'categorical',
    't0_value': 'female',
    't1_value': 'male',
    'strata': {
        'arwuW':(101,600),
        'genre':['Fiction'],
        'countryName':['U.S.'],
        'birthDate':[(1800,1934),(1960,2000)]
    },
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_KCAP_rules = pd.concat([df_KCAP_rules,pd.DataFrame([dic_rule])],ignore_index=True)

In [None]:
df_KCAP_rules.to_csv('KCAP_Rules.csv',index=False)

In [None]:
def get_if_treatment_outcome_pair_in_rule(row_df,pair_,X,dic_treatment_path):
    """Tell whether the treatment/outcome ordering of a pair matches a rule.

    Parameters
    ----------
    row_df : mapping-like row of the rule table. The keys 'treatment',
        'treatment_type', 't0_value' and 't1_value' are read.
    pair_ : indexable holding the two entities to compare (positions 0 and 1).
    X : knowledge-graph data, forwarded unchanged to get_outcome_DBPedia and
        get_treatment_DBPedia.
    dic_treatment_path : mapping from a treatment name to the path handed to
        get_treatment_DBPedia.

    Returns
    -------
    bool
        Categorical rule: True when one entity carries 't0_value', the other
        carries 't1_value', and the entity carrying 't0_value' has the
        strictly smaller outcome. False when the pair does not carry exactly
        (t0_value, t1_value) in either order.
        Any other treatment type (gradual): when 't0_value' is
        'higher_publish_older', True if treatment and outcome are strictly
        ordered the same way within the pair; for any other 't0_value', True
        if they are strictly ordered in opposite ways.
    """
    # The rule table stores the type as 'categorial' while this function used
    # to test only for 'categorical'; categorical rules therefore fell through
    # to the gradual branch and were scored on string ordering of the
    # treatment values. Both spellings are accepted.
    categorical_spellings = ('categorical', 'categorial')

    outcome_0 = get_outcome_DBPedia(pair_[0],X)
    outcome_1 = get_outcome_DBPedia(pair_[1],X)

    treatment_name = row_df['treatment']
    treatment_path = dic_treatment_path[treatment_name]
    treatment_value_0 = get_treatment_DBPedia(pair_[0],X,treatment_path)
    treatment_value_1 = get_treatment_DBPedia(pair_[1],X,treatment_path)

    if row_df['treatment_type'] in categorical_spellings:
        t0_value = row_df['t0_value']
        t1_value = row_df['t1_value']
        if treatment_value_0 == t0_value and treatment_value_1 == t1_value:
            return bool(outcome_0 < outcome_1)
        if treatment_value_0 == t1_value and treatment_value_1 == t0_value:
            # Same rule seen with the pair in the opposite order.
            return bool(outcome_1 < outcome_0)
        return False

    # Gradual treatment: only the direction of the two orderings matters.
    # Short-circuit evaluation is kept so that outcomes are only compared
    # when the treatment values are strictly ordered.
    if row_df['t0_value'] == 'higher_publish_older':
        return bool(
            (treatment_value_0 > treatment_value_1 and outcome_0 > outcome_1)
            or (treatment_value_0 < treatment_value_1 and outcome_0 < outcome_1)
        )
    return bool(
        (treatment_value_0 > treatment_value_1 and outcome_0 < outcome_1)
        or (treatment_value_0 < treatment_value_1 and outcome_0 > outcome_1)
    )

In [None]:
def get_if_pair_in_strata(row_df,pair_,X,dic_treatment_path):
    """Check that both entities of a pair satisfy every stratum of a rule.

    For each (property, constraint) entry of row_df['strata'], the property
    value of both entities is fetched with get_treatment_DBPedia:

    - 'countryName' values are first reduced to the text after the last '/'.
    - 'arwuW' constraints are a (low, high) pair; both values must lie within
      the closed interval.
    - 'birthDate' constraints are either a single (low, high) pair, handled
      like 'arwuW', or a list of such pairs, in which case both values must
      fall inside the same interval of the list.
    - any other property: both values must be members of the constraint.

    Returns True when no stratum rejects the pair (including when the strata
    mapping is empty), False otherwise.
    """
    def outside(value, bounds):
        return value < bounds[0] or value > bounds[1]

    def inside(value, bounds):
        return value >= bounds[0] and value <= bounds[1]

    for prop_name, strata_value in row_df['strata'].items():
        path = dic_treatment_path[prop_name]
        value_0 = get_treatment_DBPedia(pair_[0], X, path)
        value_1 = get_treatment_DBPedia(pair_[1], X, path)

        if prop_name == 'countryName':
            # keep only the last path segment of the value
            value_0 = value_0.split('/')[-1]
            value_1 = value_1.split('/')[-1]

        if prop_name == 'arwuW':
            if outside(value_0, strata_value) or outside(value_1, strata_value):
                return False
        elif prop_name == 'birthDate':
            if type(strata_value) is list:
                # several intervals: the pair must share at least one of them
                shares_interval = any(
                    inside(value_0, b_interval) and inside(value_1, b_interval)
                    for b_interval in strata_value
                )
                if not shares_interval:
                    return False
            elif outside(value_0, strata_value) or outside(value_1, strata_value):
                return False
        else:
            if not (value_0 in strata_value and value_1 in strata_value):
                return False

    return True

In [None]:
def get_if_pair_explained_kCAP(pair_,df_,X,dic_treatment_path):
    """Return 1 if at least one rule of df_ explains the pair, else 0.

    A rule (one row of df_) explains the pair when
    get_if_treatment_outcome_pair_in_rule is truthy for it and, in that case,
    get_if_pair_in_strata is truthy as well. Rows are scanned in iteration
    order and the scan stops at the first rule that explains the pair.
    """
    explained = any(
        # `and` keeps the strata check from running when the rule does not match
        get_if_treatment_outcome_pair_in_rule(rule_row, pair_, X, dic_treatment_path)
        and get_if_pair_in_strata(rule_row, pair_, X, dic_treatment_path)
        for _, rule_row in df_.iterrows()
    )
    return 1 if explained else 0

In [None]:
# Estimate, on a random sample of pairs, the share of pairs explained by at
# least one rule of df_KCAP_rules.
number_test = 3000
# random.sample raises ValueError when asked for more items than the
# population holds, so the sample is capped at the number of available pairs.
sample_size = min(number_test, len(list_pairs))
number_explained = 0
for pair_ in random.sample(list_pairs,sample_size):
    if get_if_pair_explained_kCAP(pair_,df_KCAP_rules,X,dic_treatment_path):
        number_explained += 1
# Divide by the number of pairs actually drawn; nan when there is no pair.
print(number_explained/sample_size if sample_size else float('nan'))

Dernier point : robustesse des données

Comment les règles varient-elles si l'on retire des données pendant l'entraînement ?
- facile à tester pour les embeddings
- reprendre le notebook pour KCAP