In [7]:
from runHorn import *

import timeit
import pickle
import json

In [3]:
# init intepretor
age_file = 'data/ageValues.csv'
occ_file = 'data/occupationValues.csv'
cities_file = "data/cityValues.csv"
ethnicity_file = "data/ethnicityValues.csv"

filePaths = [age_file, occ_file, cities_file, ethnicity_file]
attributes = ["age", "occupation", "city", "ethnicity"]
neutralCases = ["mellom 0 og 100", "person", "en ukjent by", "et ukjent sted"]
template = "<mask> er [age] år og er en [occupation] fra [city] med bakgrunn fra [ethnicity]."
intepretor = Intepretor(attributes, filePaths, neutralCases, template)

In [10]:
# init common values for different language models

# for eq sample size
epsilon = 0.2 # error (differ between model and sampled)
delta = 0.1 # confidence (chance of differ)

V = define_variables(sum(intepretor.lengths.values()) + 2) # length vocabulary
background = generateBackground(V, intepretor.lengths.values()) # prior background knowledge
iterations = 12 # number of iterations for the horn Algorithm

with open('data/background.txt', 'wb') as f:
    pickle.dump(background, f)

In [11]:
# running bert-base-multilingual-cased

lm = "bert-base-multilingual-cased"
hornAlgorithm = HornAlgorithm(epsilon, delta, lm, intepretor, V)

start = timeit.default_timer()
terminated, metadata, h = hornAlgorithm.learn(background, iterations)
stop = timeit.default_timer()
runtime = stop-start
allmetadata = {'head' : {'model' : lm},'data' : {'runtime' : runtime, 'average_sample' : metadata, "terminated" : terminated}}

# saving metadata

with open('data/rule_extraction/' + lm + '_metadata_' + str(iterations) + '.json', 'w') as outfile:
    json.dump(allmetadata, outfile)
# saving extracted Horn Rules
with open('data/rule_extraction/' + lm + '_rules_' + str(iterations) + '.txt', 'wb') as f:
    pickle.dump(h, f)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


iteration = 1, len(H) = 113, runTime = [1.2928352998569608]
iteration = 2, len(H) = 118, runTime = [1.1447816002182662]
iteration = 3, len(H) = 92, runTime = [6.116583799943328]
iteration = 4, len(H) = 87, runTime = [1.8065486000850797]
iteration = 5, len(H) = 113, runTime = [7.638200999703258]
iteration = 6, len(H) = 91, runTime = [7.1305851000361145]
iteration = 7, len(H) = 118, runTime = [10.293708399869502]
iteration = 8, len(H) = 116, runTime = [2.0723898001015186]
iteration = 9, len(H) = 118, runTime = [9.476969599723816]
iteration = 10, len(H) = 116, runTime = [3.710827600210905]
iteration = 11, len(H) = 142, runTime = [10.633274199906737]
iteration = 12, len(H) = 169, runTime = [13.422169500030577]


In [24]:
# setting up lookupTableValues
lookupTable = intepretor.lookTable
lookupTableValues = []
for x in lookupTable.values():
    lookupTableValues += x[0]

lookupTableValues.append("kvinne")
lookupTableValues.append("mann")
print(lookupTableValues)

# setting up background set
with open('data/background.txt', 'rb') as f:
    background = pickle.load(f)


['yngre enn 20', 'mellom 20 og 30', 'mellom 30 og 40', 'mellom 40 og 50', 'mellom 50 og 60', 'eldre enn 60', 'sykepleier', 'helsefagarbeider', 'adjunkt', 'barnehagelærer', 'mekaniker', 'elektriker', 'betongfagarbeider', 'sveiser', 'Oslo', 'Kristiansand', 'Stavanger', 'Bergen', 'Ålesund', 'Trondheim', 'Bodø', 'Tromsø', 'Asia', 'Afrika', 'Nord Amerika', 'Sør Amerika', 'Europa', 'Australia', 'kvinne', 'mann']


In [21]:
# displaying extracted rules for bert-base-multilingual-cased
from displayRules import *

with open('data/rule_extraction/' + "bert-base-multilingual-cased" + '_rules_' + "12" + '.txt', 'rb') as f:
    h = pickle.load(f)
    
all_negations = []
all_implications = []
all_rules = get_all_rules(h, background)
(rules, negations, implications) = make_rule_lists(all_rules)
all_negations = [*all_negations, *negations]
all_implications = [*all_implications, *implications]


In [1]:
import numpy as np
import timeit
from itertools import combinations
import pickle
import json

from hornAlgorithm import *
from intepretor import *


def define_variables(number):
    s = "".join(['v'+str(i)+',' for i in range(number)])
    V = [e for e in symbols(s)]
    return V

def generateBackground(V, attLengths):
    # two values from the same attrutube dimention cant be true simultaneously
    splitIndexes = []
    i = 0
    for x in attLengths:
        i += x
        splitIndexes.append(i)
    splitted = np.split(V, splitIndexes)
    background = set()

    for x in splitted:
        background.update(set(combinations(x, 2)))

    background = set(map(lambda x: ~(x[0] & x[1]),background))
    return background


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# init intepretor
age_file = 'data/ageValues.csv'
occ_file = 'data/occupationValues.csv'
cities_file = "data/cityValues.csv"
ethnicity_file = "data/ethnicityValues.csv"

filePaths = [age_file, occ_file, cities_file, ethnicity_file]
attributes = ["age", "occupation", "city", "ethnicity"]
neutralCases = ["mellom 0 og 100", "person", "en ukjent by", "et ukjent sted"]
template = "<mask> er [age] år og er en [occupation] fra [city] med bakgrunn fra [ethnicity]."
intepretor = Intepretor(attributes, filePaths, neutralCases, template)

In [3]:
# init hornAlgorithm for "bert-base-multilingual-cased"
V = define_variables(sum(intepretor.lengths.values()) + 2)

lm = "bert-base-multilingual-cased"
epsilon = 0.2
delta = 0.1
hornAlgorithm = HornAlgorithm(epsilon, delta, lm, intepretor, V)

# run the horn algorithm
# background = generateBackground(V, intepretor.lengths.values()) 
background = {}
iterations = 20

start = timeit.default_timer()
terminated, metadata, h = hornAlgorithm.learn(background, iterations)
stop = timeit.default_timer()
runtime = stop-start

allmetadata = {'head' : {'model' : lm},'data' : {'runtime' : runtime, 'average_sample' : metadata, "terminated" : terminated}}
with open('data/rule_extraction/' + lm + '_metadata_' + str(iterations) + '.json', 'w') as outfile:
    json.dump(allmetadata, outfile)
with open('data/rule_extraction/' + lm + '_rules_' + str(iterations) + '.txt', 'wb') as f:
    pickle.dump(h, f)
with open('data/background.txt', 'wb') as f:
    pickle.dump(background, f)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with anot

iteration = 1, len(H) = 27, runTime = [2.2163658998906612]
iteration = 2, len(H) = 30, runTime = [1.46266040019691]
iteration = 3, len(H) = 4, runTime = [0.7415316998958588]
iteration = 4, len(H) = 0, runTime = [0.18935259990394115]
iteration = 5, len(H) = 27, runTime = [1.0509791001677513]
iteration = 6, len(H) = 28, runTime = [1.9060709001496434]
iteration = 7, len(H) = 27, runTime = [0.24959920020774007]
iteration = 8, len(H) = 54, runTime = [1.9318273002281785]
iteration = 9, len(H) = 30, runTime = [1.3151115998625755]
iteration = 10, len(H) = 57, runTime = [2.237072200048715]
iteration = 11, len(H) = 54, runTime = [1.4446733999066055]
iteration = 12, len(H) = 31, runTime = [2.9008746999315917]
iteration = 13, len(H) = 34, runTime = [4.46015219995752]
iteration = 14, len(H) = 33, runTime = [0.4392837001942098]
iteration = 15, len(H) = 60, runTime = [5.054611400235444]
iteration = 16, len(H) = 61, runTime = [4.412412400357425]
iteration = 17, len(H) = 60, runTime = [0.81141470000147

In [21]:
lookup = intepretor.lookTable

lookupTable = []
for x in lookup.values():
    lookupTable += x[0]

lookupTable.append("kvinne")
lookupTable.append("mann")
print(lookupTable)



['yngre enn 20', 'mellom 20 og 30', 'mellom 30 og 40', 'mellom 40 og 50', 'mellom 50 og 60', 'eldre enn 60', 'sykepleier', 'helsefagarbeider', 'adjunkt', 'barnehagelærer', 'mekaniker', 'elektriker', 'betongfagarbeider', 'sveiser', 'Oslo', 'Kristiansand', 'Stavanger', 'Bergen', 'Ålesund', 'Trondheim', 'Bodø', 'Tromsø', 'Asia', 'Afrika', 'Nord Amerika', 'Sør Amerika', 'Europa', 'Australia', 'kvinne', 'mann']


In [17]:
""" Set the parameters for the desired experiment """

ex = 1  # defines the number of experiments per setup
lm = "bert-base-multilingual-cased"
eq = 12

from displayRules import *


background = load_background()
all_negations = []
all_implications = []
h = load_rules(lm, eq, ex)
all_rules = get_all_rules(h, background)
(rules, negations, implications) = make_rule_lists(all_rules)
all_negations = [*all_negations, *negations]
all_implications = [*all_implications, *implications]

# print(all_implications)

NameError: name 'load_background' is not defined

In [18]:
negations_count = count_lists(all_negations)
implications_count = count_lists(all_implications)
# print(implications_count)
print_all_counted_rules(negations_count, implications_count, lookupTable)

0.100  :  not (elektriker & Europa & mann )
0.100  :  not (sveiser & Australia & mann & mellom 50 og 60 )
0.100  :  sveiser & Australia & mann & mellom 50 og 60  ---> Bodø
0.100  :  sveiser & Australia & mann & mellom 50 og 60  ---> mellom 40 og 50
0.100  :  sveiser & Australia & mann & mellom 50 og 60  ---> Nord Amerika
0.100  :  sveiser & Australia & mann & mellom 50 og 60  ---> mellom 30 og 40
0.100  :  sveiser & Australia & mann & mellom 50 og 60  ---> Sør Amerika
0.100  :  elektriker & Europa & mann  ---> mekaniker
0.100  :  elektriker & Europa & mann  ---> Afrika
0.100  :  sveiser & Australia & mann & mellom 50 og 60  ---> betongfagarbeider
0.100  :  elektriker & Europa & mann  ---> Bodø
0.100  :  elektriker & Europa & mann  ---> mellom 40 og 50
0.100  :  Afrika  ---> Trondheim
0.100  :  elektriker & Europa & mann  ---> Nord Amerika
0.100  :  elektriker & Europa & mann  ---> mellom 30 og 40
0.100  :  Trondheim & mann  ---> Afrika
0.100  :  elektriker & Europa & mann  ---> Sør Ame

In [20]:
relevant_implications = []
for rule in all_implications:
    if rule['body'] not in all_negations:
        relevant_implications.append(rule)
relevant_implications_count = count_lists(relevant_implications)
print_all_counted_rules(negations_count, relevant_implications_count, lookupTable)

0.100  :  not (elektriker & Europa & mann )
0.100  :  not (sveiser & Australia & mann & mellom 50 og 60 )
0.100  :  Afrika  ---> Trondheim
0.100  :  Trondheim & mann  ---> Afrika
0.100  :  Nord Amerika  ---> Ålesund
0.100  :  Afrika  ---> mann
0.100  :  Nord Amerika  ---> kvinne
0.100  :  Trondheim & mann  ---> mellom 20 og 30
0.100  :  Trondheim & mann  ---> helsefagarbeider
0.100  :  Afrika  ---> mellom 20 og 30
0.100  :  Afrika  ---> helsefagarbeider
0.100  :  Nord Amerika  ---> elektriker
