In [None]:
# Fetch the DomiKnowS sources and move into the checkout.
!git clone https://github.com/HLR/DomiKnowS.git
%cd DomiKnowS
# Check out the remote-tracking ref origin/Tasks (this leaves the clone on a detached HEAD).
!git checkout origin/Tasks
# Install the DomiKnowS package into the environment (no version is pinned here).
!pip install DomiKnowS

import logging
logging.basicConfig(level=logging.INFO)

# A notebook kernel's __main__ has no __file__; give it one explicitly.
# NOTE(review): presumably needed because library code looks up the main script's path - confirm.
import __main__
__main__.__file__="beliefbank.py"

# Enter the task directory; the later `reader` and `utils` imports presumably resolve from here.
%cd beliefe_bank
!pip install transformers

Importing specific classes and functions from various libraries

In [8]:
from domiknows.program.lossprogram import SampleLossProgram
import torch,argparse,sys
from transformers import AdamW
from domiknows.program.loss import NBCrossEntropyLoss, BCEWithLogitsIMLoss
from domiknows.program.metric import MacroAverageTracker, PRF1Tracker, MetricTracker, CMWithLogitsMetric, DatanodeCMMetric
import logging
from reader import read_data
from domiknows.graph import Graph, Concept, Relation, ifL, andL, notL, existsL
from domiknows.program.lossprogram import PrimalDualProgram
from domiknows.sensor.pytorch import ModuleLearner
from domiknows.sensor.pytorch.relation_sensors import CompositionCandidateSensor
from domiknows.sensor.pytorch.sensors import ReaderSensor, JointSensor, FunctionalSensor
from utils import Generator, make_facts, label_reader, RobertaTokenizer, BBRobert,SimpleTokenizer
from domiknows.program import SolverPOIProgram, IMLProgram
from domiknows.program.model.pytorch import SolverModel, IMLModel

Setting the run parameters. Instead of parsing command-line arguments, the notebook stores the values as attributes of a small `Args` object.


In [19]:
class Args:
    """Container for the notebook's run parameters.

    Every option is stored as a plain instance attribute so the rest of the
    notebook can read it as ``args.<option>``.
    """

    def __init__(self):
        defaults = dict(
            cuda_number=0,        # index used to build the "cuda:<n>" device string
            cur_epoch=5,          # passed to program.train as train_epoch_num
            samplenum=15,         # passed to read_data as sample_size
            simple_model=True,    # True: linear learner on "emb"; False: BBRobert learner
            primaldual=False,     # select PrimalDualProgram
            IML=False,            # select IMLProgram
            SAM=False,            # select SampleLossProgram
            batch_size=64,        # passed to read_data as batch_size
            beta=0.1,             # passed to the programs as beta (lmbd for the IML loss)
            learning_rate=2e-4,   # learning rate given to the optimizer
        )
        for option, value in defaults.items():
            setattr(self, option, value)


args = Args()

# Request INFO-level logging on the root logger.
# logging.basicConfig was already called in the setup cell; per the standard library
# documentation a repeated call does nothing once the root logger has handlers.
logging.basicConfig(level=logging.INFO)

Loading the data and partitioning it into training and validation sets


In [20]:
# Read the calibration and silver data together with the two constraint lookups
# (constraints_yes / constraints_no) used by the pair helpers defined further down.
calibration_data, silver_data, constraints_yes, constraints_no = read_data(
    batch_size=args.batch_size,
    sample_size=args.samplenum,
)

# Keep the first three quarters for training and the remainder for validation.
# The dev slice must be taken before calibration_data is rebound to the train slice.
train_size = (len(calibration_data) * 3) // 4
calibration_data_dev = calibration_data[train_size:]
calibration_data = calibration_data[:train_size]

# Use the configured GPU when CUDA is available, otherwise fall back to the CPU.
cuda_number = args.cuda_number
if torch.cuda.is_available():
    device = f"cuda:{cuda_number}"
else:
    device = 'cpu'
print("device is : ", device)

number of links: 4060
data sizes: 16 254 1846 1846
device is :  cpu


Defining the helper functions `guess_pair_yes` and `guess_pair_no`, which decide whether an ordered pair of facts is linked in `constraints_yes` or `constraints_no`, respectively


In [21]:
def guess_pair_yes(sentence, arg1, arg2):
    """Tell whether the ordered pair (arg1, arg2) is listed in ``constraints_yes``.

    Args:
        sentence: Sized collection; the pair is rejected when it holds fewer
            than two items.
        arg1: Object exposing ``getAttribute('sentence')``; first element of the pair.
        arg2: Object exposing ``getAttribute('sentence')``; second element of the pair.

    Returns:
        True when arg1's sentence is a key of ``constraints_yes`` and arg2's
        sentence is contained in the entry stored under that key; False
        otherwise, including when both arguments compare equal.
    """
    # A pair needs at least two candidates and two distinct endpoints.
    if len(sentence) < 2:
        return False
    if arg1 == arg2:
        return False

    first = arg1.getAttribute('sentence')
    second = arg2.getAttribute('sentence')
    # constraints_yes is read from module scope at call time.
    return first in constraints_yes and second in constraints_yes[first]

def guess_pair_no(sentence, narg1, narg2):
    """Tell whether the ordered pair (narg1, narg2) is listed in ``constraints_no``.

    Args:
        sentence: Sized collection; the pair is rejected when it holds fewer
            than two items.
        narg1: Object exposing ``getAttribute('sentence')``; first element of the pair.
        narg2: Object exposing ``getAttribute('sentence')``; second element of the pair.

    Returns:
        True when narg1's sentence is a key of ``constraints_no`` and narg2's
        sentence is contained in the entry stored under that key; False
        otherwise, including when both arguments compare equal.
    """
    # Too few candidates, or a node paired with itself, can never qualify.
    if len(sentence) < 2 or narg1 == narg2:
        return False

    source_text = narg1.getAttribute('sentence')
    target_text = narg2.getAttribute('sentence')

    # constraints_no is read from module scope at call time.
    if source_text not in constraints_no:
        return False
    return target_text in constraints_no[source_text]

Clearing the existing graphs, concepts, and relations

In [22]:
# Reset the class-level state of Graph, Concept and Relation before declaring the graph.
# NOTE(review): presumably this lets the graph cell be re-run in the same kernel without
# clashing with earlier declarations - confirm against the DomiKnowS documentation.
Graph.clear()
Concept.clear()
Relation.clear()

Constructing a graph to model the problem

In [23]:
# Declare the 'belief_bank' graph: concepts, the relations between them and the
# logical constraints over them.
with Graph('belief_bank') as graph:
    # A subject contains facts; the returned containment relation is reused by the sensors.
    subject = Concept(name='subject')
    facts = Concept(name='facts')
    subject_facts_contains, = subject.contains(facts)

    # fact_check is derived by calling the facts concept; it is the label predicted per fact.
    fact_check = facts(name='fact_check')
    # implication pairs two facts through the arg1 / arg2 relations.
    implication = Concept(name='implication')
    i_arg1, i_arg2 = implication.has_a(arg1=facts, arg2=facts)

    # nimplication pairs two facts through the narg1 / narg2 relations.
    nimplication = Concept(name='nimplication')
    ni_arg1, ni_arg2 = nimplication.has_a(narg1=facts, narg2=facts)

    # Constraint 1 - reading of the expression (NOTE(review): confirm path semantics):
    # if fact x is checked and an implication s is reachable from x, then the fact at
    # s's arg2 is checked as well.
    ifL(andL(fact_check('x'), existsL(implication('s', path=('x', implication)))), fact_check(path=('s', i_arg2)))
    # Two earlier formulations of the implication constraint, kept disabled:
    #ifL(implication('s'), ifL(fact_check(path=('s',i_arg1.reversed)),fact_check(path=('s',i_arg2.reversed )) ) )
    #ifL(andL(implication('s'),fact_check(path=('s',i_arg1.reversed)) ,fact_check(path=('s',i_arg2.reversed )) ) )
    # Constraint 2 - same shape for nimplication, but the fact at s's narg2 must NOT be checked.
    ifL(andL(fact_check('x'), existsL(nimplication('s', path=('x', nimplication)))), notL(fact_check(path=('s', ni_arg2))))

  i_arg1, i_arg2 = implication.has_a(arg1=facts, arg2=facts)
  ni_arg1, ni_arg2 = nimplication.has_a(narg1=facts, narg2=facts)


Configuring the sensors to process the data and make inferences

In [24]:
# Read the raw 'name', 'facts' and 'labels' entries of each data item onto the subject.
subject['name'] = ReaderSensor(keyword='name')
subject['facts'] = ReaderSensor(keyword='facts')
subject['labels'] = ReaderSensor(keyword='labels')

# make_facts turns the three subject properties into four fact properties at once:
# the containment relation plus "name", "sentence" and 'label'.
facts[subject_facts_contains,"name", "sentence", 'label'] = JointSensor(\
    subject['name'], subject['facts'], subject['labels'],forward=make_facts,device=device)
# Gold fact_check labels (label=True) produced by label_reader from the "label" property.
facts[fact_check] = FunctionalSensor(subject_facts_contains, "label", forward=label_reader, label=True,device=device)

# Candidate (arg1, arg2) fact pairs are kept when guess_pair_yes / guess_pair_no returns True.
implication[i_arg1.reversed, i_arg2.reversed] = CompositionCandidateSensor(facts['sentence'],relations=(i_arg1.reversed, i_arg2.reversed),forward=guess_pair_yes,device=device)
nimplication[ni_arg1.reversed, ni_arg2.reversed] = CompositionCandidateSensor(facts['sentence'],relations=(ni_arg1.reversed, ni_arg2.reversed),forward=guess_pair_no,device=device)


Configuring the learning model

In [25]:
# Attach the fact_check learner: a linear layer over simple embeddings, or the
# RoBERTa-based module, depending on args.simple_model.
if args.simple_model:
    facts["emb"] = JointSensor(
        "name", "sentence",
        forward=SimpleTokenizer(device),
        device=device,
    )
    # NOTE(review): 96 is presumably the width of the "emb" vectors - confirm in utils.
    facts[fact_check] = ModuleLearner(
        "emb",
        module=torch.nn.Linear(96, 2),
        device=device,
    )
else:
    facts["token_ids", "Mask"] = JointSensor(
        "name", "sentence",
        forward=RobertaTokenizer(),
        device=device,
    )
    facts[fact_check] = ModuleLearner(
        "token_ids", "Mask",
        module=BBRobert(),
        device=device,
    )

# Output file handed to the programs that accept an `f` argument; it is closed in the
# evaluation cell further down.
f = open("output_save.txt", "w")

# Exactly one program is built. The flags are checked in priority order
# primaldual > IML > SAM, and the plain SolverPOIProgram is used when none is set.
if args.primaldual:
    program = PrimalDualProgram(
        graph,
        SolverModel,
        poi=[facts[fact_check], implication, nimplication],
        inferTypes=['ILP', 'local/argmax'],
        loss=MacroAverageTracker(NBCrossEntropyLoss()),
        metric={
            'ILP': PRF1Tracker(DatanodeCMMetric()),
            'softmax': PRF1Tracker(DatanodeCMMetric('local/argmax')),
        },
        beta=args.beta,
        device=device,
        f=f,
    )
elif args.IML:
    program = IMLProgram(
        graph,
        poi=[facts[fact_check], implication, nimplication],
        inferTypes=['ILP', 'local/argmax'],
        loss=MacroAverageTracker(BCEWithLogitsIMLoss(lmbd=args.beta)),
        metric={
            'ILP': PRF1Tracker(DatanodeCMMetric()),
            'softmax': PRF1Tracker(DatanodeCMMetric('local/argmax')),
        },
    )
elif args.SAM:
    program = SampleLossProgram(
        graph,
        SolverModel,
        poi=[facts[fact_check], implication, nimplication],
        inferTypes=['ILP', 'local/argmax'],
        metric={'argmax': PRF1Tracker(DatanodeCMMetric('local/argmax'))},
        loss=MacroAverageTracker(NBCrossEntropyLoss()),
        sample=True,
        sampleSize=50,
        sampleGlobalLoss=True,
        beta=args.beta,
        device=device,
    )
else:
    program = SolverPOIProgram(
        graph,
        poi=[facts[fact_check], implication, nimplication],
        inferTypes=['ILP', 'local/argmax'],
        loss=MacroAverageTracker(NBCrossEntropyLoss()),
        metric={
            'ILP': PRF1Tracker(DatanodeCMMetric()),
            'softmax': PRF1Tracker(DatanodeCMMetric('local/argmax')),
        },
        f=f,
    )



Training the program

In [27]:
# Train on the calibration split and validate on the held-out quarter after each epoch.
# The optimizer is built lazily from the parameters the program passes to the factory.
program.train(
    calibration_data,
    valid_set=calibration_data_dev,
    train_epoch_num=args.cur_epoch,
    Optim=lambda param: AdamW(param, lr=args.learning_rate, eps=1e-9),
    device=device,
)

Epoch 1 Training: 100%|██████████| 12/12 [00:06<00:00,  1.89it/s]
Epoch 1 Validation: 100%|██████████| 4/4 [00:02<00:00,  1.43it/s]
Epoch 2 Training: 100%|██████████| 12/12 [00:09<00:00,  1.26it/s]
Epoch 2 Validation: 100%|██████████| 4/4 [00:01<00:00,  2.00it/s]
Epoch 3 Training: 100%|██████████| 12/12 [00:07<00:00,  1.64it/s]
Epoch 3 Validation: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]
Epoch 4 Training: 100%|██████████| 12/12 [00:06<00:00,  1.72it/s]
Epoch 4 Validation: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]
Epoch 5 Training: 100%|██████████| 12/12 [00:08<00:00,  1.49it/s]
Epoch 5 Validation: 100%|██████████| 4/4 [00:02<00:00,  1.97it/s]


Checking the accuracy of the model and constraints using a subset of the silver_data

In [28]:
# Average the per-constraint 'satisfied' score over the first 40 silver items.
# ac_ accumulates the scores, t_ counts how many constraint results were seen.
ac_, t_ = 0, 0
# NOTE(review): populate runs on "cpu" while training used `device`; confirm this is intended
# when a GPU is available.
for datanode in program.populate(silver_data[:40], device="cpu"):
    datanode.inferILPResults()
    # verifyResultsLC is called once; the earlier duplicate call stored a result that was
    # never read.
    # NOTE(review): it is called with default arguments - confirm which inference result
    # (ILP or local argmax) it verifies by default.
    verifyResultILP = datanode.verifyResultsLC()
    ac_ += sum(result['satisfied'] for result in verifyResultILP.values())
    t_ += len(verifyResultILP)

# Guard against an empty run so the cell reports it instead of raising ZeroDivisionError.
if t_:
    print("constraint accuracy: ", ac_ / t_)
else:
    print("constraint accuracy: n/a (no logical constraints were verified)")

# Leftover keyword arguments from an earlier version of a call, kept for reference:
#, c_warmup_iters=0,test_set=silver_data
# Close the output file opened when the program was built.
# NOTE(review): the program was handed `f` at construction and program.test runs after
# this close - confirm test() does not write to it.
f.close()
# Re-read the data with a larger batch and sample size for the final test.
# This rebinds the module-level constraints_yes / constraints_no, which guess_pair_yes and
# guess_pair_no look up at call time, so the test below uses the freshly read constraints.
_,silver_data_test,constraints_yes,constraints_no=read_data(batch_size=32*8,sample_size=40)

# Evaluate on the first 30 items of the re-read silver data.
program.test(silver_data_test[:30], device=device)


40it [00:29,  1.36it/s]


constraint accuracy:  100.0
number of links: 4060
data sizes: 7 85 1846 1846


Testing: 100%|██████████| 30/30 [01:05<00:00,  2.18s/it]
