In [None]:
# Fetch the DomiKnowS sources, enter the checkout and switch to origin/Tasks.
!git clone https://github.com/HLR/DomiKnowS.git
%cd DomiKnowS
!git checkout origin/Tasks
# NOTE(review): this installs the package by name (unpinned), not the checkout
# cloned above — confirm which copy is meant to be imported.
!pip install DomiKnowS

import logging
logging.basicConfig(level=logging.INFO)

# Give the __main__ module a __file__ attribute.
# NOTE(review): presumably DomiKnowS reads __main__.__file__, which a notebook
# kernel does not provide — TODO confirm why "graph.py" is the chosen value.
import __main__
__main__.__file__="graph.py"

In [None]:
# Download the small English spaCy model (en_core_web_sm).
!python -m spacy download en_core_web_sm
# Install torchtext pinned to 0.9 without pulling in its dependencies.
# NOTE(review): presumably --no-dependencies keeps pip from replacing the
# already-installed torch — confirm.
!pip install torchtext==0.9 --no-dependencies

First, build the graph that specifies the domain knowledge for this problem

In [4]:
import logging

logging.basicConfig(level=logging.INFO)
from domiknows.graph import Graph, Concept, Relation
from domiknows.graph.relation import disjoint

# Call the class-level clear() on Graph, Concept and Relation before building.
# NOTE(review): presumably this drops definitions left over from an earlier run
# of this cell in the same kernel — confirm.
Graph.clear()
Concept.clear()
Relation.clear()

with Graph('global') as graph:
  # Concept representing one movie review.
  review = Concept(name='review')

  # Two concepts obtained by calling `review`.
  # NOTE(review): presumably these are the sentiment labels derived from review — confirm.
  positive = review(name='positive')
  negative = review(name='negative')

  # Declare `positive` and `negative` as disjoint.
  disjoint(positive, negative)

Log file for dataNode is in: /content/DomiKnowS/logs/datanode.log


Define the readers for the labels and text



In [5]:
from domiknows.sensor.pytorch.sensors import ReaderSensor

# Attach a reader for the 'text' keyword to the review's 'text' property.
# NOTE(review): presumably the keyword is the key looked up in each data item — confirm.
review['text'] = ReaderSensor(keyword='text')

# Readers for the 'positive' and 'negative' keywords, flagged with label=True.
# NOTE(review): presumably label=True marks these as ground truth rather than input — confirm.
review[positive] = ReaderSensor(keyword='positive', label=True)
review[negative] = ReaderSensor(keyword='negative', label=True)

Define the model parameters

In [6]:
# Dimension requested from GloVe in EmbeddingSensor and input size of the LSTM.
embed_size = 300
# Hidden size passed to nn.LSTM; the linear heads take hidden_size * 2 inputs.
hidden_size = 100
# Output width of each nn.Linear prediction head.
num_classes = 2
# Probability passed to nn.Dropout in LSTMModule.
drop_rate = 0.5

Define a sensor that converts the raw text into GloVe embedding vectors, and attach it to the graph so that the embedding representation of each review is computed from its raw text.

In [7]:
from domiknows.sensor.pytorch.sensors import FunctionalSensor
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import torch

class EmbeddingSensor(FunctionalSensor):
  """Sensor that turns one raw review string into a tensor of GloVe vectors.

  The first input is tokenized with spaCy and each token is looked up in the
  GloVe 840B vocabulary. The result has shape (num_tokens, 1, embed_size):
  one sequence with a batch axis of size 1 at dim 1.
  """

  def __init__(self, *pres, **kwarg):
    super().__init__(*pres, **kwarg)

    # Loads (and on first use downloads) the GloVe 840B vectors.
    self.vocab = GloVe(name='840B', dim=embed_size)
    # Name the spaCy model that the notebook installs explicitly. The bare
    # 'en' shortcut is not a loadable model name in spaCy 3 and only works
    # through torchtext's deprecated fallback mapping.
    self.tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

  def forward(self, *inputs):
    """Embed the review text passed as the first input.

    Args:
      *inputs: Values of the properties this sensor was built on; only the
        first one (the raw text) is used.

    Returns:
      A float tensor of shape (num_tokens, 1, embed_size) on self.device.
    """
    text = inputs[0]
    tokens = self.tokenizer(text)

    # One row per token, filled from the GloVe lookup.
    rev_emb = torch.empty((len(tokens), embed_size))
    for i, tok in enumerate(tokens):
      rev_emb[i] = self.vocab[tok]

    # pad_sequence on a one-element list only inserts the batch axis at dim 1,
    # which is the time-major layout nn.LSTM expects by default.
    padded = pad_sequence([rev_emb])

    return padded.to(device=self.device)

# Compute 'text_embed' for every review from its 'text' property.
review['text_embed'] = EmbeddingSensor('text')

.vector_cache/glove.840B.300d.zip: 2.18GB [06:50, 5.30MB/s]                            
100%|█████████▉| 2196016/2196017 [05:34<00:00, 6558.57it/s]


Define a PyTorch module that wraps an LSTM to produce an RNN representation of the text.


In [8]:
from torch import nn

class LSTMModule(nn.Module):
  """Bidirectional LSTM that summarises a sequence as one vector per batch item.

  The summary concatenates the forward direction's output at the last time
  step with the backward direction's output at the first time step, then
  applies dropout.
  """

  def __init__(self):
    super().__init__()

    self.rnn = nn.LSTM(embed_size, hidden_size, bidirectional=True)
    self.dropout = nn.Dropout(p=drop_rate)

  def forward(self, input):
    """Encode a time-major sequence tensor.

    Args:
      input: Tensor fed to the LSTM. The LSTM is built without batch_first,
        so the layout is (seq_len, batch, features).

    Returns:
      Tensor of shape (batch, 2 * hidden) after dropout.
    """
    seq_out, _ = self.rnn(input)

    # The feature axis holds the forward outputs first, then the backward ones.
    fwd_states, bwd_states = seq_out.chunk(2, dim=2)
    last_fwd = fwd_states[-1]
    last_bwd = bwd_states[0]

    summary = torch.cat((last_fwd, last_bwd), dim=1)
    return self.dropout(summary)

Define a learner using the previously specified pytorch module to create the RNN representation.

Then, specify learners that apply a PyTorch linear layer to that RNN representation to make the predictions.

In [9]:
from domiknows.sensor.pytorch.learners import ModuleLearner
from torch import nn

# Learner that runs LSTMModule on 'text_embed' and stores the result as 'rnn_embed'.
review['rnn_embed'] = ModuleLearner('text_embed', module=LSTMModule())

# One independent linear head per concept. The input width is hidden_size * 2
# because LSTMModule concatenates the forward and backward outputs.
review[positive] = ModuleLearner('rnn_embed', module=nn.Linear(hidden_size * 2, num_classes))
review[negative] = ModuleLearner('rnn_embed', module=nn.Linear(hidden_size * 2, num_classes))

Define a learnable model from the previously specified graph.

In [10]:
from domiknows.program import SolverPOIProgram
from domiknows.program.metric import MacroAverageTracker, PRF1Tracker, DatanodeCMMetric
from domiknows.program.loss import NBCrossEntropyLoss

# Build the trainable program from the graph. It is configured with two
# inference types ('ILP' and 'local/argmax'), a macro-averaged cross-entropy
# loss, and one PRF1 tracker per reported metric key.
program = SolverPOIProgram(
    graph,
    inferTypes=['ILP', 'local/argmax'],
    loss=MacroAverageTracker(NBCrossEntropyLoss()),
    metric={
        'ILP': PRF1Tracker(DatanodeCMMetric()),
        'argmax': PRF1Tracker(DatanodeCMMetric('local/argmax')),
    },
)

Load the IMDB data



In [11]:
# Download the aclImdb archive into data/ and unpack it there.
# NOTE(review): re-running the cell downloads the archive again — consider guarding on an existing file.
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data/
!echo 'Extracting aclImdb_v1.tar.gz...'
!tar -xzf data/aclImdb_v1.tar.gz -C data/

import glob
import random
import os

def get_data(directory, label):
  """Load every review of one sentiment class from a dataset split folder.

  Args:
    directory: Path of the split folder, e.g. 'data/aclImdb/train'.
    label: Name of the sub-folder to read; 'pos' and 'neg' are the values
      that set a label flag.

  Returns:
    A list with one dict per '*.txt' file found in directory/label, in sorted
    path order. Each dict holds 'text' (the file contents), 'positive'
    ([1] if label == 'pos' else [0]) and 'negative' ([1] if label == 'neg'
    else [0]). The list is empty when no file matches.
  """
  positive_flag = 1 if label == 'pos' else 0
  negative_flag = 1 if label == 'neg' else 0

  data_all = []
  # Sorted so the result does not depend on filesystem enumeration order.
  for path in sorted(glob.glob(os.path.join(directory, label, '*.txt'))):
    # Explicit encoding: without it, decoding depends on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
      text = f.read()
    data_all.append({
      'text': text,
      'positive': [positive_flag],
      'negative': [negative_flag],
    })
  return data_all

# Each split is the positive reviews followed by the negative ones, then
# shuffled in place so the two classes are interleaved.
train_data = get_data('data/aclImdb/train', 'pos') + get_data('data/aclImdb/train', 'neg')
random.shuffle(train_data)

test_data = get_data('data/aclImdb/test', 'pos') + get_data('data/aclImdb/test', 'neg')
random.shuffle(test_data)

# Show one training item as a sanity check of the dict layout.
print(train_data[0])


--2023-07-11 19:36:04--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘data/aclImdb_v1.tar.gz’


2023-07-11 19:36:08 (23.7 MB/s) - ‘data/aclImdb_v1.tar.gz’ saved [84125825/84125825]

Extracting aclImdb_v1.tar.gz...
{'text': "Now I recently had the viewing pleasure to watch the hilarious comedy Bachelor Party, one of my new favorite comedies, laughed until it just hurt type of movies. So I naturally wanted to see the sequel, hoping it would have the same laughs, but instead Bachelor Party 2: The Last Temptation is made by the American Pie generation where it's tasteless and defeats the hole purpose of the first film. Yeah, the first film has nudity, but it doesn't show in every single scene. Also the plot is exactly the same from th

Train the model

In [12]:
# Index at 80% of the shuffled training list; the validation slice starts here,
# so it never overlaps the first 5000 items used for training.
split_idx = int(len(train_data)*0.8)

# Train on a 5000-review subset, validate on 2000 held-out training reviews and
# test on 2000 test reviews. Fall back to the CPU when no CUDA device is
# present instead of failing on a CPU-only runtime.
program.train(train_data[:5000],
              valid_set=train_data[split_idx:split_idx+2000],
              test_set=test_data[:2000], train_epoch_num=10, Optim=torch.optim.Adam,
              device='cuda' if torch.cuda.is_available() else 'cpu')

Epoch 1 Training:   0%|          | 0/5000 [00:00<?, ?it/s]

Log file for ilpOntSolver is in: /content/DomiKnowS/logs/ilpOntSolver.log
Log file for ilpOntSolverTime is in: /content/DomiKnowS/logs/ilpOntSolver.log


Epoch 1 Training: 100%|██████████| 5000/5000 [04:58<00:00, 16.78it/s]
Epoch 1 Validation: 100%|██████████| 2000/2000 [01:48<00:00, 18.43it/s]
Epoch 2 Training: 100%|██████████| 5000/5000 [04:42<00:00, 17.69it/s]
Epoch 2 Validation: 100%|██████████| 2000/2000 [01:38<00:00, 20.30it/s]
Epoch 3 Training: 100%|██████████| 5000/5000 [04:53<00:00, 17.04it/s]
Epoch 3 Validation: 100%|██████████| 2000/2000 [01:45<00:00, 18.90it/s]
Epoch 4 Training: 100%|██████████| 5000/5000 [04:43<00:00, 17.61it/s]
Epoch 4 Validation: 100%|██████████| 2000/2000 [01:43<00:00, 19.31it/s]
Epoch 5 Training: 100%|██████████| 5000/5000 [04:49<00:00, 17.28it/s]
Epoch 5 Validation: 100%|██████████| 2000/2000 [01:46<00:00, 18.70it/s]
Epoch 6 Training: 100%|██████████| 5000/5000 [04:55<00:00, 16.94it/s]
Epoch 6 Validation: 100%|██████████| 2000/2000 [01:45<00:00, 18.87it/s]
Epoch 7 Training: 100%|██████████| 5000/5000 [04:49<00:00, 17.28it/s]
Epoch 7 Validation: 100%|██████████| 2000/2000 [01:46<00:00, 18.84it/s]
Epoch 