In [None]:
# 1. Link with GoogleDrive for easy file import
#     The following two steps are how to link colab(fixed)
from google.colab import drive
drive.mount('/content/drive/')

# Optional, one gives the path directly and then just imports it
#     Another, %cd to that path and then import by filename

dir_path = '/content/drive/MyDrive/2023NLPCourse/Assignment2/'

# The second one goes path
%cd /content/drive/MyDrive/2023NLPCourse/Assignment2/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/2023NLPCourse/Assignment2


# New Section

In [None]:
# 2. install for libraries
!pip install transformers wikipedia newspaper3k GoogleNews pyvis==0.3.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
from GoogleNews import GoogleNews
import IPython
from pyvis.network import Network

In [None]:
# 3. Load the REBEL model and tokenizer.
#    Both must come from the same checkpoint, so the id is defined once.
REBEL_MODEL_NAME = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_MODEL_NAME)

In [None]:
# 4. Define the functions for generating kb file
def extract_relations_from_model_output(text):
    """Parse one decoded model output string into a list of relation dicts.

    The text is stripped, the "<s>", "<pad>" and "</s>" markers are removed,
    and the remainder is split on whitespace. The tags "<triplet>", "<subj>"
    and "<obj>" select which field the following plain tokens belong to:
    tokens after "<triplet>" form the head, tokens after "<subj>" form the
    tail, and tokens after "<obj>" form the relation type. Tokens seen before
    any tag are ignored.

    Args:
        text: decoded output string containing the tags described above.

    Returns:
        A list of dicts with the keys 'head', 'type' and 'tail'.
    """
    relations = []
    # One token buffer per output field; joined with single spaces on emit.
    buffers = {'head': [], 'type': [], 'tail': []}
    target = None  # field that currently receives plain tokens

    def flush():
        # Snapshot the buffers into a new relation dict.
        relations.append({
            'head': ' '.join(buffers['head']),
            'type': ' '.join(buffers['type']),
            'tail': ' '.join(buffers['tail']),
        })

    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    for token in cleaned.split():
        if token == "<triplet>":
            # A new head starts: emit the pending relation, if any.
            if buffers['type']:
                flush()
                buffers['type'].clear()
            buffers['head'].clear()
            target = 'head'
        elif token == "<subj>":
            # A new tail starts for the same head. The type buffer is NOT
            # reset here; it is only reset by "<obj>" or "<triplet>".
            if buffers['type']:
                flush()
            buffers['tail'].clear()
            target = 'tail'
        elif token == "<obj>":
            buffers['type'].clear()
            target = 'type'
        elif target is not None:
            buffers[target].append(token)

    # Emit the trailing relation only when all three fields are non-empty.
    if all(buffers.values()):
        flush()
    return relations

def from_text_to_kb(text, article_url, span_length=128, article_title=None,
                    article_publish_date=None, verbose=False, extend_kb=None):
    """Extract relations from ``text`` and store them in a knowledge base.

    The text is tokenized once, cut into ``span_length``-token windows that
    overlap just enough to spread the windows over the input, and every
    window is passed to ``model.generate``. Each generated sequence is decoded,
    parsed with ``extract_relations_from_model_output`` and its relations are
    added to the KB, tagged with ``article_url`` and the window they came from.

    Args:
        text: the raw text to process.
        article_url: key under which the relations' spans are recorded.
        span_length: number of tokens per window.
        article_title: forwarded to ``KB.add_relation``.
        article_publish_date: forwarded to ``KB.add_relation``.
        verbose: when true, print token count, span count and span boundaries.
        extend_kb: existing KB to add to; a new ``KB`` is created when None.

    Returns:
        The KB that received the relations (``extend_kb`` if it was given).
    """
    # Tokenize the whole text in one go; windows are slices of this encoding.
    encoded = tokenizer([text], return_tensors="pt")

    # Work out how many windows are needed and how much they must overlap.
    num_tokens = len(encoded["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    surplus = num_spans * span_length - num_tokens
    overlap = math.ceil(surplus / max(num_spans - 1, 1))
    # Consecutive windows start (span_length - overlap) tokens apart.
    stride = span_length - overlap
    spans_boundaries = [[idx * stride, idx * stride + span_length]
                        for idx in range(num_spans)]
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # Slice ids and mask per window and stack them into one batch.
    id_rows = [encoded["input_ids"][0][lo:hi] for lo, hi in spans_boundaries]
    mask_rows = [encoded["attention_mask"][0][lo:hi]
                 for lo, hi in spans_boundaries]

    # Beam search that returns several candidate sequences per window.
    num_return_sequences = 3
    generated_tokens = model.generate(
        input_ids=torch.stack(id_rows),
        attention_mask=torch.stack(mask_rows),
        max_length=256,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=num_return_sequences,
    )

    # Keep the special tokens: the relation parser relies on them.
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    kb = KB() if extend_kb is None else extend_kb

    # Outputs are grouped per window: num_return_sequences consecutive
    # predictions belong to the same span.
    for pred_index, sentence_pred in enumerate(decoded_preds):
        current_span_index = pred_index // num_return_sequences
        for relation in extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {
                article_url: {
                    "spans": [spans_boundaries[current_span_index]]
                }
            }
            kb.add_relation(relation, article_title, article_publish_date)

    return kb

class KB():
    """In-memory knowledge base of entities, relations and article sources.

    Attributes:
        entities: maps an entity title to its data (e.g. "url", "summary").
        relations: list of dicts with the keys "head", "type", "tail" and
            "meta"; "meta" maps an article URL to {"spans": [...]}.
        sources: maps an article URL to {"article_title": ...,
            "article_publish_date": ...}.

    Entity titles that could not be resolved on Wikipedia carry a trailing
    "*" (see ``get_wikipedia_data``).
    """

    def __init__(self):
        self.entities = {}   # { entity_title: {...} }
        self.relations = []  # [ head: entity_title, type: ..., tail: entity_title,
                             #   meta: { article_url: { spans: [...] } } ]
        self.sources = {}    # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """Copy every relation, entity and source of ``kb2`` into this KB.

        The titles in ``kb2`` are already resolved, so they are taken over
        as-is instead of being looked up again (a second lookup of an
        unresolved "X*" title would turn it into "X**"). Relations are copied,
        so ``kb2`` is neither mutated nor aliased, and all article URLs of a
        relation are carried over, not just the first one.
        """
        for r in kb2.relations:
            # Independent copy: own dict, own meta dicts, own spans lists.
            rel = dict(r, meta={url: dict(data, spans=list(data["spans"]))
                                for url, data in r["meta"].items()})

            for title in (rel["head"], rel["tail"]):
                if title in kb2.entities:
                    self.add_entity(dict(kb2.entities[title], title=title))

            for url in rel["meta"]:
                if url not in self.sources:
                    # Fall back to an empty record if kb2 lacks the source.
                    self.sources[url] = dict(kb2.sources.get(url, {
                        "article_title": None,
                        "article_publish_date": None
                    }))

            self._store_relation(rel)

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Fold the meta information of ``r2`` into the equal stored relation.

        Every article URL in ``r2["meta"]`` is processed: unknown URLs are
        added (as a copy), known URLs receive the spans they do not have yet.
        If no equal relation is stored, ``r2`` itself is appended.
        """
        r1 = next((r for r in self.relations
                   if self.are_relations_equal(r2, r)), None)
        if r1 is None:
            self.relations.append(r2)
            return

        for article_url, data in r2["meta"].items():
            if article_url not in r1["meta"]:
                # different article: take over a copy of its spans
                r1["meta"][article_url] = dict(data, spans=list(data["spans"]))
            else:
                # existing article: add only the spans not recorded yet
                known_spans = r1["meta"][article_url]["spans"]
                known_spans += [span for span in data["spans"]
                                if span not in known_spans]

    def get_wikipedia_data(self, candidate_entity):
        """Look up ``candidate_entity`` on Wikipedia (exact title, no suggestion).

        Returns:
            A dict with "title", "url" and "summary". When the lookup fails
            for any reason, the title is ``candidate_entity + "*"`` and url
            and summary are empty strings, so the relation is still kept.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
        except Exception:
            # Best effort: keep the entity, flagged with "*". Catching
            # Exception (not a bare except) lets Ctrl-C interrupt the run.
            return {
                "title": candidate_entity + "*",
                "url": "",
                "summary": ""
            }

    def add_entity(self, e):
        """Store entity ``e`` under its "title"; the other keys become its data."""
        self.entities[e["title"]] = {k: v for k, v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date):
        """Resolve the entities of ``r`` and add (or merge) it into the KB.

        ``r`` is modified in place: its head and tail are replaced by the
        titles returned by ``get_wikipedia_data``. The first article URL in
        ``r["meta"]`` is registered as a source with the given title and date
        unless that URL is already known.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # A lookup returning None (not the case for the implementation above,
        # but possible for an override) drops the relation.
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        self._store_relation(r)

    def _store_relation(self, r):
        """Append ``r`` if it is new, otherwise merge it into the stored one."""
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print all entities, relations and sources, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [None]:
# 6. load text and generate kb
# The with-block closes the file even if reading fails.
with open("Data/Data.txt", "r") as fp:
    text = fp.read()
kb = from_text_to_kb(text, "", verbose=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1210 > 1024). Running this sequence through the model will result in indexing errors


Input has 1210 tokens
Input has 10 spans
Span boundaries are [[0, 128], [120, 248], [240, 368], [360, 488], [480, 608], [600, 728], [720, 848], [840, 968], [960, 1088], [1080, 1208]]




  lis = BeautifulSoup(html).find_all('li')


In [None]:
# 7. save kb
import pickle

# The with-block guarantees the pickle is flushed to disk and the file closed.
with open("Data/Rebel.kb", 'wb') as kb_file:
    pickle.dump(kb, kb_file)

In [10]:
# Print every entity, relation and source stored in the knowledge base.
kb.print()

Entities:
  ('John McCarthy*', {'url': '', 'summary': ''})
  ('Computer scientist', {'url': 'https://en.wikipedia.org/wiki/Computer_scientist', 'summary': 'A computer scientist is a scholar who specializes in the academic study of computer science.Computer scientists typically work on the theoretical side of computation, as opposed to the hardware side on which computer engineers mainly focus (although there is overlap). Although computer scientists can also focus their work and research on specific areas (such as algorithm and data structure development and design, software engineering, information theory, database theory, computational complexity theory, numerical analysis, programming language theory, computer graphics, and computer vision), their foundation is the theoretical study of computing from which these other fields derive.A primary goal of computer scientists is to develop or validate models, often mathematical, to describe the properties of computational systems (processo

In [12]:
# List the entity titles (the keys of kb.entities), one per line.
# A trailing "*" marks a title whose Wikipedia lookup failed.
for i in kb.entities:
  print(i)

John McCarthy*
Computer scientist
Turing Award
United States National Medal of Science*
Kyoto Prize
September 4, 1927*
Stanford University
ALGOL
Cromane
County Kerry
Ireland
Republican*
Alan Turing
Princeton University
Marvin Minsky
Allen Newell
Herbert A. Simon
Donald C. Spencer
Nathaniel Rochester
Artificial intelligence
Claude Shannon
ALGOL 60
August 1959
Compatible Time-Sharing System
1961
time-sharing systems*
BBN Time-Sharing System
Dartmouth Time Sharing System
Space fountain
1982
The Robot and the Baby*
2001
Short story
Social network
Internet culture
Carolyn Talcott
SRI International
October 24, 2011*


In [13]:
# Print each stored relation dict (head, type, tail and its meta spans).
for r in kb.relations:
    print(r)

{'head': 'John McCarthy*', 'type': 'occupation', 'tail': 'Computer scientist', 'meta': {'': {'spans': [[0, 128]]}}}
{'head': 'John McCarthy*', 'type': 'award received', 'tail': 'Turing Award', 'meta': {'': {'spans': [[0, 128]]}}}
{'head': 'John McCarthy*', 'type': 'award received', 'tail': 'United States National Medal of Science*', 'meta': {'': {'spans': [[0, 128]]}}}
{'head': 'John McCarthy*', 'type': 'award received', 'tail': 'Kyoto Prize', 'meta': {'': {'spans': [[0, 128]]}}}
{'head': 'John McCarthy*', 'type': 'date of birth', 'tail': 'September 4, 1927*', 'meta': {'': {'spans': [[0, 128]]}}}
{'head': 'John McCarthy*', 'type': 'employer', 'tail': 'Stanford University', 'meta': {'': {'spans': [[0, 128]]}}}
{'head': 'ALGOL', 'type': 'designed by', 'tail': 'John McCarthy*', 'meta': {'': {'spans': [[0, 128]]}}}
{'head': 'Cromane', 'type': 'located in the administrative territorial entity', 'tail': 'County Kerry', 'meta': {'': {'spans': [[120, 248]]}}}
{'head': 'Cromane', 'type': 'count