# 1. 与 Google Drive 连接

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

dir_path = '/content/drive/MyDrive/2023NLPCourse/Assignment2/'

%cd /content/drive/MyDrive/2023NLPCourse/Assignment2/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/2023NLPCourse/Assignment2


# 2. 安装并导入相应的包

In [16]:
!pip install pyvis==0.3.1
!pip install wikipedia
import os
import pickle

import networkx as nx
import pandas as pd
import wikipedia
from pyvis import network

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 3. 导入保存的 kb 文件，用于生成知识图谱

In [3]:
class KB():
    """In-memory knowledge base of entities, relations and source articles.

    Attributes:
        entities:  { entity_title: {"url": ..., "summary": ...} }
        relations: [ {"head": entity_title, "type": ..., "tail": entity_title,
                      "meta": { article_url: {"spans": [...]} }} ]
        sources:   { article_url: {"article_title": ...,
                                   "article_publish_date": ...} }
    """

    def __init__(self):
        self.entities = {}
        self.relations = []
        self.sources = {}

    def merge_with_kb(self, kb2):
        """Add every relation of ``kb2`` to this KB.

        Each relation goes through ``add_relation``, so its head and tail are
        looked up on Wikipedia again and the relation dicts of ``kb2`` are
        renamed in place.

        NOTE(review): a head/tail that already ends with the "*" not-found
        marker is looked up with the marker included, so a failed lookup
        appends a second "*". Confirm whether that is intended.
        """
        for r in kb2.relations:
            # A relation's meta is keyed by article URL; the first key is used.
            article_url = list(r["meta"].keys())[0]
            source_data = kb2.sources[article_url]
            self.add_relation(r, source_data["article_title"],
                              source_data["article_publish_date"])

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Fold the meta of ``r2`` into the equal relation already stored.

        Raises:
            IndexError: if no stored relation equals ``r2``.
        """
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        article_url = list(r2["meta"].keys())[0]
        if article_url not in r1["meta"]:
            # Relation seen in a new article: adopt that article's meta entry.
            r1["meta"][article_url] = r2["meta"][article_url]
        else:
            # Same article: append only the spans that are not recorded yet.
            spans_to_add = [span for span in r2["meta"][article_url]["spans"]
                            if span not in r1["meta"][article_url]["spans"]]
            r1["meta"][article_url]["spans"] += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Resolve ``candidate_entity`` to a Wikipedia page.

        Returns:
            A dict with "title", "url" and "summary". When the lookup fails
            for any reason, the title is the candidate followed by "*" and
            url/summary are empty strings, so the entity is still kept.
        """
        try:
            # Attribute reads stay inside the try block so that a failure
            # while reading them also falls back to the placeholder entity.
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Deliberately broad best-effort fallback, but no longer a bare
            # except: KeyboardInterrupt / SystemExit now propagate, so the
            # cell can be interrupted during long batches of lookups.
            entity_data = {
                "title": candidate_entity + "*",
                "url": "",
                "summary": ""
            }
            return entity_data

    def add_entity(self, e):
        """Store entity ``e`` under its title, keeping its remaining fields."""
        self.entities[e["title"]] = {k: v for k, v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date):
        """Resolve the entities of relation ``r`` and add or merge it.

        Args:
            r: relation dict with "head", "type", "tail" and "meta". It is
                modified in place (head/tail become the resolved titles) and
                is stored as-is when the relation is new.
            article_title: title recorded for the relation's source article.
            article_publish_date: publish date recorded for that article.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # get_wikipedia_data currently always returns a dict; this guard only
        # triggers if that method is changed to return None for unknown
        # entities.
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print all entities, relations and sources, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")
            
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so only load .kb files that you produced yourself.
# The file presumably holds a pickled KB instance (later cells read
# kb.entities and kb.relations), which requires the KB class to be defined
# before loading.
with open("Data/Rebel.kb", "rb") as kb_file:
    kb = pickle.load(kb_file)

# 4. 导入 CoreNLP 额外的关系对

In [54]:
# Read the extra relation triples: one per line, tab-separated as
# head <TAB> relation type <TAB> tail.
with open("Data/CoreNLP_Expanded_kp.txt", "r", encoding="utf-8") as f:
    text = f.readlines()

# Build the relation list and the list of entity mentions.
Relations = []
Entities_lis = []
for line in text:
    # NOTE(review): the "per:" / "org:" prefixes are removed from the whole
    # line, not only from the relation-type field. Confirm that entity names
    # never contain these substrings.
    relation = line.replace("\n", '').replace("per:", '').replace("org:", '').split("\t")
    if len(relation) < 3:
        # Blank or malformed line (e.g. a trailing empty line): skip it
        # instead of failing with IndexError.
        continue
    Relations.append({"head": relation[0], "type": relation[1], "tail": relation[2]})
    Entities_lis.append(relation[0])
    Entities_lis.append(relation[2])

# De-duplicate while keeping first-seen order. list(set(...)) yields an order
# that can change between runs, which made node order and CSV output unstable.
Entities = list(dict.fromkeys(Entities_lis))

# 6. 实体合并

In [11]:
# Ordered union of entity names: the KB's entity titles first (in KB order),
# followed by every CoreNLP entity that is not already present.
All_Ens = list(dict.fromkeys([*kb.entities, *Entities]))

# 7. 根据合并的实体，关系对，通过 pyvis 库的 net 模块生成知识图谱

In [18]:
# Directed pyvis network configured for display inside the notebook.
network_options = {
    "directed": True,
    "width": "1200px",
    "height": "1000px",
    "bgcolor": "#FFFFFF",
    "notebook": True,
}
net = network.Network(**network_options)

# Nodes: one per merged entity name.
color_entity = "#00FF00"  # defined here but not passed to add_node in this cell
for entity_name in All_Ens:
    net.add_node(entity_name)
    print("add note", entity_name)

# Edges from the loaded KB (logged as they are added).
for kb_relation in kb.relations:
    source, target = kb_relation["head"], kb_relation["tail"]
    net.add_edge(source, target, title=kb_relation["type"], label=kb_relation["type"])
    print("add relation", source, " ", target)

# Edges from the CoreNLP relation list (added without logging).
for extra_relation in Relations:
    net.add_edge(
        extra_relation["head"],
        extra_relation["tail"],
        title=extra_relation["type"],
        label=extra_relation["type"],
    )

net.repulsion(node_distance=230, damping=0.01)

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
add note John McCarthy*
add note Computer scientist
add note Turing Award
add note United States National Medal of Science*
add note Kyoto Prize
add note September 4, 1927*
add note Stanford University
add note ALGOL
add note Cromane
add note County Kerry
add note Ireland
add note Republican*
add note Alan Turing
add note Princeton University
add note Marvin Minsky
add note Allen Newell
add note Herbert A. Simon
add note Donald C. Spencer
add note Nathaniel Rochester
add note Artificial intelligence
add note Claude Shannon
add note ALGOL 60
add note August 1959
add note Compatible Time-Sharing System
add note 1961
add note time-sharing systems*
add note BBN Time-Sharing System
add note Dartmouth Time Sharing System
add note Space fountain
add note 1982
add note The Robot and the Baby*
add note 2001
add note Short story
add note Social network
add note Internet culture
add note Carolyn Talcott
add note SR

# 8. 保存生成的知识图谱到 html 文件

In [None]:
# Make sure the output folder exists before the HTML file is written, so a
# fresh checkout without Data/Graph does not fail here.
os.makedirs('Data/Graph', exist_ok=True)

# Select the 'dynamic' edge smoothing type, then write and display the graph.
net.set_edge_smooth('dynamic')
net.show('Data/Graph/Expanded.html')

# 9. 将实体与关系保存到 CSV 文件，以便直接观察

In [62]:
# 1. All Entities versions
Rebel_Entities = []
CoreNLP_expanded_Entities = Entities[:]
Merged_Entities = net.get_nodes()
for e in kb.entities:
    Rebel_Entities.append(e)

In [63]:
# 2. All Relations versions
Rebel_Relations = []
CoreNLP_expanded_Relations = Relations[:]
for r in kb.relations:
    Rebel_Relations.append({"head":r["head"],"type":r["type"] ,"tail":r["tail"]})
Merged_Relations = Rebel_Relations + CoreNLP_expanded_Relations

In [66]:
# Print the three relation lists, one list per output line.
Rs = [Rebel_Relations, CoreNLP_expanded_Relations, Merged_Relations]
print(*map(str, Rs), sep="\n")

[{'head': 'John McCarthy*', 'type': 'occupation', 'tail': 'Computer scientist'}, {'head': 'John McCarthy*', 'type': 'award received', 'tail': 'Turing Award'}, {'head': 'John McCarthy*', 'type': 'award received', 'tail': 'United States National Medal of Science*'}, {'head': 'John McCarthy*', 'type': 'award received', 'tail': 'Kyoto Prize'}, {'head': 'John McCarthy*', 'type': 'date of birth', 'tail': 'September 4, 1927*'}, {'head': 'John McCarthy*', 'type': 'employer', 'tail': 'Stanford University'}, {'head': 'ALGOL', 'type': 'designed by', 'tail': 'John McCarthy*'}, {'head': 'Cromane', 'type': 'located in the administrative territorial entity', 'tail': 'County Kerry'}, {'head': 'Cromane', 'type': 'country', 'tail': 'Ireland'}, {'head': 'County Kerry', 'type': 'country', 'tail': 'Ireland'}, {'head': 'Ireland', 'type': 'contains administrative territorial entity', 'tail': 'County Kerry'}, {'head': 'John McCarthy*', 'type': 'member of political party', 'tail': 'Republican*'}, {'head': 'Ala

In [68]:
# 3. Write every relation / entity list to its own CSV file.
csv_dir = "Data/Entities_Relations"
# Create the output folder if it is missing so to_csv does not fail on a
# fresh checkout.
os.makedirs(csv_dir, exist_ok=True)

Rs = {"Rebel_Relations": Rebel_Relations,
      "CoreNLP_expanded_Relations": CoreNLP_expanded_Relations,
      "Merged_Relations": Merged_Relations}
Es = {"Rebel_Entities": Rebel_Entities,
      "CoreNLP_expanded_Entities": CoreNLP_expanded_Entities,
      "Merged_Entities": Merged_Entities}

# One loop for both groups (relations first, then entities): each list becomes
# a DataFrame and is saved as <name>.csv without the index column.
for name, lis in {**Rs, **Es}.items():
    df = pd.DataFrame(lis)
    df.to_csv(os.path.join(csv_dir, name + '.csv'), index=False)