# Automatic corpus building using Wikipedia

## #1. Setup development environment

### Update & import Python modules

In [1]:
# install and download spaCy related modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg # using small model (sm)
!pip install wikipedia
!pip install bs4

# spaCy
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Beautiful Soup
from bs4 import BeautifulSoup

# Wikipedia API
import wikipedia

# general Python modules
import json
import datetime
import requests
from pprint import pprint
import re

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.5.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.

### Get access to Firebase and Drive 

In [2]:
# remount drive, forced if needed
drive.mount("/content/gdrive/", force_remount = True)
print("Stablished access to Google Drive")

# initialize Drive path
DRIVE_PATH = "/content/gdrive/My Drive"

# open Firebase credentials
with open(DRIVE_PATH + "/IE/ie_course_2022_team03/credentials/firebase_credentials.json") as f:
  credential = json.load(f)
credential = credentials.Certificate(credential)

# create Firestore database instance
firebase_admin.initialize_app(credential)
db = firestore.client()
print("Stablished access to Firestore")

Mounted at /content/gdrive/
Stablished access to Google Drive
Stablished access to Firestore


In [3]:
# Wikipedia slug of entity
entity_slug_list = ["Berlin", "Munich", "Germany", "art_museum", "Bavaria"]

# Wikidata id of entity
qids = ["Q64", "Q1726", "Q183", "Q207694", "Q707669"]

In [4]:
for entity_slug in entity_slug_list: 

  # parse text from a Wikipedia page, from p elements
  r = requests.get(f"https://en.wikipedia.org/wiki/{entity_slug}")
  soup = BeautifulSoup(r.text, "html.parser")
  p_els = soup.find_all("p")
  text = [p.text for p in p_els]

  # basic text preprocessing
  processed_text = []
  for p in text:
    p = p.replace("\n", "") # remove new line chars
    p = p.lstrip() # remove leading blank spaces
    p = p.rstrip() # remove trailing blank space
    if p == "": # ignore empty paragraphs
      continue
    # remove citation numbers [x]
    regex_wikipedia_citation = "(\[\d+(,\s?\d+|\d*-\d+)*\])"
    loops = 0
    while loops < 6:
      loops += 1
      match = re.search(regex_wikipedia_citation, p)
      if match:
        string = match.group()
        p = p.replace(string, "")

    processed_text.append(p)
  text = processed_text

  # initialize spaCY pipeline and container of sentences
  nlp = spacy.load("en_core_web_lg")
  sentences_container = []

  # split text into sentences
  for index, paragraph in enumerate(text):
    # split paragraph in sentences
    sentences = [sent.text for sent in nlp(paragraph).sents]
    sentences_container.extend(sentences)

  # save record in JSON file
  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/output/{entity_slug.lower()}_context_texts.json", "w", encoding = "utf-8") as f:
    json.dump(sentences_container, f, ensure_ascii = False, indent = 2)
    print(f"Saved {len(sentences_container)} context sentences")

Saved 632 context sentences
Saved 567 context sentences
Saved 395 context sentences
Saved 106 context sentences
Saved 290 context sentences


In [5]:
""" Retrieve entity info from Wikidata and make a list of aliases, by combining label + aliases """

for qid, entity_slug in zip(qids, entity_slug_list):

  # fetch entity info from the Wikidata API (entity endpoint)
  api_url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
  r = requests.get(api_url, params={"format": "json"})
  # simplify access to root elements of JSON object
  entity_info = r.json()["entities"][f"{qid}"]

  # get entity aliases
  if entity_info["aliases"].get("en"):
    aliases = [a["value"] for a in entity_info["aliases"]["en"]] if entity_info["aliases"].get("en") else []

  # create container of gazetteers
  gazetteers = aliases

  # get entity name
  if entity_info["labels"].get("en"):
    gazetteers.append(entity_info["labels"]["en"]["value"])

  # get last name
  last_name = entity_info["labels"]["en"]["value"].split()[-1]
  gazetteers.append(last_name)


  pprint(gazetteers)


  # initialize spaCY phrase matcher (rule-based)
  matcher = PhraseMatcher(nlp.vocab, None)
  # load issues as gazetteers
  patterns = [nlp.make_doc(g) for g in gazetteers]
  matcher.add("gazetteers", patterns)

  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/output/{entity_slug.lower()}_context_texts.json") as f:
    text = json.load(f)

  main_text_container = []

  for index, paragraph in enumerate(text):
    # split paragraph in sentences
    sentences = [sent.text for sent in nlp(paragraph).sents]

    # instance a pipeline to process sentences individually
    disabled_pipelines = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]
    for doc in nlp.pipe(sentences, batch_size=50, disable=disabled_pipelines):
      sent = doc.text  # sentence

      # identify gazetteer contained in Doc object (text)
      gazetteers = matcher(doc)
      # convert gazetteers as spans
      gazetteers = [doc[start:end] for _, start, end in gazetteers]
      # filter overlaping matches (spans) - keep gazetteers uniqueness
      filtered_matches = spacy.util.filter_spans(gazetteers)

      # filter sentences with gazetteers occurrences
      sentence_data = []
      if len(filtered_matches):
        sentence_data.append(sent)
        entities = []
        for m in filtered_matches:
          span = doc[m.start:m.end]  # identify span
          matched_gazetteer = span.text
          match_info = (span.start_char, span.end_char, "ENTITY")
          entities.append(match_info)
        sentence_data.append({"entities": entities})
        main_text_container.append(sentence_data)

  # save record in JSON file
  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/output/{entity_slug.lower()}_ner_corpus.json", "w", encoding = "utf-8") as f:
    json.dump(main_text_container, f, ensure_ascii = False, indent = 2)
    print(f"Saved {len(main_text_container)} annotated sentences")

['Berlin, Germany', 'Berlin (Germany)', 'DE-BE', 'Berlin', 'Berlin']
Saved 311 annotated sentences
['München',
 'Munchen',
 'Muenchen',
 'Minga',
 'Monachium',
 'Münich',
 'Munich',
 'Munich']
Saved 271 annotated sentences
['Federal Republic of Germany',
 'Deutschland',
 'GER',
 'BR Deutschland',
 'DE',
 'BRD',
 'Bundesrepublik Deutschland',
 'de',
 'GFR',
 'Germany',
 'Germany']
Saved 148 annotated sentences
['museum of art', 'art museums', 'museums of art', 'art museum', 'museum']
Saved 33 annotated sentences
['Bavaria statue', 'Bavaria', 'Bavaria']
Saved 91 annotated sentences
