In [1]:
# install and download spaCy related modules
!pip install --upgrade spacy
!python -m spacy download en_core_web_lg

# Google Drive
from google.colab import drive

# Firebase/Firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# BeautifulSoup
from bs4 import BeautifulSoup

# Tweepy
import tweepy

# general Python modules
import re
import json
import spacy
from spacy.language import Language
import requests
import datetime
from dateutil.relativedelta import relativedelta
from pprint import pprint
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.5.0
2023-02-13 15:44:25.814559: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (one



In [2]:
# Mount Google Drive into the Colab filesystem (`drive` is also imported in the first cell).
from google.colab import drive
drive.mount('/content/drive/')

# Root folder of the mounted drive; the asset and data paths used below are built on top of it.
DRIVE_PATH = "/content/drive/MyDrive"

# NOTE(review): the Firestore setup below is commented out, so no `db` client is
# created by this cell — confirm nothing further down still expects one.
# # open Firebase credentials
# with open(DRIVE_PATH + "/IE/ie_course_2022_team03/credentials/firebase_credentials.json") as f:
#   credential = json.load(f)
# credential = credentials.Certificate(credential)

# # create Firestore database instance
# firebase_admin.initialize_app(credential)
# db = firestore.client()
# print("Stablished access to Firestore")

Mounted at /content/drive/


In [3]:
def _load_json_asset(relative_path, description):
  """Load one JSON asset stored under DRIVE_PATH and report that it was read.

  Args:
    relative_path: path of the JSON file, relative to DRIVE_PATH (leading "/").
    description: human-readable name used in the "Retrieved ..." message.

  Returns:
    The decoded JSON content.
  """
  # Read explicitly as UTF-8 instead of relying on the platform default; this
  # matches how the notebook writes its own JSON output.
  # NOTE(review): assumes the asset files are UTF-8 encoded — confirm.
  with open(DRIVE_PATH + relative_path, encoding="utf-8") as f:
    data = json.load(f)
  print(f"Retrieved {description}")
  return data


# phrases handed to the PhraseMatcher
entities_gazetteers_list = _load_json_asset(
  "/IE/ie_course_2022_team03/assets/entities_gazetteers.json",
  "entities gazetteers list",
)

# matched texts that must not be turned into entities
entities_gazetteers_ignore_list = _load_json_asset(
  "/IE/ie_course_2022_team03/assets/entities_gazetteers_ignore_list.json",
  "entities gazetteers' ignore list",
)

# entity records (read downstream via "name", "aliases", "label" and "qid")
entities = _load_json_asset(
  "/IE/ie_course_2022_team03/assets/entities.json",
  "entities",
)

Retrieved entities gazetteers list
Retrieved entities gazetteers' ignore list
Retrieved entities


In [None]:
""" Custom pipeline Component: entities gazetteer function """

# Register the custom Span attributes once, up front. Extensions live on the
# Span class itself, so re-registering them for every document (via a throwaway
# Span) or for every matched entity is redundant. force=True keeps this cell
# safe to re-run.
for _extension_name in ("qid", "label", "wd_name"):
  Span.set_extension(_extension_name, default=None, force=True)


@Language.component("entities_gazetteer")
def entities_gazetteer(doc):
  """Mark gazetteer matches in `doc` as entities and attach their metadata.

  The module-level `matcher` is run over the document and overlapping matches
  are reduced with `spacy.util.filter_spans`. A match whose text is in
  `entities_gazetteers_ignore_list` is skipped (and reported). Every other
  match is looked up in `entities`: a record matches when the text equals its
  "name" or is contained in its "aliases".

  For each match with at least one record, a Span labelled with the first
  record's "label" is written into `doc.ents` (all other entities are left
  unmodified) and its custom attributes are filled:

    * `_.label`   from the records' "label"
    * `_.qid`     from the records' "qid"
    * `_.wd_name` from the records' "name"

  Each attribute holds a single value when exactly one record matched and a
  list of values when several records matched.

  Args:
    doc: the spaCy Doc being processed by the pipeline.

  Returns:
    The same Doc, with its entities updated.
  """
  # identify gazetteer matches and convert them to Span objects
  spans = [doc[start:end] for _, start, end in matcher(doc)]

  # loop over the non-overlapping matches only
  for match in spacy.util.filter_spans(spans):
    # skip if matched gazetteer is in ignore list
    if match.text in entities_gazetteers_ignore_list:
      print(f"-- Skipped '{match}' due it's in ignore list!")
      continue

    # usually only one record is found, but some gazetteers hit more than one
    matched_entities = [i for i in entities if match.text == i["name"] or match.text in i["aliases"]]
    if not matched_entities:
      continue

    first = matched_entities[0]
    entity = Span(doc, match.start, match.end, label=first["label"])

    # single value for a unique hit, list of values for an ambiguous one
    is_unique = len(matched_entities) == 1
    entity._.label = first["label"] if is_unique else [e["label"] for e in matched_entities]
    entity._.qid = first["qid"] if is_unique else [e["qid"] for e in matched_entities]
    entity._.wd_name = first["name"] if is_unique else [e["name"] for e in matched_entities]

    # modify the provided entity span, leaving the rest unmodified
    doc.set_ents([entity], default="unmodified")

  return doc

# Build the pipeline on top of the pretrained English (lg) model, leaving out
# the components listed in `exclude`.
nlp = spacy.load(
  "en_core_web_lg",
  exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
)

# rule-based sentence splitter, followed by the custom gazetteer component
# placed at the very end of the pipeline
nlp.add_pipe("sentencizer")
nlp.add_pipe("entities_gazetteer", last=True)

# rule-based spaCy phrase matcher sharing the pipeline's vocabulary
matcher = PhraseMatcher(nlp.vocab, None)

# every gazetteer phrase becomes one tokenized matcher pattern
patterns = list(map(nlp.make_doc, entities_gazetteers_list))
matcher.add("gazetteers", patterns)

# show the component names, then the detailed pipeline analysis
print(nlp.pipe_names)
pprint(nlp.analyze_pipes(pretty=True))

['ner', 'sentencizer', 'entities_gazetteer']
[1m

#   Component            Assigns               Requires   Scores          Retokenizes
-   ------------------   -------------------   --------   -------------   -----------
0   ner                  doc.ents                         ents_f          False      
                         token.ent_iob                    ents_p                     
                         token.ent_type                   ents_r                     
                                                          ents_per_type              
                                                                                     
1   sentencizer          token.is_sent_start              sents_f         False      
                         doc.sents                        sents_p                    
                                                          sents_r                    
                                                                                     
2  

### Test pipeline with one sentence

In [None]:
# Smoke test: run one sentence through the pipeline and list every entity found.
sentence = "POTUS 46 was born and raised in Scranton, Pennsylvania, and moved with his family to Delaware in 1953 when he was ten years old."
doc = nlp(sentence)
for ent in doc.ents:
  # text, character offsets, entity label, then the gazetteer attributes
  fields = (ent.text, ent.start_char, ent.end_char, ent.label_, ent._.qid, ent._.wd_name)
  print(*fields)

POTUS 46 0 8 PRODUCT None None
Scranton 32 40 GPE None None
Pennsylvania 42 54 GPE None None
Delaware 85 93 GPE None None
1953 97 101 DATE None None
ten years old 114 127 DATE None None


In [4]:
# List the scraped-data files with the standard library instead of shelling out
# to `ls`: the folder is derived from DRIVE_PATH rather than a second hard-coded
# absolute path, and a missing folder raises an error instead of turning the
# shell's error text into "file names".
import os

list_files = sorted(
  name
  for name in os.listdir(DRIVE_PATH + "/IE/ie_course_2022_team03/retrieved_data/scrapped_data")
  if not name.startswith(".")  # `ls` does not list dotfiles; keep that behaviour
)

In [None]:
# Collect every sentence of the scraped files that contains at least one
# gazetteer-linked entity. Each record is
#   [sentence_text, {"entities": [[start_char, end_char, qid, text, label_, _.label, wd_name], ...]}]
main_text_container = []
for file in list_files:
  print("Reading from file", file)
  # read explicitly as UTF-8 (the scraped text contains non-ASCII characters),
  # matching the encoding used when the results are written below
  with open(DRIVE_PATH + f"/IE/ie_course_2022_team03/retrieved_data/scrapped_data/{file}", encoding="utf-8") as f:
    file_data = json.load(f)
  text = file_data["text"]

  # the text is handled one paragraph at a time
  for paragraph in text:
    # Only the sentencizer is needed to split a paragraph; temporarily disabling
    # the other components avoids running NER and the gazetteer twice on the
    # same text (they run on each sentence just below).
    with nlp.select_pipes(enable=["sentencizer"]):
      sentences = [sent.text for sent in nlp(paragraph).sents]

    # process the sentences as a batched stream through the full pipeline
    for doc in nlp.pipe(sentences, batch_size=50):
      sent = doc.text  # sentence
      # keep only the entities that the gazetteer linked to a qid
      linked_entities = [ent for ent in doc.ents if ent._.qid]
      if not linked_entities:
        continue

      chosen_sentence = [
        sent,
        {
          "entities": [
            [ent.start_char, ent.end_char, ent._.qid, ent.text, ent.label_, ent._.label, ent._.wd_name]
            for ent in linked_entities
          ]
        },
      ]
      main_text_container.append(chosen_sentence)

      print()
      print(f"++ {sent}")
      for ent in linked_entities:
        print(f"    {ent.text, ent.start_char, ent.end_char, ent.label_, ent._.label, ent._.qid, ent._.wd_name}")
      print()

# save record in JSON file (skipped when nothing was extracted)
if len(main_text_container):
  with open(DRIVE_PATH + "/IE/ie_course_2022_team03/output/extracted_sentences.json", "w", encoding="utf-8") as f:
    json.dump(main_text_container, f, ensure_ascii=False, indent=2)
  print()
  print(f"Saved {len(main_text_container)} extracted_sentences")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    ('Berlin', 2, 8, 'GPE', 'GPE', 'Q64', 'Berlin')


++ 
          © Berlin Tourismus & Kongress GmbH
        
    ('Berlin', 13, 19, 'GPE', 'GPE', 'Q64', 'Berlin')

Reading from file visitberlindeenticketseventsattractionsberlin.json

++ 

Berlin Unboxed: The visitBerlin Podcast




    ('Berlin', 2, 8, 'GPE', 'GPE', 'Q64', 'Berlin')


++ 

Museums in Berlin




    ('Berlin', 13, 19, 'GPE', 'GPE', 'Q64', 'Berlin')


++ 

4 days Berlin from €190




    ('Berlin', 9, 15, 'GPE', 'GPE', 'Q64', 'Berlin')


++ 
Your entrance tickets to Berlin

    ('Berlin', 26, 32, 'GPE', 'GPE', 'Q64', 'Berlin')


++ Find top experiences in Berlin
    ('Berlin', 24, 30, 'GPE', 'GPE', 'Q64', 'Berlin')


++ 
Enjoy a unique view over the Skyline of Berlin

    ('Berlin', 41, 47, 'GPE', 'GPE', 'Q64', 'Berlin')


++ 
Explore Berlin with popular bus tours in Berlin

    ('Berlin', 9, 15, 'GPE', 'GPE', 'Q64', 'Berlin')
    ('Berlin', 42, 48, 'GPE