In [1]:
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import regex as re
import nltk
stops = set(stopwords.words('english'))


####################
####  tokenizor ####
####################
## using nltk tokenization library to split txt into tokens
## clean up the data by removing punctuation, regex, numbers, lemmatization
## add the bigrams by calling the function
## find and sort the token frequency

def tokenizor(text):
    soup = BeautifulSoup(text, 'html.parser')
    paragraphs = soup.find_all(['p', 'a', 'li', 'span', 'h1', 'h2', 'h3', 'title'])
    cleaned_tokens = []

    for p in paragraphs:
        cleaned_paragraphs = p.get_text()

        ## clean up the texts from numbers, punctuation
        remove = re.sub('<.+?>|\n', '', str(cleaned_paragraphs))
        remove = remove.replace('\\n', '')
        remove = re.sub(r'[^\w\s]|[0-9]', '', remove.strip().lower())

        ## tokenize and lemmatize
        tokens = word_tokenize(remove)

        wnl = WordNetLemmatizer()
        lemmatized_tokens = []
        for t in tokens:
            lemmatized_token = wnl.lemmatize(t)
            lemmatized_tokens.append(lemmatized_token)
            cleaned_tokens += [t for t in lemmatized_tokens if t not in stops]
    # return (cleaned_tokens)

    the_frequency = [(token, cleaned_tokens.count(token)) for token in set(cleaned_tokens)]
    the_frequency = sorted(the_frequency, key=lambda x: x[1], reverse=True)
    return(the_frequency)


In [2]:
########################################
####  single term ranked retrieval  ####
########################################
import json 
import os

filename = os.listdir('./ueapeople/')

vocab = {}
docIDs = {}
postings = {}
nextvocabID = 0
nextdocID = 0

for f_name in filename:
    path = "./ueapeople/" + f_name
    f = open(path, "r")
    content = f.read()
    f.close()

    docIDs[nextdocID] = f_name
    tokenfreq = tokenizor(content)

    for t, freq in tokenfreq: 
        if t in vocab.keys(): 
            vocabID = vocab[t]
        else: 
            vocab[t] = int(nextvocabID)
            vocabID = nextvocabID
            nextvocabID += 1

        if vocabID in postings.keys():
            postings[vocabID][nextdocID] = freq 
        else:
            postings[vocabID] = {nextdocID: freq} 

    nextdocID += 1

In [3]:
with open('vocab.txt', 'w') as f: 
    json.dump(vocab, f)
with open('postings.txt', 'w') as f: 
    json.dump(postings, f)
with open('docids.txt', 'w') as f: 
    json.dump(docIDs, f)
    
with open('vocab.txt', 'r') as f: 
    vocab = json.load(f)
with open('postings.txt', 'r') as f: 
    postings = json.load(f)
with open('docids.txt', 'r') as f: 
    docIDs = json.load(f)

In [4]:
def single_search_engine():
    while True:
        query = input("What would you like to query? (type quit to exit): ")
        if query == "quit": 
            print('You have quit the query.')
            break
        final = []
        if query in vocab:
            vocabID = vocab[query]
            
            for d, freq in postings[str(vocabID)].items():
                final.append((docIDs[str(d)], freq))
                final.sort(key = lambda e: e[1], reverse=True)
                [print(f'{doc} for {freq} times') for doc, freq in final]
        else:
            print('Term not found.')
            
single_search_engine()

What would you like to query? (type quit to exit): jason
adam-dawson.html for 7 times
adam-dawson.html for 7 times
anja-mueller.html for 4 times
colin-kay.html for 39 times
adam-dawson.html for 7 times
anja-mueller.html for 4 times
colin-kay.html for 39 times
adam-dawson.html for 7 times
anja-mueller.html for 4 times
david-w-j-thompson.html for 4 times
helen-pallett.html for 40 times
colin-kay.html for 39 times
adam-dawson.html for 7 times
anja-mueller.html for 4 times
david-w-j-thompson.html for 4 times
helen-pallett.html for 40 times
colin-kay.html for 39 times
jason-chilvers.html for 8 times
adam-dawson.html for 7 times
anja-mueller.html for 4 times
david-w-j-thompson.html for 4 times
jason-corner.html for 60 times
helen-pallett.html for 40 times
colin-kay.html for 39 times
jason-chilvers.html for 8 times
adam-dawson.html for 7 times
anja-mueller.html for 4 times
david-w-j-thompson.html for 4 times
jason-corner.html for 60 times
helen-pallett.html for 40 times
colin-kay.html for 39 

In [56]:
#######################################
####  multi-term ranked retrieval  ####
#######################################
import math

with open('vocab.txt', 'r') as f: vocab = json.load(f)
with open('postings.txt', 'r') as f: postings = json.load(f)
with open('docids.txt', 'r') as f: docIDs = json.load(f)


def find_frequency(tID, dID):
    tID, dID = str(tID), str(dID)
    if dID in postings[tID]:
        return postings[tID][dID]
    else:
        return 0
    
def multi_term_search_engine():
    while True:
        queries = input('Enter terms with space (type quit to exit): ')
        if queries == 'quit':
            print('You have quit the query.')
            break
            
        docids = {}
        query_tID = []
            
        queries = queries.split(" ") 
        for query in queries:
            if query in vocab:
                vocabID = vocab[query]
                query_tID.append(vocabID)
                document = postings[str(vocabID)]
                docids = {**docids, **document}

        if len(query_tID) >= 1:
            final = []
            for did, freq in docids.items():
                final_score = 0
                for tid in query_tID:
                    tf = find_frequency(tid, did)
                    N = len(docIDs)
                    df = len(postings[str(tid)])
                    idf = math.log(N/df)

                    final_score += tf*idf
                final.append((docIDs[str(did)], final_score))

            final.sort(reverse=True, key= lambda e: e[1])
            print(len(final))
            [print(f"Page: {doc[0]}, TF-IDF score: {doc[1]}") for doc in final]
            
        
        else:
            final = query_tID
        
        print()
        
    
multi_term_search_engine()
        

Enter terms with space (type quit to exit): marks school of pharmacy
933
Page: debi-bhattacharya.html, TF-IDF score: 151.29158498711394
Page: lindsay-morgan.html, TF-IDF score: 101.38075544283679
Page: catherine-heywood.html, TF-IDF score: 73.19529677362587
Page: harpreet-bassi.html, TF-IDF score: 67.76395405485933
Page: melanie-boughen.html, TF-IDF score: 56.402874820852496
Page: mark-searcey.html, TF-IDF score: 53.655245979038526
Page: miriam-craske.html, TF-IDF score: 45.54018938331936
Page: michael-twigg.html, TF-IDF score: 40.67115542588787
Page: james-desborough.html, TF-IDF score: 39.57849538564842
Page: gemma-may.html, TF-IDF score: 35.2398127071213
Page: andrea-nunney.html, TF-IDF score: 31.993790068833636
Page: tim-rowland.html, TF-IDF score: 27.591192425445147
Page: bridget-hemmant.html, TF-IDF score: 27.012401167374307
Page: nigel-norris.html, TF-IDF score: 26.000138588732003
Page: terry-haydn.html, TF-IDF score: 24.345169787157484
Page: sabine-jacques.html, TF-IDF score: 2

Enter terms with space (type quit to exit): quit
You have quit the query.


In [54]:
###########################################################
####  Upweighting document zone in tokenizor function  ####
###########################################################

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import regex as re
stops = set(stopwords.words('english'))

def weight_tokenizor(text):
    soup = BeautifulSoup(text, 'html.parser')
    paragraphs = soup.find_all('p')
    headers = soup.find_all('h1')
    headers += soup.find_all('h2')
    headers += soup.find_all('h3')
    titles = soup.find_all('title')
    
    cleaned_tokens = []
    for p in paragraphs:
        cleaned_paragraphs = p.get_text()
        
        ## clean up the texts eg. numbers, punctuation
        remove = re.sub('<.+?>|\n', '', str(cleaned_paragraphs))
        remove = remove.replace('\\n', '')
        remove = re.sub(r'[^\w\s]|[0-9]', '', remove.strip().lower())
        
        tokens = word_tokenize(remove)
        cleaned_tokens += [t for t in tokens if t not in stops]
    p_freq = [(token, cleaned_tokens.count(token)) for token in set(cleaned_tokens)]

        
        
    cleaned_tokens = []
    for h in headers:
        cleaned_headers = p.get_text()
        
        ## clean up the texts from headers
        remove = re.sub('<.+?>|\n', '', str(cleaned_headers))
        remove = remove.replace('\\n', '')
        remove = re.sub(r'[^\w\s]|[0-9]', '', remove.strip().lower())

        tokens = word_tokenize(remove)
        cleaned_tokens += [t for t in tokens if t not in stops]
    h_freq = [(token, cleaned_tokens.count(token)) for token in set(cleaned_tokens)]  
        
        
    cleaned_tokens = []
    for t in titles:
        cleaned_titles = p.get_text()
        
        ## clean up the text from title
        remove = re.sub('<.+?>|\n', '', str(cleaned_titles))
        remove = remove.replace('\\n', '')
        remove = re.sub(r'[^\w\s]|[0-9]', '', remove.strip().lower())
        tokens = word_tokenize(remove)
        cleaned_tokens += [t for t in tokens if t not in stops]
    t_freq = [(token, cleaned_tokens.count(token)) for token in set(cleaned_tokens)]

    return {'p': p_freq, 'h': h_freq, 't': t_freq}

In [7]:
##########################################################
####  adding weighting from different document zones  ####
##########################################################

import json 
import os

filename = os.listdir('./ueapeople/')

vocab = {}
docIDs = {}
postings = {}
nextvocabID = 0
nextdocID = 0

weighting = {'p': 1, 'h': 2, 't': 3}

for f_name in filename:
    path = "./ueapeople/" + f_name
    f = open(path, "r")
    content = f.read()
    f.close()

    docIDs[nextdocID] = f_name
    token_zone = weight_tokenizor(content)

    for zone, tokenfreq in token_zone.items(): 
        for t, freq in tokenfreq:
            if t in vocab.keys(): 
                vocabID = vocab[t]
            else: 
                vocab[t] = int(nextvocabID)
                vocabID = nextvocabID
                nextvocabID += 1

            if vocabID in postings.keys():
                postings[vocabID][nextdocID] = freq * weighting[zone]
            else:
                postings[vocabID] = {nextdocID: freq * weighting[zone]} 

    nextdocID += 1

In [42]:
token_zone

{'p': [('received', 1),
  ('study', 2),
  ('state', 1),
  ('committee', 1),
  ('wwwesrcacuksearchsearchpageaspxqdrzografiabika', 1),
  ('esrc', 3),
  ('also', 6),
  ('training', 3),
  ('firm', 2),
  ('bibliometrics', 1),
  ('englandxcxa', 1),
  ('east', 3),
  ('interests', 1),
  ('june', 1),
  ('england', 5),
  ('universitas', 1),
  ('activities', 1),
  ('grant', 1),
  ('r', 1),
  ('principles', 1),
  ('fund', 2),
  ('disability', 1),
  ('businesses', 4),
  ('awarded', 3),
  ('sustainable', 1),
  ('grounds', 1),
  ('scottish', 3),
  ('msc', 2),
  ('publications', 1),
  ('influence', 1),
  ('member', 4),
  ('service', 1),
  ('prospective', 1),
  ('cap', 1),
  ('espon', 1),
  ('practitioners', 1),
  ('participating', 3),
  ('current', 1),
  ('receiving', 1),
  ('sociology', 2),
  ('thexcxainternational', 1),
  ('optivo', 1),
  ('professors', 1),
  ('work', 4),
  ('management', 6),
  ('bradford', 1),
  ('pasdecalais', 1),
  ('leading', 2),
  ('un', 1),
  ('enterprise', 2),
  ('employmentx

In [44]:
postings

{'0': {'0': 1, '1171': 1, '3422': 3},
 '1': {'0': 1,
  '170': 2,
  '183': 1,
  '296': 1,
  '315': 1,
  '633': 1,
  '787': 1,
  '820': 1,
  '837': 2,
  '1009': 1,
  '1244': 1,
  '1448': 2,
  '1470': 1,
  '1473': 1,
  '1481': 1,
  '1487': 1,
  '1557': 2,
  '1926': 1,
  '2004': 1,
  '2011': 1,
  '2183': 1,
  '2222': 1,
  '2249': 1,
  '2300': 1,
  '2312': 1,
  '2364': 1,
  '2492': 1,
  '2503': 1,
  '2594': 1,
  '2598': 1,
  '2684': 1,
  '2792': 1,
  '2840': 1,
  '2859': 7,
  '3174': 3,
  '3240': 1,
  '3242': 1,
  '3279': 1,
  '3309': 1,
  '3332': 1,
  '3384': 1,
  '3438': 1,
  '3504': 1,
  '3665': 1,
  '3682': 1,
  '3685': 2,
  '3787': 1,
  '4009': 2,
  '4030': 1,
  '4105': 4,
  '4277': 1,
  '4381': 1,
  '4393': 2,
  '4396': 1,
  '4413': 1,
  '4429': 1,
  '4495': 2,
  '4498': 1,
  '4533': 1,
  '4553': 1},
 '2': {'0': 1,
  '9': 1,
  '50': 1,
  '63': 2,
  '99': 1,
  '108': 2,
  '110': 2,
  '116': 3,
  '123': 1,
  '198': 1,
  '201': 1,
  '237': 4,
  '266': 1,
  '280': 1,
  '283': 2,
  '293': 

In [8]:
with open('vocab.txt', 'w') as f: 
    json.dump(vocab, f)
with open('postings.txt', 'w') as f: 
    json.dump(postings, f)
with open('docids.txt', 'w') as f: 
    json.dump(docIDs, f)
    
with open('vocab.txt', 'r') as f: 
    vocab = json.load(f)
with open('postings.txt', 'r') as f: 
    postings = json.load(f)
with open('docids.txt', 'r') as f: 
    docIDs = json.load(f)

In [9]:
import math

def find_frequency(tID, dID):
    tID, dID = str(tID), str(dID)
    if dID in postings[tID]:
        return postings[tID][dID]
    else:
        return 0

def weight_search_engine():
    while True:
            query = input('What would you like to query? (type quit to exit): ')
            if query == 'quit':
                print('You have quit the query.')
                break

            docids = {}
            query_tID = []

            query = query.split(" ") 
            for q in query:
                if q in vocab:
                    vocabID = vocab[q]
                    query_tID.append(vocabID)
                    document = postings[str(vocabID)]
                    docids = {**docids, **document}

            final = []
            for did, freq in docids.items():
                final_score = 0
                for tid in query_tID:
                    tf = math.log(1 + find_frequency(tid, did))
                    N = len(docIDs)
                    df = len(postings[str(tid)])
                    idf = math.log(N/df)

                    final_score += tf*idf
                final.append((docIDs[str(did)], final_score))

            final.sort( key= lambda e: e[1], reverse=True)
            print(len(final))
            [print(f"Page: {doc[0]}, TF-IDF score: {doc[1]}") for doc in final]


weight_search_engine()

What would you like to query? (type quit to exit): jason lines
35
Page: jason-lines.html, TF-IDF score: 18.129229270225245
Page: tony-bagnall.html, TF-IDF score: 14.989198745157248
Page: tim-rowland.html, TF-IDF score: 7.80783406262402
Page: nick-le-brun.html, TF-IDF score: 6.753138372739455
Page: oli-buckley.html, TF-IDF score: 5.621985828372938
Page: rita-ramos.html, TF-IDF score: 5.621985828372938
Page: will-rossiter.html, TF-IDF score: 5.621985828372938
Page: adam-dawson.html, TF-IDF score: 4.260755929346457
Page: colin-kay.html, TF-IDF score: 4.260755929346457
Page: helen-pallett.html, TF-IDF score: 4.260755929346457
Page: jason-corner.html, TF-IDF score: 4.260755929346457
Page: maria-oconnell.html, TF-IDF score: 4.260755929346457
Page: nem-vaughan.html, TF-IDF score: 4.260755929346457
Page: alison-bayton.html, TF-IDF score: 3.547078133277563
Page: andy-day.html, TF-IDF score: 3.547078133277563
Page: beatriz-de-la-iglesia.html, TF-IDF score: 3.547078133277563
Page: ben-milner.html

In [15]:
from sklearn.metrics import precision_score

actual_positives = [1 for _ in range(100)]
actual_negatives = [0 for _ in range(10000)]
y_true = actual_positives + actual_negatives

predicted_p = [0 for _ in range(10)] + [1 for _ in range(90)]
predicted_n = [1 for _ in range(30)] + [1 for _ in range(9970)]
y_predicted = predicted_p + predicted_n

precision = precision_score(y_true, y_predicted, average='binary')

print('precision calculated as: %3f' % precision)

precision calculated as: 0.008920


In [43]:
from sklearn.metrics import recall_score

actual_positives = [1 for _ in range(100)]
actual_negatives = [0 for _ in range(10000)]
y_true = actual_positives + actual_negatives

predicted_p = [0 for _ in range(10)] + [1 for _ in range(90)]
predicted_n = [0 for _ in range(10000)]
y_predicted = predicted_p + predicted_n

recall = recall_score(y_true, y_predicted, average='binary')

print('recall calculated as: %3f' % recall)

recall calculated as: 0.900000


In [12]:
len(docIDs)

4673

In [16]:
from sklearn.metrics import classification_report

In [29]:
print(classification_report(y_true, y_predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10000
           1       1.00      0.90      0.95       100

    accuracy                           1.00     10100
   macro avg       1.00      0.95      0.97     10100
weighted avg       1.00      1.00      1.00     10100



In [144]:
##############################################
####  query expansion and name entitites  ####
##############################################

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [46]:
from bs4 import BeautifulSoup
import regex as re
stops = set(stopwords.words('english'))

def new_tokenizor(text):
    soup = BeautifulSoup(text, 'html.parser')
    paragraphs = soup.find_all('p')
    cleaned_tokens = []

    for p in paragraphs:
        cleaned_paragraphs = p.get_text()

        ## clean up the texts from numbers, punctuation
        remove = re.sub('<.+?>|\n', '', str(cleaned_paragraphs))
        remove = remove.replace('\\n', '')
        remove = re.sub(r'[^\w\s]|[0-9]', '', remove.strip().lower())

        ## tokenize 
        tokens = word_tokenize(remove)
        cleaned_tokens += [t for t in tokens if t not in stops]
    return cleaned_tokens


In [50]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
nltk.download('maxent_ne_chunker')
stops = set(stopwords.words('english'))
nltk.download('words')
from nltk import pos_tag

for f_name in filename:
    path = "./ueapeople/" + f_name
    f = open(path, "r")
    content = f.read()
    f.close()

    docIDs[nextdocID] = f_name
    posToken = new_tokenizor(content)
    
    ## tag different part of speech
    pos_tagging = pos_tag(posToken) 
    
    ## remove stopword in data
    non_stop = [(pair[0], pair[1]) for pair in pos_tagging if pair[0] not in stops]
#     print(non_stop[:1])

    ## store all noun in new list
    noun = [x for x, tag in non_stop if tag[0:2] == 'NN']
#     print(noun)

    ## grouping of synonymous words from noun
    synsets = [wordnet.synsets(y) for y in noun]
#     print(synsets)
    
    ## lemmatize to root word
    lemmasets = [syn.lemmas() for s in synsets for syn in s]
#     print(lemmasets)

    ## nltk named entites
    for sent in nltk.sent_tokenize(content):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                print(chunk.label(), ''.join(c[0] for c in chunk))

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


ORGANIZATION DOCTYPE
ORGANIZATION UniversityofEastAnglia
ORGANIZATION UniversityofEastAnglia
ORGANIZATION OneTrustCookiesConsent
ORGANIZATION OptanonWrapper
ORGANIZATION OneTrustCookiesConsent
PERSON Person
GPE Pharmacy
ORGANIZATION sameAs
GPE Pharmacy
ORGANIZATION sameAs
ORGANIZATION GoogleAnalytics
PERSON Skip
PERSON Skip
PERSON Skip
GPE East
ORGANIZATION AngliaHome
ORGANIZATION UniversityofEastAngliaLogo
ORGANIZATION subMenu
GPE Main
PERSON Home
ORGANIZATION Organisations
ORGANIZATION searchForm
ORGANIZATION searchField
ORGANIZATION searchForm
PERSON Ganesan
ORGANIZATION ChemicalBiology
ORGANIZATION School
ORGANIZATION Pharmacy
GPE Chemical
ORGANIZATION School
ORGANIZATION Pharmacy
PERSON Phone
PERSON Email
ORGANIZATION Chemistry
PERSON Person
ORGANIZATION subMenu
PERSON Overview
PERSON Network
ORGANIZATION pollTimer
ORGANIZATION pollTimer
PERSON DisplayToast
ORGANIZATION setTimeout
GPE Pure
PERSON Biography
PERSON Click
ORGANIZATION PhD
ORGANIZATION PHA
PERSON Ganesan
ORGANIZATION 

PERSON Pure
GPE EastAnglia
PERSON PurePortal
ORGANIZATION UniversityofEastAnglia
ORGANIZATION ipAddress
ORGANIZATION accessType
PERSON AaronBostrom
ORGANIZATION recordType
ORGANIZATION pageData
ORGANIZATION pageDataTracker
ORGANIZATION DOCTYPE
PERSON Aaron
ORGANIZATION UniversityofEastAnglia
ORGANIZATION UniversityofEastAnglia
PERSON AaronBurgess
ORGANIZATION OneTrustCookiesConsent
ORGANIZATION OptanonWrapper
ORGANIZATION OneTrustCookiesConsent
PERSON Person
PERSON AaronBurgess
PERSON NorwichMedicalSchool
ORGANIZATION GoogleAnalytics
PERSON Skip
PERSON Skip
PERSON Skip
GPE East
ORGANIZATION AngliaHome
ORGANIZATION UniversityofEastAngliaLogo
ORGANIZATION subMenu
GPE Main
PERSON Home
ORGANIZATION Organisations
ORGANIZATION searchForm
ORGANIZATION searchField
ORGANIZATION searchForm
PERSON AaronBurgess
PERSON AaronBurgess
PERSON ClinicalLecturer
PERSON ClinicalPsychology
PERSON NorwichMedicalSchool
PERSON Email
PERSON Person
ORGANIZATION subMenu
PERSON Overview
PERSON Network
ORGANIZATION

PERSON Pure
GPE EastAnglia
PERSON PurePortal
ORGANIZATION UniversityofEastAnglia
ORGANIZATION ipAddress
ORGANIZATION accessType
PERSON AbassIsiaka
ORGANIZATION recordType
ORGANIZATION pageData
ORGANIZATION pageDataTracker
ORGANIZATION DOCTYPE
ORGANIZATION UniversityofEastAnglia
ORGANIZATION UniversityofEastAnglia
PERSON AbbyKidd
ORGANIZATION OneTrustCookiesConsent
ORGANIZATION OptanonWrapper
ORGANIZATION OneTrustCookiesConsent
PERSON Person
PERSON AbbyKidd
GPE Art
GPE Media
ORGANIZATION AmericanStudies
ORGANIZATION sameAs
ORGANIZATION GoogleAnalytics
PERSON Skip
PERSON Skip
PERSON Skip
GPE East
ORGANIZATION AngliaHome
ORGANIZATION UniversityofEastAngliaLogo
ORGANIZATION subMenu
GPE Main
PERSON Home
ORGANIZATION Organisations
ORGANIZATION searchForm
ORGANIZATION searchField
ORGANIZATION searchForm
PERSON AbbyKidd
ORGANIZATION Philosophy
ORGANIZATION School
GPE Art
GPE Media
ORGANIZATION AmericanStudies
PERSON Email
ORGANIZATION AMA
PERSON Person
ORGANIZATION subMenu
PERSON Overview
PERS

PERSON Person
PERSON AbdulkarimAlsulami
PERSON NorwichBusinessSchool
ORGANIZATION sameAs
ORGANIZATION BusinessRegulationGroup
ORGANIZATION sameAs
ORGANIZATION GoogleAnalytics
PERSON Skip
PERSON Skip
PERSON Skip
GPE East
ORGANIZATION AngliaHome
ORGANIZATION UniversityofEastAngliaLogo
ORGANIZATION subMenu
GPE Main
PERSON Home
ORGANIZATION Organisations
ORGANIZATION searchForm
ORGANIZATION searchField
ORGANIZATION searchForm
PERSON AbdulkarimAlsulami
PERSON AbdulkarimAlsulami
ORGANIZATION Philosophy
PERSON NorwichBusinessSchool
PERSON Email
ORGANIZATION NBS
PERSON Person
ORGANIZATION subMenu
PERSON Overview
ORGANIZATION pollTimer
ORGANIZATION pollTimer
PERSON DisplayToast
ORGANIZATION setTimeout
GPE Pure
PERSON Biography
GPE Calibri
GPE SaudiArabia
GPE College
GPE Business
PERSON JeddahUniversity
ORGANIZATION UniversityofEastAnglia
GPE Norwich
ORGANIZATION NBS
PERSON JennyFairbrass
PERSON TiagoBotelho
GPE Calibri
GPE Calibri
GPE Texas
GPE UnitedStates
GPE Calibri
GPE Calibri
GPE Saudi
ORG

PERSON Person
GPE Pharmacy
ORGANIZATION sameAs
ORGANIZATION GoogleAnalytics
PERSON Skip
PERSON Skip
PERSON Skip
GPE East
ORGANIZATION AngliaHome
ORGANIZATION UniversityofEastAngliaLogo
ORGANIZATION subMenu
GPE Main
PERSON Home
ORGANIZATION Organisations
ORGANIZATION searchForm
ORGANIZATION searchField
ORGANIZATION searchForm
ORGANIZATION AbdulmajeedAlraies
ORGANIZATION Philosophy
ORGANIZATION School
ORGANIZATION Pharmacy
PERSON Email
ORGANIZATION PHA
PERSON Person
ORGANIZATION subMenu
PERSON Overview
ORGANIZATION pollTimer
ORGANIZATION pollTimer
PERSON DisplayToast
ORGANIZATION setTimeout
GPE Pure
PERSON Share
PERSON Share
PERSON Share
ORGANIZATION LinkedIn
ORGANIZATION plumXError
ORGANIZATION AltMetric
ORGANIZATION PlumX
ORGANIZATION PlumX
ORGANIZATION AltMetric
ORGANIZATION UniversityofEastAngliaLogo
PERSON Scopus
PERSON ElsevierFingerprintEngine
PERSON Elsevier
PERSON Pure
GPE EastAnglia
PERSON PurePortal
ORGANIZATION UniversityofEastAnglia
ORGANIZATION ipAddress
ORGANIZATION access

PERSON Person
PERSON AbhayBagewadi
PERSON NorwichMedicalSchool
ORGANIZATION GoogleAnalytics
PERSON Skip
PERSON Skip
PERSON Skip
GPE East
ORGANIZATION AngliaHome
ORGANIZATION UniversityofEastAngliaLogo
ORGANIZATION subMenu
GPE Main
PERSON Home
ORGANIZATION Organisations
ORGANIZATION searchForm
ORGANIZATION searchField
ORGANIZATION searchForm
PERSON AbhayBagewadi
PERSON HonorarySeniorLecturer
PERSON NorwichMedicalSchool
PERSON Email
PERSON Person
ORGANIZATION subMenu
PERSON Overview
ORGANIZATION pollTimer
ORGANIZATION pollTimer
PERSON DisplayToast
ORGANIZATION setTimeout
GPE Pure
PERSON Share
PERSON Share
PERSON Share
ORGANIZATION LinkedIn
ORGANIZATION plumXError
ORGANIZATION AltMetric
ORGANIZATION PlumX
ORGANIZATION PlumX
ORGANIZATION AltMetric
ORGANIZATION UniversityofEastAngliaLogo
PERSON Scopus
PERSON ElsevierFingerprintEngine
PERSON Elsevier
PERSON Pure
GPE EastAnglia
PERSON PurePortal
ORGANIZATION UniversityofEastAnglia
ORGANIZATION ipAddress
ORGANIZATION accessType
PERSON AbhayBag

PERSON Person
PERSON NorwichMedicalSchool
PERSON NorwichMedicalSchool
ORGANIZATION GoogleAnalytics
PERSON Skip
PERSON Skip
PERSON Skip
GPE East
ORGANIZATION AngliaHome
ORGANIZATION UniversityofEastAngliaLogo
ORGANIZATION subMenu
GPE Main
PERSON Home
ORGANIZATION Organisations
ORGANIZATION searchForm
ORGANIZATION searchField
ORGANIZATION searchForm
PERSON Abigail
PERSON Abigail
PERSON HonoraryTutor
PERSON NorwichMedicalSchool
PERSON NorwichMedicalSchool
PERSON Email
ORGANIZATION MED
PERSON Person
ORGANIZATION subMenu
PERSON Overview
ORGANIZATION pollTimer
ORGANIZATION pollTimer
PERSON DisplayToast
ORGANIZATION setTimeout
GPE Pure
PERSON Share
PERSON Share
PERSON Share
ORGANIZATION LinkedIn
ORGANIZATION plumXError
ORGANIZATION AltMetric
ORGANIZATION PlumX
ORGANIZATION PlumX
ORGANIZATION AltMetric
ORGANIZATION UniversityofEastAngliaLogo
PERSON Scopus
PERSON ElsevierFingerprintEngine
PERSON Elsevier
PERSON Pure
GPE EastAnglia
PERSON PurePortal
ORGANIZATION UniversityofEastAnglia
ORGANIZATI

KeyboardInterrupt: 

In [139]:
# !pip install -U spacy

In [140]:
# import spacy
# from spacy import displacy
# nlp = spacy.load("en_core_web_sm")

# sentence = "Bulb will become the first energy company to be placed into special administration where it is run by the government through the regulator Ofgem"
# processed = nlp(sentence)

# for pair in processed.ents:
#     print(f"Term: {pair.text} label: {pair.label}")
