In [29]:
import re
import string
import requests
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import time

In [30]:
#get robots.txt file
robots = requests.get('https://pureportal.coventry.ac.uk/robots.txt')
robots = BeautifulSoup(robots.text)

robots

<html><body><p>User-Agent: *
Crawl-Delay: 1
Disallow: /*?*format=rss
Disallow: /*?*export=xls
Sitemap: https://pureportal.coventry.ac.uk/sitemap.xml</p></body></html>

In [31]:
#find the end page of the publications on which to search
html = requests.get("https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=0")
soup = BeautifulSoup(html.text)

for item in soup.find_all('a', {'class':'step'}):
    #iterates through an ends at the last step found in the class (thus the last page)
    x = item.text
    

In [32]:
#set page to 0 and create an empty list of links
page = 0
link = []

#iterate through the pages on publications between page 0 and the max page found above
while page >=0 and page <= int(x):
    url = f"https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page={page}"
    #Wait 1 second (per robots.txt)
    time.sleep(1.0)
    print('Processing page number:',page)
    r = requests.get(url)
    html = r.content
    #create a soup that contains all the html from the current page
    soup = BeautifulSoup(html, "html.parser")
    
    #iterate through the soup to find all 'a' references
    for i in soup.find('ul', {'class':'list-results'}).find_all('a'):
        #scrape all web links from this and append to the list "link"
        i['href'] = i['href']
        link.append(i['href'])
    #Repeat for the next page
    
    page = page + 1

Processing page number: 0
Processing page number: 1
Processing page number: 2
Processing page number: 3
Processing page number: 4
Processing page number: 5
Processing page number: 6
Processing page number: 7
Processing page number: 8
Processing page number: 9
Processing page number: 10
Processing page number: 11
Processing page number: 12
Processing page number: 13


In [33]:
#Create a list of just the publication links
publications = [x for x in link if "publications" in x]

#Create a list of just the people profile links
persons = [x for x in link if "persons" in x]

In [34]:
#Create an empty list of the documents to search if it does not exist already
while True:
    try:
        documents
        break
    except NameError:
        documents = []
        
#Create an empty list of the titles if it does not exist already
while True:
    try:
        titles
        break
    except NameError:
        titles = []
        
#Create an empty list of all authors of a publication if it does not exist already
while True:
    try:
        authors
        break
    except NameError:
        authors = []
        
#Create an empty list of the author profiles if it does not exist already
while True:
    try:
        author_profile
        break
    except NameError:
        author_profile = []
        
#Create an empty list of the publication dates if it does not exist already
while True:
    try:
        dates
        break
    except NameError:
        dates = []
        
#Create the Library DataFrame if it does not exist
while True:
    try:
        Library
        break
    except NameError:
        Library = pd.DataFrame(list(zip(titles,authors,dates,publications,author_profile,documents)), 
                               columns=['Title',
                                        'Authors',
                                        'Publication Date',
                                        'Web Link',
                                        'Author Profile',
                                        'Overview'])


#iterate over the publications list 
for n,i in enumerate(publications):
    
    #Create a skip if the publication is already contained in the Library
    if i not in list(Library['Web Link']):
        print(n, 'Added Publication:',i)
        r = requests.get(i)
        #Wait 1 second (per robots.txt)
        time.sleep(1.0)
        soup = BeautifulSoup(r.content, 'html.parser')

        #create a blank list of the overview / blurbs from the publications
        blurb = []
        #find the relevant class (if it does not exist then skip)
        if soup.find('div', {'class':'textblock'}) is not None:
            #append each blurb to the blurb list
            for i in soup.find_all('div', {'class':'textblock'}):
                #print(n)
                blurb.append(i.text)
            #append all the blurbs to the documents list
            documents.append(' '.join(blurb))
        else: 
            print(n,'Added blank Overview field')
            #add N/A if blank
            documents.append(' '.join('N/A'))


        #create a blank list of the titles from the publications
        t = []
        #find the relevant class (if it does not exist then skip)
        if soup.find('div', {'class':'rendering'}) is not None:
            #append each title to the t list
            for i in soup.find('div', {'class':'rendering'}):
                #print(n)
                t.append(i.text)
            #append all the titles to the titles list
            titles.append(' '.join(t))
        else: 
            print(n,'Added blank Title field')
            #add N/A if blank
            titles.append(' '.join('N/A'))


        #create a blank list of the authors from the publications
        autha = []
        #find the relevant class (if it does not exist then skip)
        if soup.find_all('p',{'class':'relations persons'}) is not None:
            #append each author to the autha list
            for i in soup.find_all('p',{'class':'relations persons'}):
                #print(n)
                autha.append(i.text)
            #append all the authors to the authors list
            authors.append(' '.join(autha))
        else: 
            print(n,'Added blank Author field')
            #add N/A if blank
            authors.append(' '.join('N/A'))


        #create a blank list of the author profiles from the publications
        auth = []
        #find the relevant class (if it does not exist then skip)
        if soup.find_all('a',{'class':'link person'}) is not None:
            #append each author profile to the auth list
            for i in soup.find_all('a',{'class':'link person'}):
                #print(n)
                i['href'] = i['href']
                auth.append(i['href'])
            #append all the author profiles to the author_profile list
            author_profile.append(' '.join(auth))
        else: 
            print(n,'Added blank Author Profile field')
            #add N/A if blank
            author_profile.append(' '.join('N/A'))


        #create a blank list of the dates of the publications
        d = []
        #find the relevant class (if it does not exist then skip)
        if soup.find('span',{'class':'date'}) is not None:
            #append each date to the d list
            for i in soup.find('span',{'class':'date'}):
                #print(n)
                d.append(i.text)
            #append all the publication dates to the dates list
            dates.append(' '.join(d))
        else: 
            print(n,'Added blank Publication Date field')
            #add N/A if blank
            dates.append(' '.join('N/A'))
    
    
        

633 Added Publication: https://pureportal.coventry.ac.uk/en/publications/evaluation-of-clinical-interventions-effectiveness-efficiency-and-2
633 Added blank Overview field
634 Added Publication: https://pureportal.coventry.ac.uk/en/publications/market-orientation-in-the-uk-higher-education-sector-the-influenc-2
634 Added blank Overview field
635 Added Publication: https://pureportal.coventry.ac.uk/en/publications/developing-a-comprehensive-cross-country-economic-growth-database-2
636 Added Publication: https://pureportal.coventry.ac.uk/en/publications/measuring-value-added-in-higher-education-a-proposal-2
636 Added blank Overview field
637 Added Publication: https://pureportal.coventry.ac.uk/en/publications/modelling-optimal-plant-size-and-market-equilibria-under-differen-2
637 Added blank Overview field


In [35]:
for i in (titles,authors,dates,publications,author_profile,documents):
    print (len(i))

638
638
638
638
638
638


In [36]:
#Recreate the Library (Data Frame)with all the relevant information
Library = pd.DataFrame(list(zip(titles,authors,dates,publications,author_profile,documents)), 
                            columns=['Title',
                                     'Authors',
                                     'Publication Date',
                                     'Web Link',
                                     'Author Profile',
                                     'Overview'])

In [37]:
#Create a function for cleaning the query later
def clean_words(words):
    #Remove any Unicode
    temp_words = re.sub(r'[^\x00-\x7F]+', ' ', words)
    #Lowercase all words
    temp_words = temp_words.lower()
    #Remove all punctuation
    temp_words = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', temp_words)
    #Remove any digits
    temp_words = re.sub(r'[0-9]', '', temp_words)
    
    return temp_words

In [38]:
# Clean the documents
documents_clean = []
for d in documents:
    #Clean the documents using the same steps as the function above
    temp_words = re.sub(r'[^\x00-\x7F]+', ' ', d)
    #Lowercase all words
    temp_words = temp_words.lower()
    #Remove all punctuation
    temp_words = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', temp_words)
    #Remove any digits
    temp_words = re.sub(r'[0-9]', '', temp_words)
    #append the cleaned document to the documents_clean list
    documents_clean.append(temp_words)


In [39]:
S = set(stopwords.words('english'))
ps = PorterStemmer()

def remove_stopwords(tokens):
    tokens_stop_removed = []
    for token in tokens.split():
        token = ps.stem(token)
        if not token.lower() in S:
            tokens_stop_removed.append(token)
    return tokens_stop_removed


In [40]:
tokenized = []
for d in documents_clean:
    document = remove_stopwords(d)
    tokenized.append(document)

In [41]:
final = []
for token in tokenized:
    final.append(TreebankWordDetokenizer().detokenize(token))

In [42]:
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(final)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names())
df.tail()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,628,629,630,631,632,633,634,635,636,637
york,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zeitraum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
def get_similar_articles(q, df):
    
    #Use clean_words function to clean query
    q = clean_words(q)
    
    #Remove stopwords from query
    q = remove_stopwords(q)
    
    #De-tokenize cleaned query so it is a full string for comparison
    q = TreebankWordDetokenizer().detokenize(q)
    
    #Make q a list
    q = [q]
    
    #Vectorise the query
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    
    #Create an empty dictionary for the similarities
    #Compare the query to the data TF-IDF DataFrame
    sim = {}
    for i in range(10):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)

        sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)

    for k, v in sim_sorted:
        if v != 0.0:
            print("Similarity Score:", v)
            print(Library.iloc[k])
            print()


In [46]:
#Run function with input from the user
get_similar_articles(input(), df)

print('END OF RESULTS')

ToUgh FinanciaL ClImates and WeAthEr
Similarity Score: 0.025527672221976383
Title               CEO Financial Experience and Firms’ Earnings M...
Authors             Thai Nguyen, Thang Nguyen, Panagiotis Andrikop...
Publication Date                                           7 Mar 2022
Web Link            https://pureportal.coventry.ac.uk/en/publicati...
Author Profile      https://pureportal.coventry.ac.uk/en/persons/t...
Overview            This study investigates the impact of CEOs’ fi...
Name: 4, dtype: object

Similarity Score: 0.00846204124568563
Title               Bank stock valuation theories: do they explain...
Authors             Ken Yien Leong, Mohamed Ariff, Alireza Zarei, ...
Publication Date                                           1 Mar 2022
Web Link            https://pureportal.coventry.ac.uk/en/publicati...
Author Profile      https://pureportal.coventry.ac.uk/en/persons/a...
Overview            PurposeThe objective of this paper is to inves...
Name: 2, dtype: object