In [1]:
import requests       # send requests to web server
# from bs4 import BeautifulSoup
from lxml import html # parse HTML
import json           # store data as json file
import re             # regular expressions
import os             # access directories
import pandas as pd   # dataframes
from tqdm import tqdm # create progress bar (for i in tqdm(list))
os.chdir('../Data')



## Scrape all URLS

In [2]:
%%time
# Get all article urls (2039 pages)
urls = []
for i in range(1,22):
    path = 'https://ore.exeter.ac.uk/repository/discover?rpp=10&etal=0&group_by=none&page='+str(i)+'&filtertype_0=type&filter_relational_operator_0=equals&filter_0=Article'
    page = requests.get(path)
    tree = html.fromstring(page.content)
    tree.make_links_absolute(path)
    for url in tree.xpath('//a/@href'):
        if "https://ore.exeter.ac.uk/repository/handle/" in url:
            urls.append(url)
                                
articles = pd.DataFrame(urls, columns = ['url'])     
articles['type'] = 'article'  
print(len(urls), "urls were scraped.\n")

20384 urls were scraped.

CPU times: user 2min 36s, sys: 7.33 s, total: 2min 43s
Wall time: 1h 23min 14s


In [3]:
%%time
# Get all book urls (7 pages)
urls = []
for i in range(1,8):
    path = 'https://ore.exeter.ac.uk/repository/discover?rpp=10&etal=0&group_by=none&page='+str(i)+'&filtertype_0=type&filter_relational_operator_0=equals&filter_0=Book'
    page = requests.get(path)
    tree = html.fromstring(page.content)
    tree.make_links_absolute(path)
    for url in tree.xpath('//a/@href'):
        if "https://ore.exeter.ac.uk/repository/handle/" in url:
            urls.append(url)
                                
books = pd.DataFrame(urls, columns = ['url'])     
books['type'] = 'book'
print(len(urls), "urls were scraped.\n")

67 urls were scraped.

CPU times: user 428 ms, sys: 32.1 ms, total: 460 ms
Wall time: 31.7 s


In [4]:
%%time
# Get all book chapter urls (112 pages)
urls = []
for i in range(1,113):
    path = 'https://ore.exeter.ac.uk/repository/discover?rpp=10&etal=0&group_by=none&page='+str(i)+'&filtertype_0=type&filter_relational_operator_0=equals&filter_0=Book+chapter'
    page = requests.get(path)
    tree = html.fromstring(page.content)
    tree.make_links_absolute(path)
    for url in tree.xpath('//a/@href'):
        if "https://ore.exeter.ac.uk/repository/handle/" in url:
            urls.append(url)
                                
bookchapters = pd.DataFrame(urls, columns = ['url'])     
bookchapters['type'] = 'book chapter'
print(len(urls), "urls were scraped.\n")

1112 urls were scraped.

CPU times: user 8.51 s, sys: 405 ms, total: 8.92 s
Wall time: 6min 8s


In [5]:
%%time
# Get all conference paper urls (87 pages)
urls = []
for i in range(1,88):
    path = 'https://ore.exeter.ac.uk/repository/discover?rpp=10&etal=0&group_by=none&page='+str(i)+'&filtertype_0=type&filter_relational_operator_0=equals&filter_0=Conference+paper'
    page = requests.get(path)
    tree = html.fromstring(page.content)
    tree.make_links_absolute(path)
    for url in tree.xpath('//a/@href'):
        if "https://ore.exeter.ac.uk/repository/handle/" in url:
            urls.append(url)
                                
conferencepapers = pd.DataFrame(urls, columns = ['url'])     
conferencepapers['type'] = 'conferencepaper'
print(len(urls), "urls were scraped.\n")

862 urls were scraped.

CPU times: user 5.46 s, sys: 257 ms, total: 5.71 s
Wall time: 3min 55s


In [6]:
%%time
# Get all thesis/dissertation urls (441 pages)
urls = []
for i in range(1,442):
    path = 'https://ore.exeter.ac.uk/repository/discover?rpp=10&etal=0&group_by=none&page='+str(i)+'&filtertype_0=type&filter_relational_operator_0=equals&filter_0=Thesis+or+dissertation'
    page = requests.get(path)
    tree = html.fromstring(page.content)
    tree.make_links_absolute(path)
    for url in tree.xpath('//a/@href'):
        if "https://ore.exeter.ac.uk/repository/handle/" in url:
            urls.append(url)
                                
theses = pd.DataFrame(urls, columns = ['url'])     
theses['type'] = 'thesis/dissertation'
print(len(urls), "urls were scraped.\n")

510 urls were scraped.

CPU times: user 19.6 s, sys: 876 ms, total: 20.4 s
Wall time: 16min 7s


In [7]:
del(urls)

In [8]:
# Takes arouynd 8 minutes per 100 pages
# CPU times: user 7.56 s, sys: 263 ms, total: 7.83 s
# Wall time: 8min 1s

In [17]:
ore = pd.concat([articles, books, bookchapters, conferencepapers, theses])
ore.to_csv("ORE_urls.csv")

## Scrape metadata

In [20]:
ore

Unnamed: 0,url,type
0,https://ore.exeter.ac.uk/repository/handle/108...,article
1,https://ore.exeter.ac.uk/repository/handle/108...,article
2,https://ore.exeter.ac.uk/repository/handle/108...,article
3,https://ore.exeter.ac.uk/repository/handle/108...,article
4,https://ore.exeter.ac.uk/repository/handle/108...,article
...,...,...
505,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation
506,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation
507,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation
508,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation


In [140]:
%%time
college = []
department = []
authors = []
date_accessioned = []
date_issued = []
abstract = []
title = []
DOIs = []

for url in ore.url:
    path = url + '?show=full'
    page = requests.get(path)
    tree = html.fromstring(page.content)
    college.append(tree.xpath('/html/body/div[1]/div/ul/li[2]/a/text()')[0])
    department.append(tree.xpath('/html/body/div[1]/div/ul/li[3]/a/text()')[0])
    author_list = []
    date_accessioned_switch = 0
    date_issued_switch = 0
    title_list_switch = 0
    abstract_list_switch = 0
    doi_list_switch = 0
    for i in range(0, len(tree.xpath('//*[@class="label-cell"]/text()'))):
        if tree.xpath('//*[@class="label-cell"]/text()')[i] == "dc.contributor.author":
            author_list.append(tree.xpath('//td[@class="word-break"]/text()')[i])
        if date_accessioned_switch == 0 and tree.xpath('//*[@class="label-cell"]/text()')[i] == "dc.date.accessioned":
            date_accessioned.append(tree.xpath('//td[@class="word-break"]/text()')[i])
            date_accessioned_switch = 1
        if date_issued_switch == 0 and tree.xpath('//*[@class="label-cell"]/text()')[i] == "dc.date.issued":
            date_issued.append(tree.xpath('//td[@class="word-break"]/text()')[i])
            date_issued_switch = 1
        if title_list_switch == 0 and tree.xpath('//*[@class="label-cell"]/text()')[i] == "dc.title":
            title.append(tree.xpath('//td[@class="word-break"]/text()')[i])
            title_list_switch = 1
        if abstract_list_switch == 0 and tree.xpath('//*[@class="label-cell"]/text()')[i] == "dc.description.abstract":
            abstract.append(tree.xpath('//td[@class="word-break"]/text()')[i])
            abstract_list_switch = 1
        if doi_list_switch == 0 and tree.xpath('//*[@class="label-cell"]/text()')[i] == "dc.identifier.doi":
            DOIs.append(tree.xpath('//td[@class="word-break"]/text()')[i])
            doi_list_switch = 1
    authors.append(author_list)
    if date_accessioned_switch == 0:
        date_accessioned.append("")
    if date_issued_switch == 0:
        date_issued.append("")
    if title_list_switch == 0:
        title.append("")
    if abstract_list_switch == 0:
        abstract.append("")
    if doi_list_switch == 0:
        DOIs.append("")

CPU times: user 29min 31s, sys: 42.9 s, total: 30min 14s
Wall time: 6h 43min 35s


In [None]:
# 1.52 min for 66 urls
# CPU times: user 7.23 s, sys: 167 ms, total: 7.4 s
# Wall time: 1min 52s
# >> Might take 8 hours for the full 22935 urls

In [141]:
df = pd.concat([ore.reset_index(drop=True), 
                pd.DataFrame(list(zip(college, department, authors, date_accessioned, date_issued, title, abstract, doi)),
                             columns=['college', 'department', 'authors', 'date_accessioned', 'date_issued', 'title', 'abstract', 'doi'])] , axis = 1)
df

Unnamed: 0,url,type,college,department,authors,date_accessioned,date_issued,title,abstract
0,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Medicine and Health,Institute of Biomedical & Clinical Science,"[Beall, C, Hanna, L, Ellacott, KLJ]",2017-04-21T12:08:26Z,2017-09-12,CNS targets of adipokines,Our understanding of adipose tissue as an endo...
1,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Life and Environmental Sciences,Biosciences,"[Paris, JR, Stevens, JR, Catchen, JM]",2017-05-02T12:56:27Z,2017-04-18,Lost in parameter space: A road map for Stacks,1.Restriction site-Associated DNA sequencing (...
2,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Life and Environmental Sciences,Psychology,"[Alsubaie, M, Abbbott, R, Dunn, BD, Dickens, C...",2017-04-25T07:20:32Z,2017-04-23,Mechanisms of action in mindfulness-based cogn...,"Background\r\n\r\nRecently, there has been an ..."
3,https://ore.exeter.ac.uk/repository/handle/108...,article,"College of Engineering, Mathematics and Physic...",Mathematics,"[Betterton, RT, Broad, LM, Tsaneva-Atanasova, ...",2017-04-24T08:18:21Z,2017-03-12,Acetylcholine modulates gamma frequency oscill...,Modulation of gamma oscillations is important ...
4,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Social Sciences and International S...,Politics,"[Stokes, D, waterman, K]",2017-04-24T09:52:00Z,2017-06-12,Beyond balancing? Intrastate conflict and US g...,Grand strategic theorists share an historical ...
...,...,...,...,...,...,...,...,...,...
22930,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"[Micheli, Leonardo]",2015-10-19T07:44:00Z,2015-04-30,Enhancing Electrical and Heat Transfer Perform...,In a world that is constantly in need of a con...
22931,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"[Morgenroth, Thekla]",2015-10-19T07:40:40Z,2015-06-26,How Role Models Affect Role Aspirants’ Motivat...,Role models are often suggested as a means of ...
22932,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"[Speidl, Bianka Ágnes]",2015-09-29T08:56:09Z,2015-03-02,Conceptualisation of Power in the Thought of M...,The topic of my research is the Shi'i jurist M...
22933,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"[Osborne, Joe M.]",2015-10-14T09:27:11Z,2015-05-08,Understanding Northern Hemisphere land precipi...,Water is key to life on Earth. The distributio...


In [17]:
df.to_csv("ORE_data.csv", index = False)
# Last saved on 2022-04-05 (with DOIs)