In [1]:
import re             # regular expressions
import os             # access directories
import pandas as pd   # dataframes
from tqdm import tqdm # create progress bar (for i in tqdm(list))
# All data files are read from / written to the sibling "Data" directory
data_dir = os.path.join(os.pardir, 'Data')
os.chdir(data_dir)

## Load data

In [2]:
# Staff list: one row per person, including the two name identifiers
# ("Last, F" and "Last, First") used later for matching
staff_file = 'UoE_staff.csv'
staff = pd.read_csv(staff_file)
staff

Unnamed: 0,college,department,name,role,profile,namelast,namefirst,identifier1,identifier2
0,The Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"Bateman, I","Bateman, Ian"
1,The Business School,Economics,Professor Giuseppe Cavaliere,Distinguished Research Professor,https://business-school.exeter.ac.uk/about/peo...,Cavaliere,Giuseppe,"Cavaliere, G","Cavaliere, Giuseppe"
2,The Business School,Economics,Professor Surajeet Chakravarty,"Associate Professor in Economics, Director of ...",https://business-school.exeter.ac.uk/about/peo...,Chakravarty,Surajeet,"Chakravarty, S","Chakravarty, Surajeet"
3,The Business School,Economics,Professor Carlos Cortinhas,Associate Professor of Economics,https://business-school.exeter.ac.uk/about/peo...,Cortinhas,Carlos,"Cortinhas, C","Cortinhas, Carlos"
4,The Business School,Economics,Professor James Davidson,Emeritus Professor of Econometrics,https://business-school.exeter.ac.uk/about/peo...,Davidson,James,"Davidson, J","Davidson, James"
...,...,...,...,...,...,...,...,...,...
4135,College of Social Sciences and International S...,Strategy and Security,Tobias Borck,PhD Student,http://eprofile.exeter.ac.uk/tobiasborck,Borck,Tobias,"Borck, T","Borck, Tobias"
4136,College of Social Sciences and International S...,Strategy and Security,Maria Chiara Slucca,PhD Student,https://eprofile.exeter.ac.uk/mariachiaraslucca/,Slucca,Maria,"Slucca, M","Slucca, Maria"
4137,College of Social Sciences and International S...,Strategy and Security,Leanne Fuller,PhD Student,https://eprofile.exeter.ac.uk/leannefuller/,Fuller,Leanne,"Fuller, L","Fuller, Leanne"
4138,College of Social Sciences and International S...,Strategy and Security,Salem Osseiran,PhD Student,https://eprofile.exeter.ac.uk/salemosseiran/,Osseiran,Salem,"Osseiran, S","Osseiran, Salem"


In [3]:
# Prepare data for matching
# Lower-case both name identifiers so they can be compared against the
# lower-cased ORE author strings.
staff['identifier1'] = staff['identifier1'].str.lower()
staff['identifier2'] = staff['identifier2'].str.lower()
# Rename the college in a single .loc assignment. The previous chained form
# (staff.college[mask] = ...) assigns into a temporary Series, which triggers
# SettingWithCopyWarning and does not update `staff` under Copy-on-Write.
# Presumably 'Business School' is the spelling used in the ORE data.
staff.loc[staff['college'] == 'The Business School', 'college'] = 'Business School'

In [4]:
# ORE repository metadata: one row per deposited record
ore_file = 'ORE_data.csv'
ore = pd.read_csv(ore_file)
# Lower-cased copy of the raw author string, used for case-insensitive name matching
ore['identifiers'] = [author_string.lower() for author_string in ore['authors']]
ore

Unnamed: 0,url,type,college,department,authors,date_accessioned,date_issued,title,abstract,doi,identifiers
0,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Medicine and Health,Institute of Biomedical & Clinical Science,"['Beall, C', 'Hanna, L', 'Ellacott, KLJ']",2017-04-21T12:08:26Z,2017-09-12,CNS targets of adipokines,Our understanding of adipose tissue as an endo...,10.1002/cphy.c160045,"['beall, c', 'hanna, l', 'ellacott, klj']"
1,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Life and Environmental Sciences,Biosciences,"['Paris, JR', 'Stevens, JR', 'Catchen, JM']",2017-05-02T12:56:27Z,2017-04-18,Lost in parameter space: A road map for Stacks,1.Restriction site-Associated DNA sequencing (...,10.1111/2041-210X.12775,"['paris, jr', 'stevens, jr', 'catchen, jm']"
2,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Life and Environmental Sciences,Psychology,"['Alsubaie, M', 'Abbbott, R', 'Dunn, BD', 'Dic...",2017-04-25T07:20:32Z,2017-04-23,Mechanisms of action in mindfulness-based cogn...,"Background\r\n\r\nRecently, there has been an ...",http://doi.org/10.1016/j.cpr.2017.04.008,"['alsubaie, m', 'abbbott, r', 'dunn, bd', 'dic..."
3,https://ore.exeter.ac.uk/repository/handle/108...,article,"College of Engineering, Mathematics and Physic...",Mathematics,"['Betterton, RT', 'Broad, LM', 'Tsaneva-Atanas...",2017-04-24T08:18:21Z,2017-03-12,Acetylcholine modulates gamma frequency oscill...,Modulation of gamma oscillations is important ...,10.1111/ejn.13582,"['betterton, rt', 'broad, lm', 'tsaneva-atanas..."
4,https://ore.exeter.ac.uk/repository/handle/108...,article,College of Social Sciences and International S...,Politics,"['Stokes, D', 'waterman, K']",2017-04-24T09:52:00Z,2017-06-12,Beyond balancing? Intrastate conflict and US g...,Grand strategic theorists share an historical ...,http://dx.doi.org/10.1080/01402390.2017.1330682,"['stokes, d', 'waterman, k']"
...,...,...,...,...,...,...,...,...,...,...,...
22930,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"['Micheli, Leonardo']",2015-10-19T07:44:00Z,2015-04-30,Enhancing Electrical and Heat Transfer Perform...,In a world that is constantly in need of a con...,,"['micheli, leonardo']"
22931,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"['Morgenroth, Thekla']",2015-10-19T07:40:40Z,2015-06-26,How Role Models Affect Role Aspirants’ Motivat...,Role models are often suggested as a means of ...,,"['morgenroth, thekla']"
22932,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"['Speidl, Bianka Ágnes']",2015-09-29T08:56:09Z,2015-03-02,Conceptualisation of Power in the Thought of M...,The topic of my research is the Shi'i jurist M...,,"['speidl, bianka ágnes']"
22933,https://ore.exeter.ac.uk/repository/handle/108...,thesis/dissertation,Doctoral College,Doctoral Theses,"['Osborne, Joe M.']",2015-10-14T09:27:11Z,2015-05-08,Understanding Northern Hemisphere land precipi...,Water is key to life on Earth. The distributio...,,"['osborne, joe m.']"


In [5]:
# Percentage of missing values per column of the ORE data, rounded to one decimal
missing_counts = ore.isnull().sum()
missing_counts.mul(100).div(len(ore)).round(1)

url                  0.0
type                 0.0
college              0.0
department           0.0
authors              0.0
date_accessioned     0.0
date_issued          0.0
title                0.0
abstract             5.5
doi                 14.4
identifiers          0.0
dtype: float64

In [6]:
# Check which colleges in the ORE data do not match the staff data
# The staff colleges are collected once, outside the loop, instead of
# recomputing .unique() for every ORE college.
staff_colleges = set(staff.college.unique())
for col in ore.college.unique():
    if col not in staff_colleges:
        print(col)

Other Collections
Doctoral College
Special Collections


In [7]:
# Check which departments in the ORE data do not match the staff data
# The staff departments are collected once, outside the loop, instead of
# recomputing .unique() for every ORE department.
staff_departments = set(staff.department.unique())
for dep in ore.department.unique():
    if dep not in staff_departments:
        print(dep)

Institute of Biomedical & Clinical Science
Institute of Health Research
Management Studies
Sociology, Philosophy & Anthropology
Russian
Organisation Studies
Accounting
Chinese
German
Institute of Medical Education
Portuguese
Italian
French
Hispanic Studies
Medical Imaging
Professional Services
European Centre for Environment & Human Health
University of Exeter Journals
Collections of Former Colleges
Conferences@Exeter
Doctoral Theses
De-placing Future Memory Research Project
Arab World Documentation Unit
MbyRes Dissertations
Bill Douglas Cinema Museum
Open Exeter Project
MPhil Dissertations


In [8]:
# Check which departments in the staff data do not match the ORE data
# The ORE departments are collected once, outside the loop, instead of
# recomputing .unique() for every staff department.
ore_departments = set(ore.department.unique())
for dep in staff.department.unique():
    if dep not in ore_departments:
        print(dep)

Finance and Accounting
Management
Sustainable Futures
Business School PGR
EMPS
Renewable Energy
Mathematics Exeter (Mods & Progs Only)
Mathematics Penryn (Mods & Progs Only)
Art History & Visual Culture
Film Studies
Liberal Arts
Modern Languages
Medicine and Health
Sociology, Philosophy and Anthropology
Strategy and Security


## Match staff members to metadata

In [9]:
# %%time
# # Match by name and college only (earlier attempt without the "Doctoral College" exception; superseded by the next cell)
# authors = []
# dates = []
# titles = []
# abstracts = []
# urls = []
# fields = []
# publications = []

# for i, name in enumerate(staff.name):
#     authors_temp = []
#     dates_temp = []
#     titles_temp = []
#     abstracts_temp = []
#     url_temp = []
#     field_temp = []
#     count = 0
#     for j, title in enumerate(ore.title):
#         if ((staff.identifier1[i] in ore.identifiers[j]) or (staff.identifier2[i] in ore.identifiers[j])) and staff.college[i] in ore.college[j]:
#             authors_temp.append(ore.authors[j])
#             dates_temp.append(ore.date_issued[j])
#             titles_temp.append(ore.title[j])
#             abstracts_temp.append(ore.abstract[j])
#             url_temp.append(ore.url[j])
#             field_temp.append(ore.department[j])
#             count +=1
#     publications.append(count)
#     authors.append(authors_temp)
#     dates.append(dates_temp)
#     titles.append(titles_temp)
#     abstracts.append(abstracts_temp)
#     urls.append(url_temp) 
#     fields.append(field_temp)

In [9]:
%%time
# Match by name and college
authors = []
dates = []
titles = []
abstracts = []
urls = []
fields = []
types = []
publications = []

for i, name in enumerate(staff.name):
    authors_temp = []
    dates_temp = []
    titles_temp = []
    abstracts_temp = []
    url_temp = []
    field_temp = []
    type_temp = []
    count = 0
    for j, title in enumerate(ore.title):
        if ((staff.identifier1[i] in ore.identifiers[j]) or (staff.identifier2[i] in ore.identifiers[j])) and (staff.college[i] in ore.college[j] or ore.college[j] == "Doctoral College"):
            authors_temp.append(ore.authors[j])
            dates_temp.append(ore.date_issued[j])
            titles_temp.append(ore.title[j])
            abstracts_temp.append(ore.abstract[j])
            url_temp.append(ore.url[j])
            field_temp.append(ore.department[j])
            type_temp.append(ore.type[j])
            count +=1
    publications.append(count)
    authors.append(authors_temp)
    dates.append(dates_temp)
    titles.append(titles_temp)
    abstracts.append(abstracts_temp)
    urls.append(url_temp) 
    fields.append(field_temp)
    types.append(type_temp)

CPU times: user 1h 33min 51s, sys: 4.92 s, total: 1h 33min 56s
Wall time: 1h 33min 59s


In [10]:
# Attach the per-staff match results (one list per cell) to the staff table, column-wise
match_columns = ['publications', 'authors', 'dates', 'titles', 'abstracts', 'urls', 'fields', 'types']
match_values = [publications, authors, dates, titles, abstracts, urls, fields, types]
matches = pd.DataFrame(list(zip(*match_values)), columns=match_columns)
df = pd.concat([staff.reset_index(drop=True), matches], axis=1)
df

Unnamed: 0,college,department,name,role,profile,namelast,namefirst,identifier1,identifier2,publications,authors,dates,titles,abstracts,urls,fields,types
0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"[['Fezzi, Carlo', 'Bateman, IJ'], ['Bateman, I...","[2015-02-04, 2013-10-25, 2014-01-08, 2013-07-0...",[The Impact of Climate Change on Agriculture: ...,[Ricardian (hedonic) analyses of the impact of...,[https://ore.exeter.ac.uk/repository/handle/10...,"[Economics, Economics, Economics, Economics, E...","[article, article, article, article, article, ..."
1,Business School,Economics,Professor Giuseppe Cavaliere,Distinguished Research Professor,https://business-school.exeter.ac.uk/about/peo...,Cavaliere,Giuseppe,"cavaliere, g","cavaliere, giuseppe",8,"[['Boswijk, HO', 'Cavaliere, G', 'Georgiev, I'...","[2021-03-04, 2020-09-15, 2020-11-30, 2018-06-1...",[Bootstrapping non-stationary stochastic volat...,[In this paper we investigate to what extent t...,[https://ore.exeter.ac.uk/repository/handle/10...,"[Economics, Economics, Economics, Economics, E...","[article, article, article, article, article, ..."
2,Business School,Economics,Professor Surajeet Chakravarty,"Associate Professor in Economics, Director of ...",https://business-school.exeter.ac.uk/about/peo...,Chakravarty,Surajeet,"chakravarty, s","chakravarty, surajeet",12,"[['Chakravarty, S', 'Fonseca, MA', 'Ghosh, S',...","[2016-10-21, 2016-11-01, 2016-03-16, 2015-01, ...","[Religious fragmentation, social identity and ...",[We examine the impact of religious identity a...,[https://ore.exeter.ac.uk/repository/handle/10...,"[Economics, Economics, Economics, Economics, E...","[article, article, article, article, article, ..."
3,Business School,Economics,Professor Carlos Cortinhas,Associate Professor of Economics,https://business-school.exeter.ac.uk/about/peo...,Cortinhas,Carlos,"cortinhas, c","cortinhas, carlos",1,"[['Cortinhas, Carlos']]",[2007-04],[Intra-industry trade and business cycles in A...,[A new resolve for both increased economic int...,[https://ore.exeter.ac.uk/repository/handle/10...,[Economics],[article]
4,Business School,Economics,Professor James Davidson,Emeritus Professor of Econometrics,https://business-school.exeter.ac.uk/about/peo...,Davidson,James,"davidson, j","davidson, james",25,"[['Davidson, James', 'Rambaccussing, Dooruj'],...","[2015-05-30, 2016-09-01, 2015-09-05, 2000-10, ...",[A test of the long memory hypothesis based on...,[This paper develops a new test of true versus...,[https://ore.exeter.ac.uk/repository/handle/10...,"[Economics, Economics, Economics, Economics, E...","[article, article, article, article, article, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4135,College of Social Sciences and International S...,Strategy and Security,Tobias Borck,PhD Student,http://eprofile.exeter.ac.uk/tobiasborck,Borck,Tobias,"borck, t","borck, tobias",0,[],[],[],[],[],[],[]
4136,College of Social Sciences and International S...,Strategy and Security,Maria Chiara Slucca,PhD Student,https://eprofile.exeter.ac.uk/mariachiaraslucca/,Slucca,Maria,"slucca, m","slucca, maria",0,[],[],[],[],[],[],[]
4137,College of Social Sciences and International S...,Strategy and Security,Leanne Fuller,PhD Student,https://eprofile.exeter.ac.uk/leannefuller/,Fuller,Leanne,"fuller, l","fuller, leanne",0,[],[],[],[],[],[],[]
4138,College of Social Sciences and International S...,Strategy and Security,Salem Osseiran,PhD Student,https://eprofile.exeter.ac.uk/salemosseiran/,Osseiran,Salem,"osseiran, s","osseiran, salem",0,[],[],[],[],[],[],[]


In [16]:
# Manually fix wrong matches for authors with 40+ publications
# (superseded: every fix in this commented-out cell is repeated in the 30+ cell below)

# # Ali A
# df.loc[1243,'authors':'types'] = df.loc[1243,'authors':'types'].apply(lambda x: list())
# df.loc[1243,'publications'] = len(df.loc[1243,'authors'])

# # Rod and Robert Taylor
# taylor_robert = []
# taylor_rod = []
# for i, authors in enumerate(df.loc[3245]['authors']):
#     if len(re.findall("Taylor, Robert|Taylor, R'|Taylor, RW", authors)) != 0:
#         taylor_robert.append(i)
#     if len(re.findall("Taylor, Rod S|Taylor, RS", authors)) != 0:
#         taylor_rod.append(i)
# df.loc[3245,'authors':'types'] = df.loc[3245,'authors':'types'].apply(lambda x: [x[i] for i in taylor_rod])
# df.loc[3245,'publications'] = len(df.loc[3245,'authors'])
# df.loc[3246,'authors':'types'] = df.loc[3246,'authors':'types'].apply(lambda x: [x[i] for i in taylor_robert])
# df.loc[3246,'publications'] = len(df.loc[3246,'authors'])

# Yang, Yaan, and Yinchen Liu
# liu_yang = []
# liu_yaan = []
# for i, authors in enumerate(df.loc[443]['authors']):
#     # All of Yaan Liu's articles are coauthored with Ghita
#     if len(re.findall("Ghita", authors)) != 0:
#         liu_yaan.append(i)
#     else:
#         liu_yang.append(i)
# df.loc[443,'authors':'types'] = df.loc[443,'authors':'types'].apply(lambda x: [x[i] for i in liu_yang])
# df.loc[443,'publications'] = len(df.loc[443,'authors'])
# df.loc[737,'authors':'types'] = df.loc[737,'authors':'types'].apply(lambda x: [x[i] for i in liu_yaan])
# df.loc[737,'publications'] = len(df.loc[737,'authors'])
# df.loc[1525,'authors':'types'] = df.loc[1525,'authors':'types'].apply(lambda x: list())
# df.loc[1525,'publications'] = len(df.loc[1525,'authors'])

# # Craig and C. David Wright
# wright_c_david = []
# for i, authors in enumerate(df.loc[477]['authors']):
#     if len(re.findall("Wright, C. David|Wright, CD|Wright, C'|David Wright, C", authors)) != 0:
#         wright_c_david.append(i)
# df.loc[477,'authors':'types'] = df.loc[477,'authors':'types'].apply(lambda x: [x[i] for i in wright_c_david])
# df.loc[477,'publications'] = len(df.loc[477,'authors'])
# df.loc[1065,'authors':'types'] = df.loc[1065,'authors':'types'].apply(lambda x: list())
# df.loc[1065,'publications'] = len(df.loc[1065,'authors'])

# # Hani Y
# df.loc[2827,'authors':'types'] = df.loc[2827,'authors':'types'].apply(lambda x: list())
# df.loc[2827,'publications'] = len(df.loc[2827,'authors'])

# # Xinqi and Xiang Li
# df.loc[675,'authors':'types'] = df.loc[675,'authors':'types'].apply(lambda x: list())
# df.loc[675,'publications'] = len(df.loc[675,'authors'])
# df.loc[1524,'authors':'types'] = df.loc[1524,'authors':'types'].apply(lambda x: list())
# df.loc[1524,'publications'] = len(df.loc[1524,'authors'])

# # Yuxiao, Yi, and Ying Wang
# wang_ying = []
# for i, authors in enumerate(df.loc[762]['authors']):
#     # All of Ying Wang's articles are coauthored with Dai, K
#     if len(re.findall("Dai, K", authors)) != 0:
#         wang_ying.append(i)     
# df.loc[762,'authors':'types'] = df.loc[762,'authors':'types'].apply(lambda x: [x[i] for i in wang_ying])
# df.loc[762,'publications'] = len(df.loc[762,'authors'])
# df.loc[1224,'authors':'types'] = df.loc[1224,'authors':'types'].apply(lambda x: list())
# df.loc[1224,'publications'] = len(df.loc[1224,'authors'])
# df.loc[1353,'authors':'types'] = df.loc[1353,'authors':'types'].apply(lambda x: list())
# df.loc[1353,'publications'] = len(df.loc[1353,'authors'])

# Yujia Zhu
# df.loc[1080,'authors':'types'] = df.loc[1080,'authors':'types'].apply(lambda x: list())
# df.loc[1080,'publications'] = len(df.loc[1080,'authors'])

# # Sarah Bell and Sian de Bell
# bell_sarah = []
# debell_sian = []
# for i, authors in enumerate(df.loc[2855]['authors']):
#     if len(re.findall("Bell, SL|Bell, Sarah L|^Bell, S", authors)) != 0:
#         bell_sarah.append(i)
#     elif len(re.findall("de Bell", authors)) != 0:
#         debell_sian.append(i)
# df.loc[2855,'authors':'types'] = df.loc[2855,'authors':'types'].apply(lambda x: [x[i] for i in bell_sarah])
# df.loc[2855,'publications'] = len(df.loc[2855,'authors'])
# df.loc[2919,'authors':'types'] = df.loc[2919,'authors':'types'].apply(lambda x: [x[i] for i in debell_sian])
# df.loc[2919,'publications'] = len(df.loc[2919,'authors'])

# # Rebecca and Richard Smith
# smith_rebecca = []
# smith_richard = []
# for i, authors in enumerate(df.loc[3224]['authors']):
#     if len(re.findall("Smith, RG|Smith, Rebecca", authors)) != 0:
#         smith_rebecca.append(i)
#     elif len(re.findall("Smith, R'|Smith, RD", authors)) != 0:
#         smith_richard.append(i)
# df.loc[3224,'authors':'types'] = df.loc[3224,'authors':'types'].apply(lambda x: [x[i] for i in smith_rebecca])
# df.loc[3224,'publications'] = len(df.loc[3224,'authors'])
# df.loc[3225,'authors':'types'] = df.loc[3225,'authors':'types'].apply(lambda x: [x[i] for i in smith_richard])
# df.loc[3225,'publications'] = len(df.loc[3225,'authors'])

In [None]:
# Manually fix wrong matches for authors with 30+ publications
#
# The name/college matching credits staff who share a surname and first
# initial with each other's publications. For each affected group below, the
# publications are either split between the staff rows (by a regex on the
# author string or the title) or removed entirely.
#
# WARNING: run this cell exactly once on a freshly built `df`. Positions are
# computed against the unfiltered lists of one row and then applied to the
# other rows of the group, so a second run would index into already-filtered
# lists and drop valid publications.


def keep_publications(frame, row, positions):
    """Keep only the publications at `positions` for staff row `row`.

    Filters every list column from 'authors' through 'types' (they are
    parallel lists, one entry per matched publication) and refreshes the
    'publications' count from the filtered 'authors' list.
    """
    frame.loc[row, 'authors':'types'] = frame.loc[row, 'authors':'types'].apply(lambda entries: [entries[p] for p in positions])
    frame.loc[row, 'publications'] = len(frame.loc[row, 'authors'])


def clear_publications(frame, row):
    """Remove every matched publication from staff row `row`."""
    keep_publications(frame, row, [])


def matching_positions(frame, row, pattern, column='authors'):
    """Return the positions in `frame.loc[row, column]` whose text contains a match for regex `pattern`."""
    return [pos for pos, text in enumerate(frame.loc[row, column]) if re.search(pattern, text)]


def remaining_positions(frame, row, taken, column='authors'):
    """Return all positions in `frame.loc[row, column]` that are not in `taken`."""
    return [pos for pos in range(len(frame.loc[row, column])) if pos not in taken]


def excluding(positions, taken):
    """Return `positions` without those already in `taken` (mirrors an if/elif split)."""
    return [pos for pos in positions if pos not in taken]


# Abigail and Anne-Marie Russell
russell_anne = matching_positions(df, 3201, "Russell, AM|Russell, A-M")
russell_abigail = remaining_positions(df, 3201, russell_anne)
keep_publications(df, 3201, russell_abigail)
keep_publications(df, 3202, russell_anne)

# Ali A
clear_publications(df, 1243)

# Aimee and Anna Murray
murray_aimee = matching_positions(df, 3125, "Murray, AK|Bendall, R")
murray_anna = remaining_positions(df, 3125, murray_aimee)
keep_publications(df, 3125, murray_aimee)
keep_publications(df, 3126, murray_anna)

# Chao and Chi Zhang
clear_publications(df, 1074)
clear_publications(df, 1239)

# Craig and C. David Wright
wright_c_david = matching_positions(df, 477, "Wright, C. David|Wright, CD|Wright, C'|David Wright, C")
keep_publications(df, 477, wright_c_david)
clear_publications(df, 1065)

# Daniel and Derek Partridge
partridge_derek = matching_positions(df, 540, "Partridge, Derek")
partridge_daniel = remaining_positions(df, 540, partridge_derek)
keep_publications(df, 540, partridge_daniel)
keep_publications(df, 1569, partridge_derek)

# Hani Y
clear_publications(df, 2827)

# Junhao, Jingyi and Jin Wang (Jin Wang's records are identified by title keywords)
wang_jin = matching_positions(df, 1059, "Data Augmentation|Learning", column='titles')
keep_publications(df, 1059, wang_jin)
clear_publications(df, 1108)
clear_publications(df, 1225)

# Katrina and Kerry Brown
# Katrina: known co-authors, or a single-author "Brown, K" record; Kerry: only
# records with Venkateshmurthy that were not already assigned to Katrina.
# Records matching neither are dropped from both rows.
brown_katrina = [
    pos for pos, author_string in enumerate(df.loc[2485, 'authors'])
    if re.search("Adger|Baggio|Chaigneau|Galafassi|Huke|Morrison|Quinn|Schoon", author_string)
    or author_string == "['Brown, K']"
]
brown_kerry = excluding(matching_positions(df, 2485, "Venkateshmurthy"), brown_katrina)
keep_publications(df, 2485, brown_katrina)
keep_publications(df, 2698, brown_kerry)

# Rebecca and Richard Smith
smith_rebecca = matching_positions(df, 3224, "Smith, RG|Smith, Rebecca")
smith_richard = excluding(matching_positions(df, 3224, "Smith, R'|Smith, RD"), smith_rebecca)
keep_publications(df, 3224, smith_rebecca)
keep_publications(df, 3225, smith_richard)

# Rod and Robert Taylor
# The two patterns are tested independently, so a record naming both is kept for both.
taylor_robert = matching_positions(df, 3245, "Taylor, Robert|Taylor, R'|Taylor, RW")
taylor_rod = matching_positions(df, 3245, "Taylor, Rod S|Taylor, RS")
keep_publications(df, 3245, taylor_rod)
keep_publications(df, 3246, taylor_robert)

# Sarah Bell and Sian de Bell
# NOTE(review): the author strings are stringified lists starting with "['",
# so the "^Bell, S" alternative can never match at the start - confirm intent.
bell_sarah = matching_positions(df, 2855, "Bell, SL|Bell, Sarah L|^Bell, S")
debell_sian = excluding(matching_positions(df, 2855, "de Bell"), bell_sarah)
keep_publications(df, 2855, bell_sarah)
keep_publications(df, 2919, debell_sian)

# Sigong Zhang
clear_publications(df, 767)

# Yang, Yaan, and Yinchen Liu
# All of Yaan Liu's articles are coauthored with Ghita
liu_yaan = matching_positions(df, 443, "Ghita")
liu_yang = remaining_positions(df, 443, liu_yaan)
keep_publications(df, 443, liu_yang)
keep_publications(df, 737, liu_yaan)
clear_publications(df, 1525)

# Yujia Zhu
clear_publications(df, 1080)

# Yuxiao, Yi, and Ying Wang
# All of Ying Wang's articles are coauthored with Dai, K
wang_ying = matching_positions(df, 762, "Dai, K")
keep_publications(df, 762, wang_ying)
clear_publications(df, 1224)
clear_publications(df, 1353)

# Xinqi and Xiang Li
clear_publications(df, 675)
clear_publications(df, 1524)

In [246]:
# A lot of duplicates remain, but these are too many to fix manually
# Rows sharing both surname and publication count are likely credited with the same records
same_name_and_count = df.duplicated(subset=['namelast', 'publications'], keep=False)
has_publications = df['publications'] > 0
df[same_name_and_count & has_publications].sort_values(by=['publications', 'namelast'])

Unnamed: 0,college,department,name,role,profile,namelast,namefirst,identifier1,identifier2,publications,authors,dates,titles,abstracts,urls,fields,types
1082,"College of Engineering, Mathematics and Physic...",EMPS,Manal Safar G Alghamdi,,https://emps.exeter.ac.uk/staff/mmma201,Alghamdi,Manal,"alghamdi, m","alghamdi, manal",1,"[['Nasir, SNFM', 'Ullah, H', 'Mutalib, MA', 'S...",[2021-12-07],[WTa37O95.487 Nanocatalyst for Pollutant Degra...,[The release of toxic industrial effluents has...,[https://ore.exeter.ac.uk/repository/handle/10...,[Engineering],[article]
1365,"College of Engineering, Mathematics and Physic...",Physics and Astronomy,Mashael Saeed S Alghamdi,Postgraduate,https://emps.exeter.ac.uk/physics-astronomy/st...,Alghamdi,Mashael,"alghamdi, m","alghamdi, mashael",1,"[['Nasir, SNFM', 'Ullah, H', 'Mutalib, MA', 'S...",[2021-12-07],[WTa37O95.487 Nanocatalyst for Pollutant Degra...,[The release of toxic industrial effluents has...,[https://ore.exeter.ac.uk/repository/handle/10...,[Engineering],[article]
696,"College of Engineering, Mathematics and Physic...",Engineering,Priscila Alves,Postdoctoral Research Associate,https://emps.exeter.ac.uk/engineering/staff/pa361,Alves,Priscila,"alves, p","alves, priscila",1,"[['Barros Ramalho Alves, P', 'Alves Rufino, IA...",[2020-01-16],[Land-Use and Legislation-Based Methodology fo...,"[In developing countries, the urbanisation pro...",[https://ore.exeter.ac.uk/repository/handle/10...,[Engineering],[article]
1122,"College of Engineering, Mathematics and Physic...",Engineering,Priscila Barros Ramalho Alves,Postgraduate Researcher,https://emps.exeter.ac.uk/engineering/staff/pr327,Alves,Priscila,"alves, p","alves, priscila",1,"[['Barros Ramalho Alves, P', 'Alves Rufino, IA...",[2020-01-16],[Land-Use and Legislation-Based Methodology fo...,"[In developing countries, the urbanisation pro...",[https://ore.exeter.ac.uk/repository/handle/10...,[Engineering],[article]
2292,College of Life and Environmental Sciences,Biosciences,Dr David Baker,Research Fellow,https://biosciences.exeter.ac.uk/staff/profile...,Baker,David,"baker, d","baker, david",1,"[['Turchin, P', 'Currie, TE', 'Whitehouse, H',...",[2018-01-09],[Quantitative historical analysis uncovers a s...,[Do human societies from around the world exhi...,[https://ore.exeter.ac.uk/repository/handle/10...,[Biosciences],[article]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,College of Humanities,Classics and Ancient History,Professor Neville Morley,,https://humanities.exeter.ac.uk/classics/staff...,Morley,Neville,"morley, n","morley, neville",25,"[['Morley, NDG'], ['Morley, NDG'], ['Morley, N...","[2017-11-01, 2017-09-27, 2020, 2020-02-21, 201...",[History Can't Always Help to Make Sense of th...,"[nan, This piece is a response to Coyne’s Dest...",[https://ore.exeter.ac.uk/repository/handle/10...,"[Classics and Ancient History, Classics and An...","[article, article, article, article, article, ..."
465,"College of Engineering, Mathematics and Physic...",Engineering,Professor Christopher Smith,Professor of Mechanical Engineering,https://emps.exeter.ac.uk/engineering/staff/cw...,Smith,Christopher,"smith, c","smith, christopher",26,"[['Amundsen, DS', 'Mayne, NJ', 'Baraffe, I', '...","[2016-08-26, 2015-11-15, 2015-01-01, 2018-01-0...",[The UK Met Office GCM with a sophisticated ra...,[To study the complexity of hot Jupiter atmosp...,[https://ore.exeter.ac.uk/repository/handle/10...,"[Physics and Astronomy, Computer Science, Engi...","[article, article, article, article, article, ..."
3225,College of Medicine and Health,Medicine and Health,Professor Richard Smith,Deputy Pro-Vice Chancellor and Professor of He...,https://medicine.exeter.ac.uk/people/profile/i...,Smith,Richard,"smith, r","smith, richard",26,"[['Reis, S', 'Steinle, S', 'Morris, G', 'Flemi...","[2013-10-04, 2016-06-06, 2017-10-16, 2018-04-2...",[Integrating Health & Environmental Impact Ana...,[Scientific investigations have progressively ...,[https://ore.exeter.ac.uk/repository/handle/10...,"[Institute of Health Research, Institute of Bi...","[article, article, article, article, article, ..."
838,"College of Engineering, Mathematics and Physic...",Mathematics,Dr Chen Wang,Postdoctoral Research Fellow,https://emps.exeter.ac.uk/mathematics/staff/cw880,Wang,Chen,"wang, c","wang, chen",30,"[['Zheng, G', 'Chu, C', 'Belavý, DL', 'Ibragim...","[2016-08-17, 2017-05-11, 2016-06, 2016-09-07, ...",[Evaluation and comparison of 3D intervertebra...,[The evaluation of changes in Intervertebral D...,[https://ore.exeter.ac.uk/repository/handle/10...,"[Physics and Astronomy, Computer Science, Engi...","[article, article, article, article, article, ..."


In [296]:
# Explode the data: one row per (staff member, publication); staff without
# matches keep a single row with empty publication fields
df["id"] = df.index
exploded = df.set_index(["id"]).apply(lambda column: column.explode())
df_long = exploded.reset_index()
# Drop the surrounding brackets of the stringified author list
df_long["authors"] = df_long["authors"].str.strip("[]")
df_long

Unnamed: 0,id,college,department,name,role,profile,namelast,namefirst,identifier1,identifier2,publications,authors,dates,titles,abstracts,urls,fields,types
0,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Fezzi, Carlo', 'Bateman, IJ'",2015-02-04,The Impact of Climate Change on Agriculture: N...,Ricardian (hedonic) analyses of the impact of ...,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article
1,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Harwood, Amii R.', 'Mace, Geor...",2013-10-25,Ecosystem services: response,,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article
2,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Agarwala, M', ""Bad'ura, T""",2014-01-08,Pollinator declines: Avoid pitfalls of consens...,,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article
3,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Harwood, Amii R.', 'Mace, Geor...",2013-07-05,Bringing ecosystem services into economic deci...,Landscapes generate a wide range of valuable e...,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article
4,0,Business School,Economics,Professor Ian Bateman,"Professor of Environmental Economics, Director...",https://business-school.exeter.ac.uk/about/peo...,Bateman,Ian,"bateman, i","bateman, ian",40,"'Bateman, IJ', 'Agarwala, M', 'Binner, A', 'Co...",2016-06-22,Spatially explicit integrated modeling and eco...,We present an integrated model of the direct c...,https://ore.exeter.ac.uk/repository/handle/108...,Economics,article
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31137,4135,College of Social Sciences and International S...,Strategy and Security,Tobias Borck,PhD Student,http://eprofile.exeter.ac.uk/tobiasborck,Borck,Tobias,"borck, t","borck, tobias",0,,,,,,,
31138,4136,College of Social Sciences and International S...,Strategy and Security,Maria Chiara Slucca,PhD Student,https://eprofile.exeter.ac.uk/mariachiaraslucca/,Slucca,Maria,"slucca, m","slucca, maria",0,,,,,,,
31139,4137,College of Social Sciences and International S...,Strategy and Security,Leanne Fuller,PhD Student,https://eprofile.exeter.ac.uk/leannefuller/,Fuller,Leanne,"fuller, l","fuller, leanne",0,,,,,,,
31140,4138,College of Social Sciences and International S...,Strategy and Security,Salem Osseiran,PhD Student,https://eprofile.exeter.ac.uk/salemosseiran/,Osseiran,Salem,"osseiran, s","osseiran, salem",0,,,,,,,


In [298]:
# Save the exploded staff-publication table (without the row index)
output_file = 'UoE_staff_publications_theses.csv'
df_long.to_csv(output_file, index=False)