**Authors:** <font color=brown>Jozef Hanč & his students</font>  <br>
[Faculty of Science](https://www.upjs.sk/en/faculty-of-science/?prefferedLang=EN) *P.J. Šafárik University in Košice, Slovakia* <br> 
email: [jozef.hanc@upjs.sk](mailto:jozef.hanc@upjs.sk)  
**Document:** Data analysis for AIP publishing house  
**Date**: July 2022

# Scopus <font color =brown> author search </font> <br> via library `pybliometrics`

https://pypi.org/project/pybliometrics/

* Rose, Michael E. and John R. Kitchin: [pybliometrics: Scriptable bibliometrics using a Python interface to Scopus](https://github.com/pybliometrics-dev/pybliometrics/blob/master/meta/1-s2.0-S2352711019300573-main.pdf), SoftwareX 10 (2019) 100263.

## Python functions for Scopus

In [1]:
# Python functions using Pybliometrics
import pandas as pd

from PybliometricsScopus import *

## Example

In [2]:
# example author to look up in Scopus
firstname = 'Marián'
lastname = 'Kireš'

In [3]:
# preview the Scopus query string built from the last name and the first initial
ScopusAuthQuery(firstname, lastname, initial=True)

'AUTHLASTNAME("kires") AND AUTHFIRST(m)'

In [4]:
# run the author search with the initial-based query
initial_query = ScopusAuthQuery(firstname, lastname, initial=True)
a = AuthorSearch(initial_query)
len(a.authors)  # how many author profiles matched

1

In [5]:
# take the first search hit and show its attributes as a table
short = a.authors[0]
show_colwidth(dattr(short,'parameters'))

Unnamed: 0_level_0,value
attribute,Unnamed: 1_level_1
affiliation,Pavol Jozef Safarik University in Kosice
affiliation_id,60031236
areas,COMP (19); PHYS (18); SOCI (14)
city,Kosice
country,Slovakia
documents,28
eid,9-s2.0-23485508800
givenname,Marián
initials,M.
orcid,0000-0003-3857-6382


In [6]:
# retrieve the author profile for the hit's EID and show its attributes as a table
author = AuthorRetrieval(short.eid)
show_colwidth(dattr(author,'parameters'))

Unnamed: 0_level_0,value
attribute,Unnamed: 1_level_1
affiliation_current,"[(112965765, 60031236, dept, author, None, Faculty of Science, Pavol Jozef Safarik University in Kosice, svk, Slovakia, Srobárova 2, Kosice, Kosic..."
affiliation_history,"[(112965765, 60031236, dept, author, None, Faculty of Science, Pavol Jozef Safarik University in Kosice, svk, Slovakia, Srobárova 2, Kosice, Kosic..."
alias,
citation_count,46
cited_by_count,46
classificationgroup,"[(3100, 15), (1708, 4), (1706, 7), (2208, 2), (3103, 2), (3101, 1), (3304, 14), (1709, 1), (2214, 3), (1705, 7)]"
coauthor_count,30
coauthor_link,http://api.elsevier.com/content/search/author?co-author=23485508800
date_created,"(2008, 2, 21)"
document_count,28


In [7]:
# summary built from the search hit and the retrieved profile, shown as a single 'scopus' column
DF(ScopusDataLong(short, author),colname='scopus')

Unnamed: 0,scopus
scopus-id,23485508800
first name,Marián
last name,Kireš
full name,Marián Kireš
affiliation,Pavol Jozef Safarik University in Kosice
location,"Kosice, Slovakia"
areas,COMP (19); PHYS (18); SOCI (14)
documents,28
citations,46
h-index,3


# Scopus for conference

## parameters

In [8]:
# Conference label; it is part of both the input workbook name below and the
# output workbook name used when saving the results.
conference = 'DIDSCI+2022'
# Author list of the conference: sheet 'authors', first column used as the index.
# These two assignments must be live code: `da` and `conference` are used by the
# following cells, so a fresh kernel would otherwise stop with a NameError.
da = pd.read_excel('../'+conference+'.xlsx', sheet_name='authors', index_col=0)

In [9]:
# preview the conference author list
da

Unnamed: 0,full name,last name,first name,ascii name
0,Miroslav Almáši,Almáši,Miroslav,m almasi
1,Mária Babinčáková,Babinčáková,Mária,m babincakova
2,Anna Baprowska,Baprowska,Anna,a baprowska
3,Pavel Beneš,Beneš,Pavel,p benes
4,Pawel Bernard,Bernard,Pawel,p bernard
...,...,...,...,...
93,Ľubomír Šnajder,Šnajder,Ľubomír,l snajder
94,Martin Šrámek,Šrámek,Martin,m sramek
95,Alena Šrámová,Šrámová,Alena,a sramova
96,Jitka Štrofová,Štrofová,Jitka,j strofova


## basic search

In [10]:
# layout of the result tables: the per-author summary fields plus
# 'multiplicity' (how many profiles the author's query matched)
cols = [
    'scopus-id',
    'first name',
    'last name',
    'full name',
    'affiliation',
    'location',
    'areas',
    'documents',
    'citations',
    'h-index',
    'multiplicity',
]
# empty table that the full-name search fills in
ds = pd.DataFrame(columns=cols)

In [11]:
# result tables keyed by the name they are later saved under
dfs = {}

In [12]:
# finding scopus data for all authors (query: last name + full first name)
morentries = dict()   # idx -> number of hits, for queries with more than 5 hits
notfound = []         # idx of authors whose query ended in a TypeError
records = []          # one summary record per matched Scopus profile

for idx, row in da.iterrows():
    # normalise both names and build the query once per author
    lastname = clean_str(row['last name'], case='lower', unicode=False)
    firstname = clean_str(row['first name'], case='lower', unicode=False)
    query = ScopusAuthQuery(firstname, lastname)
    try:
        # find an author in Scopus
        a = AuthorSearch(query)
        # len() raises TypeError when `a.authors` is not a sequence; presumably
        # it is None for an empty result (TODO confirm) -> handled as "not found"
        multiplicity = len(a.authors)
        print(f'{idx}, m={multiplicity}, {query}')
        if multiplicity > 5:
            # too many candidates; only remember the count for a separate table
            morentries[idx] = multiplicity
        else:
            # keep every candidate profile, tagged with the number of candidates
            for short in a.authors:
                author = AuthorRetrieval(short.eid)
                data = ScopusDataLong(short, author)
                data['multiplicity'] = multiplicity
                records.append(data)
    except TypeError:
        notfound.append(idx)
        print(f'{idx}, not found, {query}')

# DataFrame.append no longer exists in pandas >= 2.0, and appending row by row
# copies the frame every time; add all collected rows with a single concat.
# NOTE(review): assumes ScopusDataLong returns a dict or Series -- confirm.
if records:
    ds = pd.concat([ds, pd.DataFrame(records)], ignore_index=True)

0, m=1, AUTHLASTNAME("almasi") AND AUTHFIRST(miroslav)
1, m=1, AUTHLASTNAME("babincakova") AND AUTHFIRST(maria)
2, not found, AUTHLASTNAME("baprowska") AND AUTHFIRST(anna)
3, m=6, AUTHLASTNAME("benes") AND AUTHFIRST(pavel)
4, m=1, AUTHLASTNAME("bernard") AND AUTHFIRST(pawel)
5, m=1, AUTHLASTNAME("berta") AND AUTHFIRST(tunde)
6, m=3, AUTHLASTNAME("bilek") AND AUTHFIRST(martin)
7, not found, AUTHLASTNAME("borovsky") AND AUTHFIRST(dominik)
8, m=1, AUTHLASTNAME("brestenska") AND AUTHFIRST(beata)
9, m=1, AUTHLASTNAME("bros") AND AUTHFIRST(pawel)
10, m=2, AUTHLASTNAME("burcak") AND AUTHFIRST(m)
11, not found, AUTHLASTNAME("chlebounova") AND AUTHFIRST(irena)
12, m=1, AUTHLASTNAME("chomca") AND AUTHFIRST(i)
13, m=2, AUTHLASTNAME("chroustova") AND AUTHFIRST(katerina)
14, not found, AUTHLASTNAME("coufalova") AND AUTHFIRST(stepanka)
15, m=3, AUTHLASTNAME("danielovic") AND AUTHFIRST(i)
16, not found, AUTHLASTNAME("dinajova") AND AUTHFIRST(j)
17, m=1, AUTHLASTNAME("fancovicova") AND AUTHFIRST(jana)

In [13]:
# order the matches by multiplicity (unambiguous authors first), then by last name
ds = ds.sort_values(by=['multiplicity','last name'])
ds

Unnamed: 0,scopus-id,first name,last name,full name,affiliation,location,areas,documents,citations,h-index,multiplicity
0,55387563000,Miroslav,Almáši,Miroslav Almáši,Pavol Jozef Safarik University in Kosice,"Kosice, Slovakia",CHEM (49); MATE (20); PHYS (18),50,439,17,1
73,57103742800,Zuzanna,Arki,Zuzanna Arki,Selye János University,"Komarom, Slovakia",SOCI (2),2,1,1,1
1,57214936653,Mária,Babinčáková,Mária Babinčáková,Pavol Jozef Safarik University in Kosice,"Kosice, Slovakia",SOCI (6); COMP (3); CHEM (3),6,43,3,1
2,55711540500,Paweł,Bernard,Paweł Bernard,Uniwersytet Jagielloński,"Krakow, Poland",CHEM (14); SOCI (13); CENG (2),18,105,6,1
3,55386813800,Tünde,Berta,Tünde Berta,Selye János University,"Komarom, Slovakia",MATH (2); SOCI (1); COMP (1),2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
86,57126034000,Martin,Zacek,Martin Zacek,Ceské vysoké ucení technické v Praze,"Prague, Czech Republic",ENGI (2); PHYS (2),1,3,1,5
82,55835862200,Martin,Žáček,Martin Žáček,Division University of Ostrava,"Ostrava, Czech Republic",COMP (27); PHYS (24); MATH (10),49,70,6,5
83,54895378900,Martin,Žáček,Martin Žáček,Academy of Sciences of the Czech Republic,"Prague, Czech Republic",MATE (7); ENGI (4); PHYS (3),6,25,3,5
84,56901949400,Martin,Žáček,Martin Žáček,Institute of Hydrodynamics of the Academy of S...,"Prague, Czech Republic",BIOC (2); MEDI (1),3,2,1,5


In [14]:
# for every stored profile, build the full-name query from its own first/last name
ds['query'] = [
    ScopusAuthQuery(first, last)
    for first, last in zip(ds['first name'], ds['last name'])
]

In [15]:
# keep the full-name matches for the final export
dfs['found - full name'] = ds

## multiplicity > 5

In [16]:
# authors whose full-name query matched more than 5 profiles
# (.loc gets a list of labels because pandas >= 2.0 rejects a dict indexer;
#  .copy() makes dv an independent frame before new columns are added)
dv = da.loc[list(morentries)].copy()
# number of profiles each of these queries returned
dv['multiplicity'] = [morentries[idx] for idx in dv.index]
# the query string, kept so the search can be repeated by hand
dv['query'] = [ScopusAuthQuery(firstname, lastname) for firstname, lastname in dv[['first name','last name']].values]
dv

Unnamed: 0,full name,last name,first name,ascii name,multiplicity,query
3,Pavel Beneš,Beneš,Pavel,p benes,6,"AUTHLASTNAME(""benes"") AND AUTHFIRST(pavel)"
18,Zoltán Fehér,Fehér,Zoltán,z feher,8,"AUTHLASTNAME(""feher"") AND AUTHFIRST(zoltan)"
20,Michael Fuchs,Fuchs,Michael,m fuchs,60,"AUTHLASTNAME(""fuchs"") AND AUTHFIRST(michael)"
29,Jan Hrdlička,Hrdlička,Jan,j hrdlicka,6,"AUTHLASTNAME(""hrdlicka"") AND AUTHFIRST(jan)"
91,Petr Šmejkal,Šmejkal,Petr,p smejkal,6,"AUTHLASTNAME(""smejkal"") AND AUTHFIRST(petr)"
94,Martin Šrámek,Šrámek,Martin,m sramek,7,"AUTHLASTNAME(""sramek"") AND AUTHFIRST(martin)"


In [17]:
# keep the over-limit authors for the final export
dfs['multiplicity > 5'] = dv

## not found

In [18]:
# rows of the author list whose full-name query was recorded in `notfound`
dn = da.loc[notfound]
# empty result table with the same columns as ds
dns = pd.DataFrame(columns = cols)

In [19]:
# less restrictive searching: retry the unmatched authors with the first initial only
moreinicials = dict()   # idx -> number of hits, for queries with more than 5 hits
manually = []           # idx of authors whose retry ended in a TypeError
initial_records = []    # one summary record per matched Scopus profile

for idx, row in dn.iterrows():
    # normalise both names
    lastname = clean_str(row['last name'], case='lower', unicode=False)
    firstname = clean_str(row['first name'], case='lower', unicode=False)
    # built before the try block so the except branch always prints the
    # query of the current author, never one left over from an earlier row
    query = ScopusAuthQuery(firstname, lastname, initial=True)
    # find in Scopus
    try:
        a = AuthorSearch(query)
        # len() raises TypeError when `a.authors` is not a sequence; presumably
        # it is None for an empty result (TODO confirm) -> handled as "not found"
        multiplicity = len(a.authors)
        print(f'{idx}, m={multiplicity}, {query}')
        if multiplicity > 5:
            # too many candidates; only remember the count for a separate table
            moreinicials[idx] = multiplicity
        else:
            # keep every candidate profile, tagged with the number of candidates
            for short in a.authors:
                author = AuthorRetrieval(short.eid)
                data = ScopusDataLong(short, author)
                data['multiplicity'] = multiplicity
                initial_records.append(data)
    except TypeError:
        manually.append(idx)
        print(f'{idx}, not found, {query}')

# DataFrame.append no longer exists in pandas >= 2.0, and appending row by row
# copies the frame every time; add all collected rows with a single concat.
# NOTE(review): assumes ScopusDataLong returns a dict or Series -- confirm.
if initial_records:
    dns = pd.concat([dns, pd.DataFrame(initial_records)], ignore_index=True)

2, not found, AUTHLASTNAME("baprowska") AND AUTHFIRST(a)
7, m=8, AUTHLASTNAME("borovsky") AND AUTHFIRST(d)
11, not found, AUTHLASTNAME("chlebounova") AND AUTHFIRST(i)
14, m=3, AUTHLASTNAME("coufalova") AND AUTHFIRST(s)
16, not found, AUTHLASTNAME("dinajova") AND AUTHFIRST(j)
19, m=1, AUTHLASTNAME("feltl") AND AUTHFIRST(t)
22, not found, AUTHLASTNAME("gorskis") AND AUTHFIRST(m)
28, m=1, AUTHLASTNAME("horniakova") AND AUTHFIRST(m)
30, not found, AUTHLASTNAME("jaruska") AND AUTHFIRST(l)
32, m=3, AUTHLASTNAME("juhasova") AND AUTHFIRST(a)
36, not found, AUTHLASTNAME("kohutiarova") AND AUTHFIRST(v)
39, m=2, AUTHLASTNAME("konopa") AND AUTHFIRST(m)
40, m=4, AUTHLASTNAME("konicek") AND AUTHFIRST(l)
43, m=4, AUTHLASTNAME("kozakova") AND AUTHFIRST(p)
45, not found, AUTHLASTNAME("kricfalusi") AND AUTHFIRST(d)
46, m=3, AUTHLASTNAME("krizanova") AND AUTHFIRST(m)
49, not found, AUTHLASTNAME("krapacek") AND AUTHFIRST(r)
54, not found, AUTHLASTNAME("majzelova") AND AUTHFIRST(l)
55, not found, AUTHLASTN

In [20]:
# candidate profiles found with the initial-based query
dns

Unnamed: 0,scopus-id,first name,last name,full name,affiliation,location,areas,documents,citations,h-index,multiplicity
0,35169050700,Stanislava,Coufalová,Stanislava Coufalová,,"None, None",BIOC (3),2,2,1,3
1,6507642349,S.,Coufalová,S. Coufalová,,"None, None",MEDI (1),1,0,0,3
2,55296784200,S.,Coufalova,S. Coufalova,Masarykova Univerzita,"Brno, Czech Republic",MEDI (1),1,0,0,3
3,6505992185,T.,Feltl,T. Feltl,"University of Chemistry and Technology, Prague","Prague, Czech Republic",AGRI (1); BIOC (1),2,40,2,1
4,6603388883,M.,Horniaková,M. Horniaková,Ustav patologickej anatómie LF UK a FN,"Bratislava, Slovakia",MEDI (9),9,0,0,1
5,36241546900,Ana,Juhásová,Ana Juhásová,BIREGAL S.r.o.,"Bratislava, Slovakia",COMP (18); SOCI (6); MATH (1),7,26,3,3
6,56416527300,Anna,Juhásová,Anna Juhásová,Univerzita Komenského v Bratislave,"Bratislava, Slovakia",BIOC (10); CHEM (4); PHAR (2),5,12,2,3
7,57192660823,Andrea,Juhásová,Andrea Juhásová,Constantine the Philosopher University,"Nitra, Slovakia",SOCI (1),1,0,0,3
8,57219601485,Michal,Konopa,Michal Konopa,Jihoceská Univerzita v Ceských Budejovicích,"České Budějovice, Czech Republic",DECI (3); COMP (3); ARTS (1),2,3,1,2
9,57204022884,Maciej,Konopa,Maciej Konopa,Laboratorium Badan Napedow Lotniczych,"Zielonka, Poland",ENGI (1),1,4,1,2


In [21]:
# keep the initial-based matches for the final export
# NOTE(review): the key is spelled 'inicial'; left unchanged because it is a runtime string
dfs['found - inicial'] = dns

In [22]:
# authors whose initial-based query matched more than 5 profiles
# (.loc gets a list of labels because pandas >= 2.0 rejects a dict indexer)
dvi = da.loc[list(moreinicials)]
dvi

Unnamed: 0,full name,last name,first name,ascii name
7,Dominik Borovský,Borovský,Dominik,d borovsky
64,Markéta Píšová,Píšová,Markéta,m pisova


In [23]:
# keep the over-limit authors of the initial-based search for the final export
dfs['initials > 5'] = dvi

## not found via initials

In [24]:
# rows of the author list recorded in `manually` (no result for the initial-based query either)
dm = da.loc[manually]
dm

Unnamed: 0,full name,last name,first name,ascii name
2,Anna Baprowska,Baprowska,Anna,a baprowska
11,Irena Chlebounová,Chlebounová,Irena,i chlebounova
16,J Dinajová,Dinajová,J,j dinajova
22,Mihails Gorskis,Gorskis,Mihails,m gorskis
30,Ladislav Jaruska,Jaruska,Ladislav,l jaruska
36,Viktória Kohutiarová,Kohutiarová,Viktória,v kohutiarova
45,Dana Kričfaluši,Kričfaluši,Dana,d kricfalusi
49,Richard Křapáček,Křapáček,Richard,r krapacek
54,Lea Majzelová,Majzelová,Lea,l majzelova
55,P Marcinov,Marcinov,P,p marcinov


In [25]:
# keep the still-unmatched authors for the final export
dfs['manually'] = dm

# Saving

In [26]:
# split the results dict into two parallel lists: names and their tables
sheetnames = [name for name in dfs]
dataframes = [dfs[name] for name in sheetnames]

In [27]:
# hand all result tables and their names to save_xls; the target file name is
# built from `conference`, which must already be defined in the kernel
save_xls(dataframes, '../'+conference+'-scopus-AuthorSearch.xlsx', sheetnames)