# Import necessary packages

In [1]:
import pprint

import pandas as pd
import pybliometrics
from pybliometrics.scopus import AuthorRetrieval
import tqdm

# Initialize `pybibliometrics`

##### In order to run this, you will need a Scopus API key from here: https://dev.elsevier.com/

In [2]:
pybliometrics.scopus.init()

# Get random author and see their number of co-authors

In [3]:
# Dr. Clifton D. Fuller
sample_author_ID = 7202433367
sample_author = AuthorRetrieval(sample_author_ID)
print(f'{sample_author.given_name} {sample_author.surname} (ORCID = {sample_author.orcid}, H-index = {sample_author.h_index})')

Clifton David Fuller (ORCID = 0000-0002-5264-3994, H-index = 59)


# Print the number of co-authors

In [4]:
sample_author.coauthor_count

1925

# Get each of the co-authors
### *Note how only 160 at most are returned*

In [5]:
all_sample_coauthors = sample_author.get_coauthors()
print(len(all_sample_coauthors))

160


# One way, we must get all the co-authors the hard way...

##### 1. Go to author profile: https://www.scopus.com/authid/detail.uri?authorId=7202433367
##### 2. Click on the `Documents` tab
##### 3. Click on the `Export All` dropdown
##### 4. Select `CSV`
##### 5. De-Select everything except `Author(s)`
##### 6. Click on `Export`
##### 7. Save the document as `scopus_coauthors.csv` and place in this directory

In [6]:
temp_df = pd.read_csv('scopus_coauthors.csv')
display(temp_df.head())
all_coauthor_IDs = temp_df['Author(s) ID'].tolist()
all_coauthor_IDs = set([int(coauth.strip()) for paper in all_coauthor_IDs for coauth in paper.split(';')])
# Remove the sample author
all_coauthor_IDs.remove(sample_author_ID)
print(len(all_coauthor_IDs))

Unnamed: 0,Authors,Author full names,Author(s) ID,Link
0,Sherry A.D.; Lin T.A.; McCaw Z.R.; Beck E.J.; ...,"Sherry, Alexander D. (57205157045); Lin, Timot...",57205157045; 57202090381; 57190581018; 5826497...,https://www.scopus.com/inward/record.uri?eid=2...
1,Koutroumpakis E.; Mohamed A.S.R.; Chaftari P.;...,"Koutroumpakis, Efstratios (55661655500); Moham...",55661655500; 57206991491; 58662576000; 7403187...,https://www.scopus.com/inward/record.uri?eid=2...
2,Reber B.; Van Dijk L.; Anderson B.; Mohamed A....,"Reber, Brandon (58090241100); Van Dijk, Lisann...",58090241100; 57189041734; 57194441526; 5720699...,https://www.scopus.com/inward/record.uri?eid=2...
3,Wahid K.A.; Cardenas C.E.; Marquez B.; Nethert...,"Wahid, Kareem A. (56531684600); Cardenas, Carl...",56531684600; 57191960444; 57412736600; 5719444...,https://www.scopus.com/inward/record.uri?eid=2...
4,Andrearczyk V.; Oreiller V.; Abobakr M.; Akhav...,"Andrearczyk, Vincent (57190974414); Oreiller, ...",57190974414; 57217480393; 57205445488; 5720793...,https://www.scopus.com/inward/record.uri?eid=2...


1925


# Instead, an easier way, look at all the publications and extract the authors

In [7]:
# Get all publications
all_sample_pubs = sample_author.get_documents()

all_coauthor_IDs = []
for pub in all_sample_pubs:
    for coauthor_id in pub.author_ids.split(';'):
        coauthor_id = int(coauthor_id)
        if coauthor_id not in all_coauthor_IDs:
            all_coauthor_IDs.append(coauthor_id)

print(len(all_coauthor_IDs))

1903


# Get relevant information from the co-authors

In [8]:
all_author_info = []
for temp_scopus_id in tqdm.tqdm(all_coauthor_IDs):
    temp_author = AuthorRetrieval(temp_scopus_id)
    temp_given_name = temp_author.given_name
    temp_indexed_name = temp_author.indexed_name,
    if type(temp_indexed_name) == tuple:
        temp_indexed_name = list(temp_indexed_name)[0]
    try:
        temp_department = temp_author.affiliation_current[0].preferred_name,
        if type(temp_department) == tuple:
            temp_department = list(temp_department)[0]
    except TypeError:
        temp_department = ''
    try:
        temp_institution = temp_author.affiliation_current[0].parent_preferred_name
    except TypeError:
        temp_institution = ''
    if not temp_institution:
        temp_institution = temp_department
    all_author_info.append([
        f'{temp_indexed_name}, {temp_given_name}',
        # temp_department,
        temp_institution
    ])

pprint.pprint(all_author_info[:10])

100%|██████████| 1903/1903 [00:15<00:00, 125.95it/s]

[['Wahid K., Kareem A.', 'The University of Texas MD Anderson Cancer Center'],
 ['Kaffey Z., Zaphanlene Y.',
  'The University of Texas MD Anderson Cancer Center'],
 ['Farris D., David P.', 'The University of Texas MD Anderson Cancer Center'],
 ['Humbert-Vidan L., Laia',
  'The University of Texas MD Anderson Cancer Center'],
 ['Moreno A., Amy Catherine',
  'The University of Texas MD Anderson Cancer Center'],
 ['Rasmussen M., Mathis Ersted', 'Aarhus Universitetshospital'],
 ['Ren J., Jintao', 'Aarhus Universitet'],
 ['Naser M., Mohamed A.', 'The University of Texas MD Anderson Cancer Center'],
 ['Netherton T., Tucker J.',
  'The University of Texas MD Anderson Cancer Center'],
 ['Korreman S., Stine Sofia', 'Aarhus Universitet']]





# Save the results to a CSV file

In [9]:
output_df = pd.DataFrame(
    data = all_author_info,
    columns = ['Last Name, First Name', 'Institution']
)
output_df.to_csv('co-author_affiliations.csv', index=None)