This notebook is dedicated to the retrieval of data from the Scopus API. The cross-reference of data from Web of Science and Scopus will allow the the unambiguation of authors and better quality control. 

In [24]:
import pandas as pd
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import CitationOverview
from tqdm import tqdm

# Retraction Watch

In [25]:
rw = pd.read_excel('./retractions_data/retraction_watch_database.xlsx')
rw.head()

Unnamed: 0,Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,RetractionDate,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes
0,47271,Binding of DCC by Netrin-1 to Mediate Axon Gui...,(BLS) Biology - Cellular;(BLS) Biology - Gener...,Departments of Anatomy and of Biochemistry and...,Science,American Association for the Advancement of Sc...,United States,Elke Stein;Yimin Zou;Mu-ming Poo;Marc Tessier-...,https://retractionwatch.com/2023/08/31/stanfor...,Research Article;,2023-08-31 00:00:00,10.1126/science.adk1521,0.0,2001-03-09 00:00:00,10.1126/science.1059391,11239160.0,Retraction,+Investigation by Company/Institution;+Manipul...,No,
1,47270,Hierarchical Organization of Guidance Receptor...,(BLS) Biochemistry;(BLS) Biology - General;(BL...,Department of Anatomy and Department of Bioche...,Science,American Association for the Advancement of Sc...,United States,Elke Stein;Marc Tessier-Lavigne,https://retractionwatch.com/2023/08/31/stanfor...,Research Article;,2023-08-31 00:00:00,10.1126/science.adk1517,0.0,2001-02-08 00:00:00,10.1126/science.1058445,11239147.0,Retraction,+Duplication of Image;+Investigation by Compan...,No,
2,47243,Therapeutic potential of targeting IRES-depend...,(BLS) Biochemistry;(BLS) Biology - Cancer;(BLS...,"Division of Hematology-Oncology, UCLA-Greater ...",Oncogene,Springer - Nature Publishing Group,United States,Y Shi;Y Yang;C Bardeleben;B Holmes;J Gera;Alan...,,Research Article;,2023-08-31 00:00:00,10.1038/s41388-023-02820-5,0.0,2015-05-11 00:00:00,10.1038/onc.2015.156,25961916.0,Retraction,+Concerns/Issues About Data;+Concerns/Issues A...,No,see also: https://pubpeer.com/publications/704...
3,47233,A classifier based on 273 urinary peptides pre...,(BLS) Biochemistry;(HSC) Medicine - Cardiovasc...,"Department of Nephrology, The Third Affiliated...",Journal of Hypertension,Wolters Kluwer - Lippincott Williams & Wilkins,China,Lirong Lin;Chunxuan Wang;Jiangwen Ren;Mei Mei;...,,Research Article;,2023-08-30 00:00:00,10.1097/HJH.0000000000003551,37642599.0,2023-08-01 00:00:00,10.1097/HJH.0000000000003467,37199562.0,Retraction,+Concerns/Issues About Results;+Investigation ...,No,see also https://journals.lww.com/jhypertensio...
4,47227,"Age, Gender Demographics and Comorbidity Preva...",(HSC) Biostatistics/Epidemiology;(HSC) Medicin...,"Department of Orthopaedics, Dhanalakshmi Srini...",Journal of Coastal Life Medicine,Journal of Coastal Life Medicine,India,S Venkatesh Kumar;Mohith Singh;Gowtham Singh;K...,,Research Article;,2023-08-30 00:00:00,unavailable,0.0,2023-01-01 00:00:00,unavailable,0.0,Retraction,+Notice - Lack of;+Withdrawal;,No,"date of retraction unknown, article title repl..."


In [26]:
rw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42700 entries, 0 to 42699
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Record ID              42700 non-null  int64  
 1   Title                  42700 non-null  object 
 2   Subject                42700 non-null  object 
 3   Institution            42699 non-null  object 
 4   Journal                42700 non-null  object 
 5   Publisher              42700 non-null  object 
 6   Country                42700 non-null  object 
 7   Author                 42700 non-null  object 
 8   URLS                   21687 non-null  object 
 9   ArticleType            42700 non-null  object 
 10  RetractionDate         42700 non-null  object 
 11  RetractionDOI          42209 non-null  object 
 12  RetractionPubMedID     37599 non-null  float64
 13  OriginalPaperDate      42700 non-null  object 
 14  OriginalPaperDOI       40173 non-null  object 
 15  Or

In [27]:
rw['RetractionDate'] = pd.to_datetime(rw['RetractionDate'], errors='coerce') #, infer_datetime_format=True
rw['OriginalPaperDate'] = pd.to_datetime(rw['OriginalPaperDate'])

# Journals to consider

In [28]:
journals = pd.read_csv('../scimagojr_2022.csv', sep=';')
journals

Unnamed: 0,Rank,Sourceid,Title,Type,Issn,SJR,SJR Best Quartile,H index,Total Docs. (2022),Total Docs. (3years),...,Total Cites (3years),Citable Docs. (3years),Cites / Doc. (2years),Ref. / Doc.,Country,Region,Publisher,Coverage,Categories,Areas
0,1,28773,Ca-A Cancer Journal for Clinicians,journal,"15424863, 00079235",86091,Q1,198,44,118,...,30318,85,29999,9700,United States,Northern America,Wiley-Blackwell,1950-2022,Hematology (Q1); Oncology (Q1),Medicine
1,2,29431,Quarterly Journal of Economics,journal,"00335533, 15314650",36730,Q1,292,36,122,...,2141,122,1483,6661,United Kingdom,Western Europe,Oxford University Press,1886-2022,Economics and Econometrics (Q1),"Economics, Econometrics and Finance"
2,3,20315,Nature Reviews Molecular Cell Biology,journal,"14710072, 14710080",34201,Q1,485,121,328,...,13331,156,3547,8929,United Kingdom,Western Europe,Nature Publishing Group,2000-2022,Cell Biology (Q1); Molecular Biology (Q1),"Biochemistry, Genetics and Molecular Biology"
3,4,18434,Cell,journal,"00928674, 10974172",26494,Q1,856,420,1637,...,67791,1440,4380,6574,United States,Northern America,Cell Press,1974-2022,"Biochemistry, Genetics and Molecular Biology (...","Biochemistry, Genetics and Molecular Biology"
4,5,15847,New England Journal of Medicine,journal,"00284793, 15334406",26015,Q1,1130,1410,4561,...,133956,1854,3393,1021,United States,Northern America,Massachussetts Medical Society,1945-2022,Medicine (miscellaneous) (Q1),Medicine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18031,18032,17500154901,Progress in Molecular Biology and Translationa...,journal,"18771173, 18780814",,-,110,90,314,...,1170,0,272,10888,Netherlands,Western Europe,Academic Press Inc.,2008-2022,Molecular Biology; Molecular Medicine,"Biochemistry, Genetics and Molecular Biology"
18032,18033,25192,Reviews of Environmental Contamination and Tox...,journal,01795953,,-,94,22,99,...,441,0,440,15168,United States,Northern America,Springer New York,1987-2022,"Health, Toxicology and Mutagenesis; Medicine (...",Environmental Science; Medicine
18033,18034,5700155185,Voprosy Istorii (discontinued),journal,00428779,,-,5,168,1316,...,38,1316,003,796,Russian Federation,Eastern Europe,"Rossiiskaya Akademiya Nauk, Institut Istorii (...","1965, 1972, 1975, 1978-1982, 1985, 1988, 1999-...",History; Medicine (miscellaneous),Arts and Humanities; Medicine
18034,18035,21100873483,Wisdom (discontinued),journal,18293824,,-,7,66,180,...,76,180,045,2561,Armenia,Eastern Europe,Khachatur Abovyan Armenian State Pedagogical U...,2018-2022,Philosophy,Arts and Humanities


In [29]:
# Calculate the threshold for the top 10%
threshold = int(0.10 * len(journals))

# Select the top 10% of rows
top_10_percent = journals.head(threshold)
top_10_percent

Unnamed: 0,Rank,Sourceid,Title,Type,Issn,SJR,SJR Best Quartile,H index,Total Docs. (2022),Total Docs. (3years),...,Total Cites (3years),Citable Docs. (3years),Cites / Doc. (2years),Ref. / Doc.,Country,Region,Publisher,Coverage,Categories,Areas
0,1,28773,Ca-A Cancer Journal for Clinicians,journal,"15424863, 00079235",86091,Q1,198,44,118,...,30318,85,29999,9700,United States,Northern America,Wiley-Blackwell,1950-2022,Hematology (Q1); Oncology (Q1),Medicine
1,2,29431,Quarterly Journal of Economics,journal,"00335533, 15314650",36730,Q1,292,36,122,...,2141,122,1483,6661,United Kingdom,Western Europe,Oxford University Press,1886-2022,Economics and Econometrics (Q1),"Economics, Econometrics and Finance"
2,3,20315,Nature Reviews Molecular Cell Biology,journal,"14710072, 14710080",34201,Q1,485,121,328,...,13331,156,3547,8929,United Kingdom,Western Europe,Nature Publishing Group,2000-2022,Cell Biology (Q1); Molecular Biology (Q1),"Biochemistry, Genetics and Molecular Biology"
3,4,18434,Cell,journal,"00928674, 10974172",26494,Q1,856,420,1637,...,67791,1440,4380,6574,United States,Northern America,Cell Press,1974-2022,"Biochemistry, Genetics and Molecular Biology (...","Biochemistry, Genetics and Molecular Biology"
4,5,15847,New England Journal of Medicine,journal,"00284793, 15334406",26015,Q1,1130,1410,4561,...,133956,1854,3393,1021,United States,Northern America,Massachussetts Medical Society,1945-2022,Medicine (miscellaneous) (Q1),Medicine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1798,1799,19900192175,Journal of Topology,journal,"17538416, 17538424",1575,Q1,28,56,119,...,137,119,111,3668,United Kingdom,Western Europe,John Wiley and Sons Ltd,2010-2022,Geometry and Topology (Q1),Mathematics
1799,1800,21100875643,Materials Chemistry Frontiers,journal,20521537,1575,Q1,70,367,1245,...,8782,1233,721,6392,United Kingdom,Western Europe,Royal Society of Chemistry,2017-2022,Materials Chemistry (Q1); Materials Science (m...,Materials Science
1800,1801,15061,Agricultural Systems,journal,"0308521X, 18732267",1574,Q1,126,166,607,...,4408,601,702,7249,United Kingdom,Western Europe,Elsevier BV,1976-2022,Agronomy and Crop Science (Q1); Animal Science...,Agricultural and Biological Sciences
1801,1802,26112,Frontiers of Hormone Research,journal,03013073,1574,Q1,42,0,45,...,156,41,092,000,Switzerland,Western Europe,S. Karger AG,"1975, 1977, 1984, 1996-1997, 1999-2002, 2004-2...","Endocrinology (Q1); Endocrinology, Diabetes an...","Biochemistry, Genetics and Molecular Biology; ..."


In [30]:
top_10_percent['Issn']

0       15424863, 00079235
1       00335533, 15314650
2       14710072, 14710080
3       00928674, 10974172
4       00284793, 15334406
               ...        
1798    17538416, 17538424
1799              20521537
1800    0308521X, 18732267
1801              03013073
1802    14617307, 13505076
Name: Issn, Length: 1803, dtype: object

In [31]:
top_10_percent[top_10_percent['Issn'].str.contains(',')]['Issn']

0       15424863, 00079235
1       00335533, 15314650
2       14710072, 14710080
3       00928674, 10974172
4       00284793, 15334406
               ...        
1795    15565653, 00150282
1796    19360533, 19360541
1798    17538416, 17538424
1800    0308521X, 18732267
1802    14617307, 13505076
Name: Issn, Length: 1191, dtype: object

In [32]:
top_10_percent[top_10_percent['Issn'].str.count(',') > 1]['Issn']

143     15383598, 00987484, 00029955
263     03785912, 01662236, 1878108X
304     13624326, 03765067, 09680004
1545    08203946, 00084409, 14882329
Name: Issn, dtype: object

In [33]:
journals = top_10_percent['Issn'][0:100].values
journals

array(['15424863, 00079235', '00335533, 15314650', '14710072, 14710080',
       '00928674, 10974172', '00284793, 15334406', '1546170X, 10788956',
       '10575987, 15458601', '15461696, 10870156', '20588437', '00028282',
       '1474175X, 14741768', '14764687, 00280836', '00223808, 1537534X',
       '00346861, 15390756', '20587546', '14710056, 14710064',
       '14741784, 14741776', '15206890, 00092665', '14741741, 14741733',
       '01492195, 1545861X', '10614036, 15461718', '00018392, 19303815',
       '19416520, 19416067', '00221082, 15406261', '15453278, 07320582',
       '15458636, 15460738', '03060012, 14604744', '17594782, 17594774',
       '10974180, 10747613', '01406736, 1474547X', '00346527, 1467937X',
       '15487091, 15487105', '17238617', '1553877X', '14764660, 14761122',
       '19358245, 19358237', '15221210, 00319333', '10959203, 00368075',
       '25201158', '17483387, 17483395', '00129682, 14680262', '00220515',
       '19457790, 19457782', '15356108, 18783686', '175

# Scopus Search

In [34]:
def scopus_search(start_journal, end_journal):
    df = pd.DataFrame()
    journals = top_10_percent['Issn'][start_journal:end_journal].values
    
    for issn in tqdm(journals): 
        issn_list = issn.replace(" ", "").split(',')
        comma_count = issn.count(',')
        if comma_count == 0:
            s = ScopusSearch(f'ISSN ( "{issn_list[0]}" ) AND ( LIMIT-TO ( DOCTYPE , "tb" ) )') 
        elif comma_count == 1:
            s = ScopusSearch(f'(ISSN ( "{issn_list[0]}" ) OR ISSN ( "{issn_list[1]}" ))  AND ( LIMIT-TO ( DOCTYPE , "tb" ) )') 
        else:
            s = ScopusSearch(f'(ISSN ( "{issn_list[0]}" ) OR ISSN ( "{issn_list[1]}" ) OR ISSN ( "{issn_list[2]}" ))  AND ( LIMIT-TO ( DOCTYPE , "tb" ) )') 
        df = pd.concat([df, pd.DataFrame(s.results)], ignore_index=True)
    
    return df

# OR ISSN ( "17538424" )

In [36]:
data1 = scopus_search(0, 10)

  0%|          | 0/10 [00:00<?, ?it/s]


Scopus429Error: 

In [None]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156996 entries, 0 to 156995
Data columns (total 36 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   eid                  156996 non-null  object
 1   doi                  145325 non-null  object
 2   pii                  43889 non-null   object
 3   pubmed_id            124395 non-null  object
 4   title                156994 non-null  object
 5   subtype              156996 non-null  object
 6   subtypeDescription   156996 non-null  object
 7   creator              145510 non-null  object
 8   afid                 113685 non-null  object
 9   affilname            113685 non-null  object
 10  affiliation_city     113685 non-null  object
 11  affiliation_country  113685 non-null  object
 12  author_count         145511 non-null  object
 13  author_names         145511 non-null  object
 14  author_ids           145511 non-null  object
 15  author_afids         113686 non-nu

In [38]:
start_journal = 500
end_journal = 1000
df = pd.DataFrame()
journals = top_10_percent['Issn'][start_journal:end_journal].values

for issn in tqdm(journals): 
    issn_list = issn.replace(" ", "").split(',')
    comma_count = issn.count(',')
    if comma_count == 0:
        s = ScopusSearch(f'ISSN ( "{issn_list[0]}" )') 
    elif comma_count == 1:
        s = ScopusSearch(f'ISSN ( "{issn_list[0]}" ) OR ISSN ( "{issn_list[1]}" )') 
    else:
        s = ScopusSearch(f'ISSN ( "{issn_list[0]}" ) OR ISSN ( "{issn_list[1]}" ) OR ISSN ( "{issn_list[2]}" )') 
    df = pd.concat([df, pd.DataFrame(s.results)], ignore_index=True)

In [None]:
data1.to_parquet('./retractions_data/data1.parquet')

In [None]:
data2 = scopus_search(500, 1000)

In [None]:
data2.to_csv('./retractions_data/data2.csv')

In [None]:
data3 = scopus_search(1000, 1500)

In [None]:
data3.to_csv('./retractions_data/data3.csv')

In [None]:
data4 = scopus_search(1500, 1804)

In [None]:
data4.to_csv('./retractions_data/data4.csv')

In [None]:
articles_info = pd.concat(data1, data2, data3, data4 ignore_index=True)

# Citations

In [None]:
articles_info = rw.merge(articles_info, how = 'left', left_on = 'OriginalPaperDOI', right_on = 'doi')

In [18]:
co = CitationOverview(["85068268027", "84930616647"], start=2019, end=2021)

Scopus403Error: Requestor configuration settings insufficient for access to this resource.

In [16]:
co

NameError: name 'co' is not defined

In [None]:
def citation_info(start_article, end_article):
    

# Authors