## Setup

In [1]:
import pandas as pd
import json
import pathlib
from typing import List

- [/] Title and abstract

- [/] Document classification codes

- [/] Date of publication

- [/] Affiliations

- [/] Citations / references
    - [/] Cited by count
    - [/] Reference count
    - [/] References

- [/] Keywords

In [2]:
def get(data: dict, path: str | List[str]):
    if type(path) is not list:
        path = path.split(".")
    for property in path:
        if type(data) is not dict or property not in data:
            return None
        data = data[property]
    return data

In [3]:
def ensure_list_data(x):
    return x if isinstance(x, list) else [x]

## Reading data

In [10]:
df = pd.read_csv('data/processed/merged.csv', index_col=0)

In [11]:
df["publish_date"] = pd.to_datetime(df["publish_date"])

In [12]:
list_columns = df.loc[:, ['classification_codes', 'affiliations', 'references', 'keywords']]
list_columns = list_columns.apply(ensure_list_data)

In [13]:
df

Unnamed: 0,id,title,publication_name,abstract,publish_date,cited_by_count,reference_count,classification_codes,affiliations,references,keywords
0,2-s2.0-85052201238,Parametric study of hydrogen production via so...,Chemical Engineering Science,© 2018 Elsevier LtdComputational fluid dynamic...,2018-12-31,21.0,42.0,"[{'@_fa': 'true', '$': 'Chemistry (all)', '@co...","{'affiliation-city': 'Bangkok', '@id': '600281...","[{'ref-fulltext': 'Abanades, J.C., Anthony, E....","[{'@_fa': 'true', '$': 'Circulating fluidized ..."
1,2-s2.0-85053455291,The phenotypic and mutational spectrum of Thai...,Gene,© 2018 Elsevier B.V.Ornithine transcarbamylase...,2018-12-30,13.0,20.0,"[{'@_fa': 'true', '$': 'Genetics', '@code': '1...","[{'affiliation-city': 'Bangkok', '@id': '60028...","[{'ref-fulltext': 'Brassier, A., Gobin, S., Ar...","[{'@_fa': 'true', '$': 'Female'}, {'@_fa': 'tr..."
2,2-s2.0-85058878790,PH Variation as a Simple and Selective Pathway...,Langmuir,© 2018 American Chemical Society. The fabricat...,2018-12-26,3.0,36.0,"[{'@_fa': 'true', '$': 'Materials Science (all...","{'affiliation-city': 'Bangkok', '@id': '600281...","[{'ref-fulltext': 'Chronopoulou, L.; Fratoddi,...",
3,2-s2.0-85061895439,Classification of advertisement text on Facebo...,ACM International Conference Proceeding Series,© 2018 Association for Computing Machinery.Und...,2018-12-21,0.0,11.0,"[{'@_fa': 'true', '$': 'Software', '@code': '1...","{'affiliation-city': 'Bangkok', '@id': '600281...",[{'ref-fulltext': 'S. Leesa-nguansuk. Thailand...,"[{'@_fa': 'true', '$': 'AISAS model'}, {'@_fa'..."
4,2-s2.0-85059290811,Search for Leptoquarks Coupled to Third-Genera...,Physical Review Letters,© 2018 CERN.for the CMS Collaboration. Publish...,2018-12-12,51.0,101.0,"[{'@_fa': 'true', '$': 'Physics and Astronomy ...","[{'affiliation-city': 'Aachen', '@id': '600166...",[{'ref-fulltext': 'J. P. Lees (BABAR Collabora...,
...,...,...,...,...,...,...,...,...,...,...,...
20211,2-s2.0-85139854052,Breastfeeding duration is associated with high...,Maternal and Child Nutrition,© 2022 The Authors. Maternal & Child Nutrition...,2023-01-01,0.0,44.0,"[{'@_fa': 'true', '$': 'Pediatrics, Perinatolo...","[{'affiliation-city': 'Bangkok', '@id': '60002...","[{'ref-fulltext': 'Aris, I. M., Bernard, J. Y....","[{'@_fa': 'true', '$': 'adiposity'}, {'@_fa': ..."
20212,2-s2.0-85135838525,Effects of ethylenediaminetetraacetic acid on ...,Journal of Dental Sciences,© 2022 Association for Dental Sciences of the ...,2023-01-01,0.0,18.0,"[{'@_fa': 'true', '$': 'Dentistry (all)', '@co...","[{'affiliation-city': 'Bangkok', '@id': '60028...","[{'ref-fulltext': 'Ong, T.K., Lim, G.S., Singh...","[{'@_fa': 'true', '$': 'Cell apoptosis'}, {'@_..."
20213,2-s2.0-85130889675,Factors associated with dental caries experien...,Special Care in Dentistry,© 2022 Special Care Dentistry Association and ...,2023-01-01,0.0,44.0,"[{'@_fa': 'true', '$': 'Dentistry (all)', '@co...","[{'affiliation-city': 'Pathum Thani', '@id': '...","[{'ref-fulltext': 'Jones MW, Morgan E, Shelton...","[{'@_fa': 'true', '$': 'carbohydrate snack'}, ..."
20214,2-s2.0-85121759166,"Preparation, optimization using a mixture desi...",Journal of Dispersion Science and Technology,"© 2021 Taylor & Francis Group, LLC.This resear...",2023-01-01,1.0,47.0,"[{'@_fa': 'true', '$': 'Surfaces, Coatings and...","{'affiliation-city': 'Bangkok', '@id': '600281...","[{'ref-fulltext': 'Higuera-Ciapara, I.; Félix-...","[{'@_fa': 'true', '$': 'Astaxanthin'}, {'@_fa'..."


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20216 entries, 0 to 20215
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    20216 non-null  object        
 1   title                 20215 non-null  object        
 2   publication_name      20216 non-null  object        
 3   abstract              19551 non-null  object        
 4   publish_date          20216 non-null  datetime64[ns]
 5   cited_by_count        20212 non-null  float64       
 6   reference_count       19805 non-null  float64       
 7   classification_codes  20216 non-null  object        
 8   affiliations          20216 non-null  object        
 9   references            19805 non-null  object        
 10  keywords              16454 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(8)
memory usage: 1.9+ MB


## Tables

### Papers

In [15]:
papers_df = df.drop(columns=['classification_codes', 'affiliations', 'references', 'keywords'])
papers_df

Unnamed: 0,id,title,publication_name,abstract,publish_date,cited_by_count,reference_count
0,2-s2.0-85052201238,Parametric study of hydrogen production via so...,Chemical Engineering Science,© 2018 Elsevier LtdComputational fluid dynamic...,2018-12-31,21.0,42.0
1,2-s2.0-85053455291,The phenotypic and mutational spectrum of Thai...,Gene,© 2018 Elsevier B.V.Ornithine transcarbamylase...,2018-12-30,13.0,20.0
2,2-s2.0-85058878790,PH Variation as a Simple and Selective Pathway...,Langmuir,© 2018 American Chemical Society. The fabricat...,2018-12-26,3.0,36.0
3,2-s2.0-85061895439,Classification of advertisement text on Facebo...,ACM International Conference Proceeding Series,© 2018 Association for Computing Machinery.Und...,2018-12-21,0.0,11.0
4,2-s2.0-85059290811,Search for Leptoquarks Coupled to Third-Genera...,Physical Review Letters,© 2018 CERN.for the CMS Collaboration. Publish...,2018-12-12,51.0,101.0
...,...,...,...,...,...,...,...
20211,2-s2.0-85139854052,Breastfeeding duration is associated with high...,Maternal and Child Nutrition,© 2022 The Authors. Maternal & Child Nutrition...,2023-01-01,0.0,44.0
20212,2-s2.0-85135838525,Effects of ethylenediaminetetraacetic acid on ...,Journal of Dental Sciences,© 2022 Association for Dental Sciences of the ...,2023-01-01,0.0,18.0
20213,2-s2.0-85130889675,Factors associated with dental caries experien...,Special Care in Dentistry,© 2022 Special Care Dentistry Association and ...,2023-01-01,0.0,44.0
20214,2-s2.0-85121759166,"Preparation, optimization using a mixture desi...",Journal of Dispersion Science and Technology,"© 2021 Taylor & Francis Group, LLC.This resear...",2023-01-01,1.0,47.0


### Classification codes

In [27]:
classification_codes_df = df.explode('classification_codes', ignore_index=True)[['id', 'classification_codes']]
classification_codes_df.rename(columns={
    'id': 'paper_id',
    'classification_codes': 'classification_code'
}, inplace=True)
classification_codes_df

Unnamed: 0,paper_id,classification_code
0,2-s2.0-85052201238,"[{'@_fa': 'true', '$': 'Chemistry (all)', '@co..."
1,2-s2.0-85053455291,"[{'@_fa': 'true', '$': 'Genetics', '@code': '1..."
2,2-s2.0-85058878790,"[{'@_fa': 'true', '$': 'Materials Science (all..."
3,2-s2.0-85061895439,"[{'@_fa': 'true', '$': 'Software', '@code': '1..."
4,2-s2.0-85059290811,"[{'@_fa': 'true', '$': 'Physics and Astronomy ..."
...,...,...
20211,2-s2.0-85139854052,"[{'@_fa': 'true', '$': 'Pediatrics, Perinatolo..."
20212,2-s2.0-85135838525,"[{'@_fa': 'true', '$': 'Dentistry (all)', '@co..."
20213,2-s2.0-85130889675,"[{'@_fa': 'true', '$': 'Dentistry (all)', '@co..."
20214,2-s2.0-85121759166,"[{'@_fa': 'true', '$': 'Surfaces, Coatings and..."


In [22]:
classification_codes_df = classification_codes_df[['paper_id']].join(pd.json_normalize(classification_codes_df['classification_code']))
classification_codes_df

Unnamed: 0,paper_id
0,2-s2.0-85052201238
1,2-s2.0-85053455291
2,2-s2.0-85058878790
3,2-s2.0-85061895439
4,2-s2.0-85059290811
...,...
20211,2-s2.0-85139854052
20212,2-s2.0-85135838525
20213,2-s2.0-85130889675
20214,2-s2.0-85121759166


In [12]:
classification_codes_df.drop(columns=['@_fa'], inplace=True)
classification_codes_df

Unnamed: 0,paper_id,$,@code,@abbrev
0,2-s2.0-85058392278,Environmental Chemistry,2304,ENVI
1,2-s2.0-85058392278,Chemical Engineering (all),1500,CENG
2,2-s2.0-85058392278,Materials Science (all),2500,MATE
3,2-s2.0-85058392278,Energy (all),2100,ENER
4,2-s2.0-85059476683,Physics and Astronomy (all),3100,PHYS
...,...,...,...,...
2480,2-s2.0-85139791256,Arts and Humanities (miscellaneous),1201,ARTS
2481,2-s2.0-85139245926,"Pediatrics, Perinatology and Child Health",2735,MEDI
2482,2-s2.0-85139245926,Anesthesiology and Pain Medicine,2703,MEDI
2483,2-s2.0-85138187650,Food Science,1106,AGRI


In [13]:
classification_codes_df.rename(columns={
    '$': 'name',
    '@code': 'code',
    '@abbrev': 'abbreviation'
}, inplace=True)
classification_codes_df

Unnamed: 0,paper_id,name,code,abbreviation
0,2-s2.0-85058392278,Environmental Chemistry,2304,ENVI
1,2-s2.0-85058392278,Chemical Engineering (all),1500,CENG
2,2-s2.0-85058392278,Materials Science (all),2500,MATE
3,2-s2.0-85058392278,Energy (all),2100,ENER
4,2-s2.0-85059476683,Physics and Astronomy (all),3100,PHYS
...,...,...,...,...
2480,2-s2.0-85139791256,Arts and Humanities (miscellaneous),1201,ARTS
2481,2-s2.0-85139245926,"Pediatrics, Perinatology and Child Health",2735,MEDI
2482,2-s2.0-85139245926,Anesthesiology and Pain Medicine,2703,MEDI
2483,2-s2.0-85138187650,Food Science,1106,AGRI


In [14]:
paper_to_classification_code_df = classification_codes_df.loc[:, ['paper_id', 'code']]
paper_to_classification_code_df

Unnamed: 0,paper_id,code
0,2-s2.0-85058392278,2304
1,2-s2.0-85058392278,1500
2,2-s2.0-85058392278,2500
3,2-s2.0-85058392278,2100
4,2-s2.0-85059476683,3100
...,...,...
2480,2-s2.0-85139791256,1201
2481,2-s2.0-85139245926,2735
2482,2-s2.0-85139245926,2703
2483,2-s2.0-85138187650,1106


In [15]:
classification_codes_df.drop(columns=['paper_id'], inplace=True)
classification_codes_df

Unnamed: 0,name,code,abbreviation
0,Environmental Chemistry,2304,ENVI
1,Chemical Engineering (all),1500,CENG
2,Materials Science (all),2500,MATE
3,Energy (all),2100,ENER
4,Physics and Astronomy (all),3100,PHYS
...,...,...,...
2480,Arts and Humanities (miscellaneous),1201,ARTS
2481,"Pediatrics, Perinatology and Child Health",2735,MEDI
2482,Anesthesiology and Pain Medicine,2703,MEDI
2483,Food Science,1106,AGRI


### Affiliations

In [16]:
affiliations_df = df.explode('affiliations', ignore_index=True)[['id', 'affiliations']]
affiliations_df.rename(columns={
    'id': 'paper_id',
    'affiliations': 'affiliation'
}, inplace=True)
affiliations_df

Unnamed: 0,paper_id,affiliation
0,2-s2.0-85058392278,"{'affiliation-city': 'Rayong', '@id': '6011078..."
1,2-s2.0-85058392278,"{'affiliation-city': 'Bangkok', '@id': '600915..."
2,2-s2.0-85058392278,"{'affiliation-city': 'Arica', '@id': '60030782..."
3,2-s2.0-85058392278,"{'affiliation-city': 'Qinhuangdao', '@id': '60..."
4,2-s2.0-85059476683,"{'affiliation-city': 'Bangkok', '@id': '600281..."
...,...,...
12625,2-s2.0-85138187650,"{'affiliation-city': 'Mumbai', '@id': '6001415..."
12626,2-s2.0-85138187650,"{'affiliation-city': 'Hangzhou', '@id': '60003..."
12627,2-s2.0-85119492522,"{'affiliation-city': 'Bangkok', '@id': '600221..."
12628,2-s2.0-85119492522,"{'affiliation-city': 'London', '@id': '6001132..."


In [17]:
affiliations_df = affiliations_df[["paper_id"]].join(pd.json_normalize(affiliations_df['affiliation']))
affiliations_df

Unnamed: 0,paper_id,affiliation-city,@id,affilname,@href,affiliation-country
0,2-s2.0-85058392278,Rayong,60110787,Vidyasirimedhi Institute of Science and Techno...,https://api.elsevier.com/content/affiliation/a...,Thailand
1,2-s2.0-85058392278,Bangkok,60091507,Metallurgy and Materials Research Institute Ch...,https://api.elsevier.com/content/affiliation/a...,Thailand
2,2-s2.0-85058392278,Arica,60030782,Universidad de Tarapacá,https://api.elsevier.com/content/affiliation/a...,Chile
3,2-s2.0-85058392278,Qinhuangdao,60018465,Yanshan University,https://api.elsevier.com/content/affiliation/a...,China
4,2-s2.0-85059476683,Bangkok,60028190,Chulalongkorn University,https://api.elsevier.com/content/affiliation/a...,Thailand
...,...,...,...,...,...,...
12625,2-s2.0-85138187650,Mumbai,60014153,Indian Institute of Technology Bombay,https://api.elsevier.com/content/affiliation/a...,India
12626,2-s2.0-85138187650,Hangzhou,60003970,Zhejiang University,https://api.elsevier.com/content/affiliation/a...,China
12627,2-s2.0-85119492522,Bangkok,60022183,King Chulalongkorn Memorial Hospital,https://api.elsevier.com/content/affiliation/a...,Thailand
12628,2-s2.0-85119492522,London,60011326,UCL Ear Institute,https://api.elsevier.com/content/affiliation/a...,United Kingdom


In [18]:
paper_to_affiliations_df = affiliations_df.loc[:, ['paper_id', '@id']]
paper_to_affiliations_df.rename(columns={
    '@id': 'affiliation_id'
}, inplace=True)
paper_to_affiliations_df

Unnamed: 0,paper_id,affiliation_id
0,2-s2.0-85058392278,60110787
1,2-s2.0-85058392278,60091507
2,2-s2.0-85058392278,60030782
3,2-s2.0-85058392278,60018465
4,2-s2.0-85059476683,60028190
...,...,...
12625,2-s2.0-85138187650,60014153
12626,2-s2.0-85138187650,60003970
12627,2-s2.0-85119492522,60022183
12628,2-s2.0-85119492522,60011326


In [19]:
affiliations_df = affiliations_df.drop(columns=['paper_id']).drop_duplicates().reset_index(drop=True)
affiliations_df

Unnamed: 0,affiliation-city,@id,affilname,@href,affiliation-country
0,Rayong,60110787,Vidyasirimedhi Institute of Science and Techno...,https://api.elsevier.com/content/affiliation/a...,Thailand
1,Bangkok,60091507,Metallurgy and Materials Research Institute Ch...,https://api.elsevier.com/content/affiliation/a...,Thailand
2,Arica,60030782,Universidad de Tarapacá,https://api.elsevier.com/content/affiliation/a...,Chile
3,Qinhuangdao,60018465,Yanshan University,https://api.elsevier.com/content/affiliation/a...,China
4,Bangkok,60028190,Chulalongkorn University,https://api.elsevier.com/content/affiliation/a...,Thailand
...,...,...,...,...,...
1946,Shanghai,60112527,College of Sciences,https://api.elsevier.com/content/affiliation/a...,China
1947,Fortaleza,60015440,Universidade de Fortaleza,https://api.elsevier.com/content/affiliation/a...,Brazil
1948,Bangkok,126790845,Academy of Science,https://api.elsevier.com/content/affiliation/a...,Thailand
1949,Chetumal,60276903,El Colegio de la Frontera Sur,https://api.elsevier.com/content/affiliation/a...,Mexico


### References

In [20]:
references_df = df.explode('references', ignore_index=True)[['id', 'references']]
references_df.rename(columns={
    'id': 'paper_id',
    'references': 'reference'
}, inplace=True)
references_df

Unnamed: 0,paper_id,reference
0,2-s2.0-85058392278,"{'ref-fulltext': 'T. Inoue, A. Fujishima, S. K..."
1,2-s2.0-85058392278,"{'ref-fulltext': 'S. N. Frank, A. J. Bard, J. ..."
2,2-s2.0-85058392278,"{'ref-fulltext': 'S. A. Grinshpun, A. Adhikari..."
3,2-s2.0-85058392278,"{'ref-fulltext': 'C. Wei, W. Y. Lin, Z. Zainal..."
4,2-s2.0-85058392278,"{'ref-fulltext': 'S. In, A. Orlov, R. Berg, F...."
...,...,...
45386,2-s2.0-85119492522,"{'ref-fulltext': 'Yanilmaz, M., Akduman, D., S..."
45387,2-s2.0-85119492522,"{'ref-fulltext': 'Yildirim, M.A., Karlidag, T...."
45388,2-s2.0-85119492522,"{'ref-fulltext': 'Karlidag, T., Yildiz, M., Ya..."
45389,2-s2.0-85119492522,"{'ref-fulltext': 'Xiong, Y., Rabchevsky, A.G.,..."


In [21]:
references_df[references_df['paper_id'] == '2-s2.0-85106046890']

Unnamed: 0,paper_id,reference


In [22]:
references_df = references_df.join(pd.json_normalize(references_df["reference"])).drop(columns=['reference'])
references_df

Unnamed: 0,paper_id,ref-fulltext,@id,ref-info.ref-publicationyear.@first,ref-info.refd-itemidlist.itemid,ref-info.ref-volisspag.voliss.@volume,ref-info.ref-volisspag.pagerange.@first,ref-info.ref-authors.author,ref-info.ref-sourcetitle,ref-info.ref-volisspag.pagerange.@last,...,ref-info.ref-authors.et-al,@aii:was-generated-by,@reference-instance-id,ref-info.ref-volisspag.pages,ref-info.ref-publicationyear.@last,ref-info.ref-website.websitename,@date-locked,ref-info.ref-authors.collaboration,ref-info.ref-volisspag.pagecount.$,ref-info.ref-volisspag.pagecount.@type
0,2-s2.0-85058392278,"T. Inoue, A. Fujishima, S. Konishi, K. Honda, ...",1,1979,"[{'$': 'cssc201802284-cit-0001', '@idtype': 'F...",277,637,"[{'@seq': '1', 'ce:initials': 'T.', '@_fa': 't...",Nature,,...,,,,,,,,,,
1,2-s2.0-85058392278,"S. N. Frank, A. J. Bard, J. Am. Chem. Soc. 197...",2,1977,"[{'$': 'cssc201802284-cit-0002', '@idtype': 'F...",99,303,"[{'@seq': '1', 'ce:initials': 'S.N.', '@_fa': ...",J. Am. Chem. Soc.,304,...,,,,,,,,,,
2,2-s2.0-85058392278,"S. A. Grinshpun, A. Adhikari, T. Honda, K. Y. ...",3,2007,"[{'$': 'cssc201802284-cit-0003', '@idtype': 'F...",41,606,"[{'@seq': '1', 'ce:initials': 'S.A.', '@_fa': ...",Environ. Sci. Technol.,612,...,,,,,,,,,,
3,2-s2.0-85058392278,"C. Wei, W. Y. Lin, Z. Zainal, N. E. Williams, ...",4,1994,"[{'$': 'cssc201802284-cit-0004', '@idtype': 'F...",28,934,"[{'@seq': '1', 'ce:initials': 'C.', '@_fa': 't...",Environ. Sci. Technol.,938,...,,,,,,,,,,
4,2-s2.0-85058392278,"S. In, A. Orlov, R. Berg, F. García, S. Pedros...",5,2007,"[{'$': 'cssc201802284-cit-0005', '@idtype': 'F...",129,13790,"[{'@seq': '1', 'ce:initials': 'S.', '@_fa': 't...",J. Am. Chem. Soc.,13791,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45386,2-s2.0-85119492522,"Yanilmaz, M., Akduman, D., Sagun, ÖF., Hakseve...",29,2015,"[{'$': 'bib0145', '@idtype': 'FRAGMENTID'}, {'...",26,667,"[{'@seq': '1', 'ce:initials': 'M.', '@_fa': 't...",J Craniofac Surg.,672,...,,,,,,,,,,
45387,2-s2.0-85119492522,"Yildirim, M.A., Karlidag, T., Akpolat, N., Kay...",30,2015,"[{'$': 'bib0150', '@idtype': 'FRAGMENTID'}, {'...",26,810,"[{'@seq': '1', 'ce:initials': 'M.A.', '@_fa': ...",J Craniofac Surg.,815,...,,,,,,,,,,
45388,2-s2.0-85119492522,"Karlidag, T., Yildiz, M., Yalcin, S., Colakogl...",31,2012,"[{'$': 'bib0155', '@idtype': 'FRAGMENTID'}, {'...",39,145,"[{'@seq': '1', 'ce:initials': 'T.', '@_fa': 't...",Auris Nasus Larynx.,150,...,,,,,,,,,,
45389,2-s2.0-85119492522,"Xiong, Y., Rabchevsky, A.G., Hall, E.D., Role ...",32,2007,"[{'$': 'bib0160', '@idtype': 'FRAGMENTID'}, {'...",100,639,"[{'@seq': '1', 'ce:initials': 'Y.', '@_fa': 't...",J Neurochem.,649,...,,,,,,,,,,


In [23]:
references_df[references_df['ref-info.ref-title.ref-titletext'] == 'What is the Impact of International Remittances on Poverty and Inequality in Latin America?']

Unnamed: 0,paper_id,ref-fulltext,@id,ref-info.ref-publicationyear.@first,ref-info.refd-itemidlist.itemid,ref-info.ref-volisspag.voliss.@volume,ref-info.ref-volisspag.pagerange.@first,ref-info.ref-authors.author,ref-info.ref-sourcetitle,ref-info.ref-volisspag.pagerange.@last,...,ref-info.ref-authors.et-al,@aii:was-generated-by,@reference-instance-id,ref-info.ref-volisspag.pages,ref-info.ref-publicationyear.@last,ref-info.ref-website.websitename,@date-locked,ref-info.ref-authors.collaboration,ref-info.ref-volisspag.pagecount.$,ref-info.ref-volisspag.pagecount.@type


In [24]:
references_df['ref-fulltext'].value_counts()

ref-fulltext
CMS collaboration, The CMS trigger system, 2017 JINST 12 P01020 [arXiv:1609.02366] [INSPIRE].                                                                                                                                                                                                                                                                                    11
CMS collaboration, Event generator tunes obtained from underlying event and multiparton scattering measurements, Eur. Phys. J. C 76 (2016) 155 [arXiv:1512.00815] [INSPIRE].                                                                                                                                                                                                     10
CMS collaboration, Particle-flow reconstruction and global event description with the CMS detector, 2017 JINST 12 P10003 [arXiv:1706.04965] [INSPIRE].                                                                                             

### Keywords

In [25]:
keywords_df = df.explode('keywords', ignore_index=True)[['id', 'keywords']]
keywords_df.rename(columns={
    'id': 'paper_id',
    'keywords': 'keyword'
}, inplace=True)
keywords_df

Unnamed: 0,paper_id,keyword
0,2-s2.0-85058392278,"{'@_fa': 'true', '$': '2D materials'}"
1,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'graphitic carbon nitride'}"
2,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'heterostructured compos..."
3,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'hydrogen evolution'}"
4,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'photocatalysis'}"
...,...,...
4408,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Complete facial nerve t..."
4409,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Corticosteroid'}"
4410,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Facial nerve neurorrhap..."
4411,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Functional recovery'}"


In [26]:
keywords_df = keywords_df.join(pd.json_normalize(keywords_df["keyword"])).drop(columns=['keyword'])
keywords_df

Unnamed: 0,paper_id,@_fa,$
0,2-s2.0-85058392278,true,2D materials
1,2-s2.0-85058392278,true,graphitic carbon nitride
2,2-s2.0-85058392278,true,heterostructured composites
3,2-s2.0-85058392278,true,hydrogen evolution
4,2-s2.0-85058392278,true,photocatalysis
...,...,...,...
4408,2-s2.0-85119492522,true,Complete facial nerve transection
4409,2-s2.0-85119492522,true,Corticosteroid
4410,2-s2.0-85119492522,true,Facial nerve neurorrhaphy
4411,2-s2.0-85119492522,true,Functional recovery


In [27]:
keywords_df = keywords_df.loc[:, ['paper_id', '$']]
keywords_df.rename(columns={
    '$': 'keyword'
}, inplace=True)
keywords_df

Unnamed: 0,paper_id,keyword
0,2-s2.0-85058392278,2D materials
1,2-s2.0-85058392278,graphitic carbon nitride
2,2-s2.0-85058392278,heterostructured composites
3,2-s2.0-85058392278,hydrogen evolution
4,2-s2.0-85058392278,photocatalysis
...,...,...
4408,2-s2.0-85119492522,Complete facial nerve transection
4409,2-s2.0-85119492522,Corticosteroid
4410,2-s2.0-85119492522,Facial nerve neurorrhaphy
4411,2-s2.0-85119492522,Functional recovery


In [28]:
paper_to_keywords_df = keywords_df

In [29]:
keywords_df = keywords_df.drop(columns=['paper_id']).drop_duplicates()
keywords_df

Unnamed: 0,keyword
0,2D materials
1,graphitic carbon nitride
2,heterostructured composites
3,hydrogen evolution
4,photocatalysis
...,...
4408,Complete facial nerve transection
4409,Corticosteroid
4410,Facial nerve neurorrhaphy
4411,Functional recovery


## Buckets (for multiprocessing)

In [30]:
buckets = {i: [] for i in range(12)}

for path in pathlib.Path("Data 2018-2023/Project").glob("*/*"):
    buckets[path.__hash__() % 12].append(path)

# buckets