## Setup

In [1]:
import pandas as pd
import json
import pathlib
from typing import List

- [/] Title and abstract

- [/] Document classification codes

- [/] Date of publication

- [/] Affiliations

- [/] Citations / references
    - [/] Cited by count
    - [/] Reference count
    - [/] References

- [/] Keywords

In [2]:
def get(data: dict, path: str | List[str]):
    if type(path) is not list:
        path = path.split(".")
    for property in path:
        if type(data) is not dict or property not in data:
            return None
        data = data[property]
    return data

In [3]:
def ensure_list_data(x):
    return x if isinstance(x, list) else [x]

## Reading data

In [4]:
data_list = []
for path in pathlib.Path("Data 2018-2023/Project").glob("*/*"):
    if path.__hash__() % 20 != 0: # Run on a sample of data
        continue
    try:
        with open(path) as file:
            data = json.load(file)
        filtered_data = {}
        filtered_data['id'] = get(data, ['abstracts-retrieval-response', 'coredata', 'eid'])
        filtered_data['title'] = get(data, ['abstracts-retrieval-response', 'coredata', 'dc:title'])
        filtered_data["publication_name"] = get(data, ["abstracts-retrieval-response", "coredata", "prism:publicationName"],)
        filtered_data['abstract'] = get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', 'head', 'abstracts'])
        filtered_data['publish_date'] = get(data, ['abstracts-retrieval-response', 'coredata', 'prism:coverDate'])
        filtered_data['cited_by_count'] = get(data, ['abstracts-retrieval-response', 'coredata', 'citedby-count'])
        filtered_data['reference_count'] = get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', "tail", "bibliography", "@refcount"])
        filtered_data['classification_codes'] = get(data, ['abstracts-retrieval-response', 'subject-areas', 'subject-area'])
        filtered_data['affiliations'] = get(data, ['abstracts-retrieval-response', 'affiliation'])
        filtered_data['references'] = get(data, ['abstracts-retrieval-response', 'item', 'bibrecord', "tail", "bibliography", "reference"])
        filtered_data['keywords'] = get(data, ['abstracts-retrieval-response', 'authkeywords', 'author-keyword'])
        data_list.append(filtered_data)
    except Exception as e:
        print(f"Failed: {path}\nException: {repr(e)}")
print("Reading files done.")
df = pd.DataFrame(data_list)

Reading files done.


In [5]:
df["publish_date"] = pd.to_datetime(df["publish_date"])

In [6]:
list_columns = df.loc[:, ['classification_codes', 'affiliations', 'references', 'keywords']]
list_columns = list_columns.apply(ensure_list_data)

In [7]:
df

Unnamed: 0,id,title,publication_name,abstract,publish_date,cited_by_count,reference_count,classification_codes,affiliations,references,keywords
0,2-s2.0-85058392278,Heterostructured d-Ti 3 C 2 /TiO 2/ g-C 3 N 4 ...,ChemSusChem,"© 2018 Wiley-VCH Verlag GmbH & Co. KGaA, Weinh...",2018-12-20,105,74,"[{'@_fa': 'true', '$': 'Environmental Chemistr...","[{'affiliation-city': 'Rayong', '@id': '601107...","[{'ref-fulltext': 'T. Inoue, A. Fujishima, S. ...","[{'@_fa': 'true', '$': '2D materials'}, {'@_fa..."
1,2-s2.0-85059476683,CIGS thin film solar cells with graded-band ga...,Journal of Physics: Conference Series,© Published under licence by IOP Publishing Lt...,2018-12-19,5,13,"[{'@_fa': 'true', '$': 'Physics and Astronomy ...","[{'affiliation-city': 'Bangkok', '@id': '60028...",[{'ref-fulltext': 'Yoshida S 2017 Solar Fronti...,
2,2-s2.0-85060496074,Design of output feedback nonlinear model pred...,"International Conference on Control, Automatio...",© ICROS.This paper presents designing of outpu...,2018-12-10,0,9,"[{'@_fa': 'true', '$': 'Artificial Intelligenc...","[{'affiliation-city': 'Bangkok', '@id': '60028...","[{'ref-fulltext': 'D. Gu and H. Hu, ""Receding ...","[{'@_fa': 'true', '$': 'Inverted pendulum on c..."
3,2-s2.0-85066101363,Effect of modified Khon dance performance on f...,Journal of Health Research,"© 2018, Ladawan Chutimakul, Suchitra Sukonthas...",2018-12-06,3,24,"[{'@_fa': 'true', '$': 'Health Policy', '@code...","[{'affiliation-city': 'Bangkok', '@id': '60028...","[{'ref-fulltext': '1.United Nations, New York,...","[{'@_fa': 'true', '$': 'Functional fitness'}, ..."
4,2-s2.0-85120882431,A preliminary study on diversity of midgut mic...,Thai Journal of Veterinary Medicine,© 2018 Chulalongkorn University Printing House...,2018-12-01,2,42,"[{'@_fa': 'true', '$': 'Veterinary (all)', '@c...","{'affiliation-city': 'Bangkok', '@id': '600281...",[{'@aii:was-generated-by': 'http://data.elsevi...,"[{'@_fa': 'true', '$': 'Aedes aegypti'}, {'@_f..."
...,...,...,...,...,...,...,...,...,...,...,...
1036,2-s2.0-85139943983,Acute kidney injury in the tropics,Nephrology,© 2022 Asian Pacific Society of Nephrology.The...,2023-01-01,0,226,"[{'@_fa': 'true', '$': 'Nephrology', '@code': ...","[{'affiliation-city': 'Bangkok', '@id': '60002...","[{'ref-fulltext': 'Burdmann EA, Jha V. Acute k...","[{'@_fa': 'true', '$': 'acute kidney injury'},..."
1037,2-s2.0-85139791256,Emancipatory agroecologies: social and politic...,Journal of Peasant Studies,"© 2022 Informa UK Limited, trading as Taylor &...",2023-01-01,5,109,"[{'@_fa': 'true', '$': 'Cultural Studies', '@c...","[{'affiliation-city': 'Chetumal', '@id': '6027...","[{'ref-fulltext': 'Alonso-Fradejas, A., L. F. ...","[{'@_fa': 'true', '$': 'Agroecology'}, {'@_fa'..."
1038,2-s2.0-85139245926,Factors associated with immediate and early ex...,Paediatric Anaesthesia,© 2022 John Wiley & Sons Ltd.Background: The p...,2023-01-01,2,22,"[{'@_fa': 'true', '$': 'Pediatrics, Perinatolo...","{'affiliation-city': 'Bangkok', '@id': '600026...","[{'ref-fulltext': 'Kasahara M, Sakamoto S, Fuk...","[{'@_fa': 'true', '$': 'airway extubation'}, {..."
1039,2-s2.0-85138187650,The thermal resistance and targeting release o...,Journal of Food Science and Technology,"© 2022, Association of Food Scientists & Techn...",2023-01-01,3,33,"[{'@_fa': 'true', '$': 'Food Science', '@code'...","[{'affiliation-city': 'Hangzhou', '@id': '6011...","[{'ref-fulltext': 'Amiri S, Ghanbarzadeh B, Ha...","[{'@_fa': 'true', '$': 'Controlled release'}, ..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041 entries, 0 to 1040
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    1041 non-null   object        
 1   title                 1041 non-null   object        
 2   publication_name      1041 non-null   object        
 3   abstract              1021 non-null   object        
 4   publish_date          1041 non-null   datetime64[ns]
 5   cited_by_count        1041 non-null   object        
 6   reference_count       1018 non-null   object        
 7   classification_codes  1041 non-null   object        
 8   affiliations          1041 non-null   object        
 9   references            1018 non-null   object        
 10  keywords              841 non-null    object        
dtypes: datetime64[ns](1), object(10)
memory usage: 89.6+ KB


## Tables

### Papers

In [9]:
papers_df = df.drop(columns=['classification_codes', 'affiliations', 'references', 'keywords'])
papers_df

Unnamed: 0,id,title,publication_name,abstract,publish_date,cited_by_count,reference_count
0,2-s2.0-85058392278,Heterostructured d-Ti 3 C 2 /TiO 2/ g-C 3 N 4 ...,ChemSusChem,"© 2018 Wiley-VCH Verlag GmbH & Co. KGaA, Weinh...",2018-12-20,105,74
1,2-s2.0-85059476683,CIGS thin film solar cells with graded-band ga...,Journal of Physics: Conference Series,© Published under licence by IOP Publishing Lt...,2018-12-19,5,13
2,2-s2.0-85060496074,Design of output feedback nonlinear model pred...,"International Conference on Control, Automatio...",© ICROS.This paper presents designing of outpu...,2018-12-10,0,9
3,2-s2.0-85066101363,Effect of modified Khon dance performance on f...,Journal of Health Research,"© 2018, Ladawan Chutimakul, Suchitra Sukonthas...",2018-12-06,3,24
4,2-s2.0-85120882431,A preliminary study on diversity of midgut mic...,Thai Journal of Veterinary Medicine,© 2018 Chulalongkorn University Printing House...,2018-12-01,2,42
...,...,...,...,...,...,...,...
1036,2-s2.0-85139943983,Acute kidney injury in the tropics,Nephrology,© 2022 Asian Pacific Society of Nephrology.The...,2023-01-01,0,226
1037,2-s2.0-85139791256,Emancipatory agroecologies: social and politic...,Journal of Peasant Studies,"© 2022 Informa UK Limited, trading as Taylor &...",2023-01-01,5,109
1038,2-s2.0-85139245926,Factors associated with immediate and early ex...,Paediatric Anaesthesia,© 2022 John Wiley & Sons Ltd.Background: The p...,2023-01-01,2,22
1039,2-s2.0-85138187650,The thermal resistance and targeting release o...,Journal of Food Science and Technology,"© 2022, Association of Food Scientists & Techn...",2023-01-01,3,33


### Classification codes

In [10]:
classification_codes_df = df.explode('classification_codes', ignore_index=True)[['id', 'classification_codes']]
classification_codes_df.rename(columns={
    'id': 'paper_id',
    'classification_codes': 'classification_code'
}, inplace=True)
classification_codes_df

Unnamed: 0,paper_id,classification_code
0,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'Environmental Chemistry..."
1,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'Chemical Engineering (a..."
2,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'Materials Science (all)..."
3,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'Energy (all)', '@code':..."
4,2-s2.0-85059476683,"{'@_fa': 'true', '$': 'Physics and Astronomy (..."
...,...,...
2480,2-s2.0-85139791256,"{'@_fa': 'true', '$': 'Arts and Humanities (mi..."
2481,2-s2.0-85139245926,"{'@_fa': 'true', '$': 'Pediatrics, Perinatolog..."
2482,2-s2.0-85139245926,"{'@_fa': 'true', '$': 'Anesthesiology and Pain..."
2483,2-s2.0-85138187650,"{'@_fa': 'true', '$': 'Food Science', '@code':..."


In [11]:
classification_codes_df = classification_codes_df[['paper_id']].join(pd.json_normalize(classification_codes_df['classification_code']))
classification_codes_df

Unnamed: 0,paper_id,@_fa,$,@code,@abbrev
0,2-s2.0-85058392278,true,Environmental Chemistry,2304,ENVI
1,2-s2.0-85058392278,true,Chemical Engineering (all),1500,CENG
2,2-s2.0-85058392278,true,Materials Science (all),2500,MATE
3,2-s2.0-85058392278,true,Energy (all),2100,ENER
4,2-s2.0-85059476683,true,Physics and Astronomy (all),3100,PHYS
...,...,...,...,...,...
2480,2-s2.0-85139791256,true,Arts and Humanities (miscellaneous),1201,ARTS
2481,2-s2.0-85139245926,true,"Pediatrics, Perinatology and Child Health",2735,MEDI
2482,2-s2.0-85139245926,true,Anesthesiology and Pain Medicine,2703,MEDI
2483,2-s2.0-85138187650,true,Food Science,1106,AGRI


In [12]:
classification_codes_df.drop(columns=['@_fa'], inplace=True)
classification_codes_df

Unnamed: 0,paper_id,$,@code,@abbrev
0,2-s2.0-85058392278,Environmental Chemistry,2304,ENVI
1,2-s2.0-85058392278,Chemical Engineering (all),1500,CENG
2,2-s2.0-85058392278,Materials Science (all),2500,MATE
3,2-s2.0-85058392278,Energy (all),2100,ENER
4,2-s2.0-85059476683,Physics and Astronomy (all),3100,PHYS
...,...,...,...,...
2480,2-s2.0-85139791256,Arts and Humanities (miscellaneous),1201,ARTS
2481,2-s2.0-85139245926,"Pediatrics, Perinatology and Child Health",2735,MEDI
2482,2-s2.0-85139245926,Anesthesiology and Pain Medicine,2703,MEDI
2483,2-s2.0-85138187650,Food Science,1106,AGRI


In [13]:
classification_codes_df.rename(columns={
    '$': 'name',
    '@code': 'code',
    '@abbrev': 'abbreviation'
}, inplace=True)
classification_codes_df

Unnamed: 0,paper_id,name,code,abbreviation
0,2-s2.0-85058392278,Environmental Chemistry,2304,ENVI
1,2-s2.0-85058392278,Chemical Engineering (all),1500,CENG
2,2-s2.0-85058392278,Materials Science (all),2500,MATE
3,2-s2.0-85058392278,Energy (all),2100,ENER
4,2-s2.0-85059476683,Physics and Astronomy (all),3100,PHYS
...,...,...,...,...
2480,2-s2.0-85139791256,Arts and Humanities (miscellaneous),1201,ARTS
2481,2-s2.0-85139245926,"Pediatrics, Perinatology and Child Health",2735,MEDI
2482,2-s2.0-85139245926,Anesthesiology and Pain Medicine,2703,MEDI
2483,2-s2.0-85138187650,Food Science,1106,AGRI


In [14]:
paper_to_classification_code_df = classification_codes_df.loc[:, ['paper_id', 'code']]
paper_to_classification_code_df

Unnamed: 0,paper_id,code
0,2-s2.0-85058392278,2304
1,2-s2.0-85058392278,1500
2,2-s2.0-85058392278,2500
3,2-s2.0-85058392278,2100
4,2-s2.0-85059476683,3100
...,...,...
2480,2-s2.0-85139791256,1201
2481,2-s2.0-85139245926,2735
2482,2-s2.0-85139245926,2703
2483,2-s2.0-85138187650,1106


In [15]:
classification_codes_df.drop(columns=['paper_id'], inplace=True)
classification_codes_df

Unnamed: 0,name,code,abbreviation
0,Environmental Chemistry,2304,ENVI
1,Chemical Engineering (all),1500,CENG
2,Materials Science (all),2500,MATE
3,Energy (all),2100,ENER
4,Physics and Astronomy (all),3100,PHYS
...,...,...,...
2480,Arts and Humanities (miscellaneous),1201,ARTS
2481,"Pediatrics, Perinatology and Child Health",2735,MEDI
2482,Anesthesiology and Pain Medicine,2703,MEDI
2483,Food Science,1106,AGRI


### Affiliations

In [16]:
affiliations_df = df.explode('affiliations', ignore_index=True)[['id', 'affiliations']]
affiliations_df.rename(columns={
    'id': 'paper_id',
    'affiliations': 'affiliation'
}, inplace=True)
affiliations_df

Unnamed: 0,paper_id,affiliation
0,2-s2.0-85058392278,"{'affiliation-city': 'Rayong', '@id': '6011078..."
1,2-s2.0-85058392278,"{'affiliation-city': 'Bangkok', '@id': '600915..."
2,2-s2.0-85058392278,"{'affiliation-city': 'Arica', '@id': '60030782..."
3,2-s2.0-85058392278,"{'affiliation-city': 'Qinhuangdao', '@id': '60..."
4,2-s2.0-85059476683,"{'affiliation-city': 'Bangkok', '@id': '600281..."
...,...,...
12625,2-s2.0-85138187650,"{'affiliation-city': 'Mumbai', '@id': '6001415..."
12626,2-s2.0-85138187650,"{'affiliation-city': 'Hangzhou', '@id': '60003..."
12627,2-s2.0-85119492522,"{'affiliation-city': 'Bangkok', '@id': '600221..."
12628,2-s2.0-85119492522,"{'affiliation-city': 'London', '@id': '6001132..."


In [17]:
affiliations_df = affiliations_df[["paper_id"]].join(pd.json_normalize(affiliations_df['affiliation']))
affiliations_df

Unnamed: 0,paper_id,affiliation-city,@id,affilname,@href,affiliation-country
0,2-s2.0-85058392278,Rayong,60110787,Vidyasirimedhi Institute of Science and Techno...,https://api.elsevier.com/content/affiliation/a...,Thailand
1,2-s2.0-85058392278,Bangkok,60091507,Metallurgy and Materials Research Institute Ch...,https://api.elsevier.com/content/affiliation/a...,Thailand
2,2-s2.0-85058392278,Arica,60030782,Universidad de Tarapacá,https://api.elsevier.com/content/affiliation/a...,Chile
3,2-s2.0-85058392278,Qinhuangdao,60018465,Yanshan University,https://api.elsevier.com/content/affiliation/a...,China
4,2-s2.0-85059476683,Bangkok,60028190,Chulalongkorn University,https://api.elsevier.com/content/affiliation/a...,Thailand
...,...,...,...,...,...,...
12625,2-s2.0-85138187650,Mumbai,60014153,Indian Institute of Technology Bombay,https://api.elsevier.com/content/affiliation/a...,India
12626,2-s2.0-85138187650,Hangzhou,60003970,Zhejiang University,https://api.elsevier.com/content/affiliation/a...,China
12627,2-s2.0-85119492522,Bangkok,60022183,King Chulalongkorn Memorial Hospital,https://api.elsevier.com/content/affiliation/a...,Thailand
12628,2-s2.0-85119492522,London,60011326,UCL Ear Institute,https://api.elsevier.com/content/affiliation/a...,United Kingdom


In [18]:
paper_to_affiliations_df = affiliations_df.loc[:, ['paper_id', '@id']]
paper_to_affiliations_df.rename(columns={
    '@id': 'affiliation_id'
}, inplace=True)
paper_to_affiliations_df

Unnamed: 0,paper_id,affiliation_id
0,2-s2.0-85058392278,60110787
1,2-s2.0-85058392278,60091507
2,2-s2.0-85058392278,60030782
3,2-s2.0-85058392278,60018465
4,2-s2.0-85059476683,60028190
...,...,...
12625,2-s2.0-85138187650,60014153
12626,2-s2.0-85138187650,60003970
12627,2-s2.0-85119492522,60022183
12628,2-s2.0-85119492522,60011326


In [19]:
affiliations_df = affiliations_df.drop(columns=['paper_id']).drop_duplicates().reset_index(drop=True)
affiliations_df

Unnamed: 0,affiliation-city,@id,affilname,@href,affiliation-country
0,Rayong,60110787,Vidyasirimedhi Institute of Science and Techno...,https://api.elsevier.com/content/affiliation/a...,Thailand
1,Bangkok,60091507,Metallurgy and Materials Research Institute Ch...,https://api.elsevier.com/content/affiliation/a...,Thailand
2,Arica,60030782,Universidad de Tarapacá,https://api.elsevier.com/content/affiliation/a...,Chile
3,Qinhuangdao,60018465,Yanshan University,https://api.elsevier.com/content/affiliation/a...,China
4,Bangkok,60028190,Chulalongkorn University,https://api.elsevier.com/content/affiliation/a...,Thailand
...,...,...,...,...,...
1946,Shanghai,60112527,College of Sciences,https://api.elsevier.com/content/affiliation/a...,China
1947,Fortaleza,60015440,Universidade de Fortaleza,https://api.elsevier.com/content/affiliation/a...,Brazil
1948,Bangkok,126790845,Academy of Science,https://api.elsevier.com/content/affiliation/a...,Thailand
1949,Chetumal,60276903,El Colegio de la Frontera Sur,https://api.elsevier.com/content/affiliation/a...,Mexico


### References

In [20]:
references_df = df.explode('references', ignore_index=True)[['id', 'references']]
references_df.rename(columns={
    'id': 'paper_id',
    'references': 'reference'
}, inplace=True)
references_df

Unnamed: 0,paper_id,reference
0,2-s2.0-85058392278,"{'ref-fulltext': 'T. Inoue, A. Fujishima, S. K..."
1,2-s2.0-85058392278,"{'ref-fulltext': 'S. N. Frank, A. J. Bard, J. ..."
2,2-s2.0-85058392278,"{'ref-fulltext': 'S. A. Grinshpun, A. Adhikari..."
3,2-s2.0-85058392278,"{'ref-fulltext': 'C. Wei, W. Y. Lin, Z. Zainal..."
4,2-s2.0-85058392278,"{'ref-fulltext': 'S. In, A. Orlov, R. Berg, F...."
...,...,...
45386,2-s2.0-85119492522,"{'ref-fulltext': 'Yanilmaz, M., Akduman, D., S..."
45387,2-s2.0-85119492522,"{'ref-fulltext': 'Yildirim, M.A., Karlidag, T...."
45388,2-s2.0-85119492522,"{'ref-fulltext': 'Karlidag, T., Yildiz, M., Ya..."
45389,2-s2.0-85119492522,"{'ref-fulltext': 'Xiong, Y., Rabchevsky, A.G.,..."


In [21]:
references_df[references_df['paper_id'] == '2-s2.0-85106046890']

Unnamed: 0,paper_id,reference


In [22]:
references_df = references_df.join(pd.json_normalize(references_df["reference"])).drop(columns=['reference'])
references_df

Unnamed: 0,paper_id,ref-fulltext,@id,ref-info.ref-publicationyear.@first,ref-info.refd-itemidlist.itemid,ref-info.ref-volisspag.voliss.@volume,ref-info.ref-volisspag.pagerange.@first,ref-info.ref-authors.author,ref-info.ref-sourcetitle,ref-info.ref-volisspag.pagerange.@last,...,ref-info.ref-authors.et-al,@aii:was-generated-by,@reference-instance-id,ref-info.ref-volisspag.pages,ref-info.ref-publicationyear.@last,ref-info.ref-website.websitename,@date-locked,ref-info.ref-authors.collaboration,ref-info.ref-volisspag.pagecount.$,ref-info.ref-volisspag.pagecount.@type
0,2-s2.0-85058392278,"T. Inoue, A. Fujishima, S. Konishi, K. Honda, ...",1,1979,"[{'$': 'cssc201802284-cit-0001', '@idtype': 'F...",277,637,"[{'@seq': '1', 'ce:initials': 'T.', '@_fa': 't...",Nature,,...,,,,,,,,,,
1,2-s2.0-85058392278,"S. N. Frank, A. J. Bard, J. Am. Chem. Soc. 197...",2,1977,"[{'$': 'cssc201802284-cit-0002', '@idtype': 'F...",99,303,"[{'@seq': '1', 'ce:initials': 'S.N.', '@_fa': ...",J. Am. Chem. Soc.,304,...,,,,,,,,,,
2,2-s2.0-85058392278,"S. A. Grinshpun, A. Adhikari, T. Honda, K. Y. ...",3,2007,"[{'$': 'cssc201802284-cit-0003', '@idtype': 'F...",41,606,"[{'@seq': '1', 'ce:initials': 'S.A.', '@_fa': ...",Environ. Sci. Technol.,612,...,,,,,,,,,,
3,2-s2.0-85058392278,"C. Wei, W. Y. Lin, Z. Zainal, N. E. Williams, ...",4,1994,"[{'$': 'cssc201802284-cit-0004', '@idtype': 'F...",28,934,"[{'@seq': '1', 'ce:initials': 'C.', '@_fa': 't...",Environ. Sci. Technol.,938,...,,,,,,,,,,
4,2-s2.0-85058392278,"S. In, A. Orlov, R. Berg, F. García, S. Pedros...",5,2007,"[{'$': 'cssc201802284-cit-0005', '@idtype': 'F...",129,13790,"[{'@seq': '1', 'ce:initials': 'S.', '@_fa': 't...",J. Am. Chem. Soc.,13791,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45386,2-s2.0-85119492522,"Yanilmaz, M., Akduman, D., Sagun, ÖF., Hakseve...",29,2015,"[{'$': 'bib0145', '@idtype': 'FRAGMENTID'}, {'...",26,667,"[{'@seq': '1', 'ce:initials': 'M.', '@_fa': 't...",J Craniofac Surg.,672,...,,,,,,,,,,
45387,2-s2.0-85119492522,"Yildirim, M.A., Karlidag, T., Akpolat, N., Kay...",30,2015,"[{'$': 'bib0150', '@idtype': 'FRAGMENTID'}, {'...",26,810,"[{'@seq': '1', 'ce:initials': 'M.A.', '@_fa': ...",J Craniofac Surg.,815,...,,,,,,,,,,
45388,2-s2.0-85119492522,"Karlidag, T., Yildiz, M., Yalcin, S., Colakogl...",31,2012,"[{'$': 'bib0155', '@idtype': 'FRAGMENTID'}, {'...",39,145,"[{'@seq': '1', 'ce:initials': 'T.', '@_fa': 't...",Auris Nasus Larynx.,150,...,,,,,,,,,,
45389,2-s2.0-85119492522,"Xiong, Y., Rabchevsky, A.G., Hall, E.D., Role ...",32,2007,"[{'$': 'bib0160', '@idtype': 'FRAGMENTID'}, {'...",100,639,"[{'@seq': '1', 'ce:initials': 'Y.', '@_fa': 't...",J Neurochem.,649,...,,,,,,,,,,


In [23]:
references_df[references_df['ref-info.ref-title.ref-titletext'] == 'What is the Impact of International Remittances on Poverty and Inequality in Latin America?']

Unnamed: 0,paper_id,ref-fulltext,@id,ref-info.ref-publicationyear.@first,ref-info.refd-itemidlist.itemid,ref-info.ref-volisspag.voliss.@volume,ref-info.ref-volisspag.pagerange.@first,ref-info.ref-authors.author,ref-info.ref-sourcetitle,ref-info.ref-volisspag.pagerange.@last,...,ref-info.ref-authors.et-al,@aii:was-generated-by,@reference-instance-id,ref-info.ref-volisspag.pages,ref-info.ref-publicationyear.@last,ref-info.ref-website.websitename,@date-locked,ref-info.ref-authors.collaboration,ref-info.ref-volisspag.pagecount.$,ref-info.ref-volisspag.pagecount.@type


In [24]:
references_df['ref-fulltext'].value_counts()

ref-fulltext
CMS collaboration, The CMS trigger system, 2017 JINST 12 P01020 [arXiv:1609.02366] [INSPIRE].                                                                                                                                                                                                                                                                                    11
CMS collaboration, Event generator tunes obtained from underlying event and multiparton scattering measurements, Eur. Phys. J. C 76 (2016) 155 [arXiv:1512.00815] [INSPIRE].                                                                                                                                                                                                     10
CMS collaboration, Particle-flow reconstruction and global event description with the CMS detector, 2017 JINST 12 P10003 [arXiv:1706.04965] [INSPIRE].                                                                                             

### Keywords

In [25]:
keywords_df = df.explode('keywords', ignore_index=True)[['id', 'keywords']]
keywords_df.rename(columns={
    'id': 'paper_id',
    'keywords': 'keyword'
}, inplace=True)
keywords_df

Unnamed: 0,paper_id,keyword
0,2-s2.0-85058392278,"{'@_fa': 'true', '$': '2D materials'}"
1,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'graphitic carbon nitride'}"
2,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'heterostructured compos..."
3,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'hydrogen evolution'}"
4,2-s2.0-85058392278,"{'@_fa': 'true', '$': 'photocatalysis'}"
...,...,...
4408,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Complete facial nerve t..."
4409,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Corticosteroid'}"
4410,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Facial nerve neurorrhap..."
4411,2-s2.0-85119492522,"{'@_fa': 'true', '$': 'Functional recovery'}"


In [26]:
keywords_df = keywords_df.join(pd.json_normalize(keywords_df["keyword"])).drop(columns=['keyword'])
keywords_df

Unnamed: 0,paper_id,@_fa,$
0,2-s2.0-85058392278,true,2D materials
1,2-s2.0-85058392278,true,graphitic carbon nitride
2,2-s2.0-85058392278,true,heterostructured composites
3,2-s2.0-85058392278,true,hydrogen evolution
4,2-s2.0-85058392278,true,photocatalysis
...,...,...,...
4408,2-s2.0-85119492522,true,Complete facial nerve transection
4409,2-s2.0-85119492522,true,Corticosteroid
4410,2-s2.0-85119492522,true,Facial nerve neurorrhaphy
4411,2-s2.0-85119492522,true,Functional recovery


In [27]:
keywords_df = keywords_df.loc[:, ['paper_id', '$']]
keywords_df.rename(columns={
    '$': 'keyword'
}, inplace=True)
keywords_df

Unnamed: 0,paper_id,keyword
0,2-s2.0-85058392278,2D materials
1,2-s2.0-85058392278,graphitic carbon nitride
2,2-s2.0-85058392278,heterostructured composites
3,2-s2.0-85058392278,hydrogen evolution
4,2-s2.0-85058392278,photocatalysis
...,...,...
4408,2-s2.0-85119492522,Complete facial nerve transection
4409,2-s2.0-85119492522,Corticosteroid
4410,2-s2.0-85119492522,Facial nerve neurorrhaphy
4411,2-s2.0-85119492522,Functional recovery


In [28]:
paper_to_keywords_df = keywords_df

In [29]:
keywords_df = keywords_df.drop(columns=['paper_id']).drop_duplicates()
keywords_df

Unnamed: 0,keyword
0,2D materials
1,graphitic carbon nitride
2,heterostructured composites
3,hydrogen evolution
4,photocatalysis
...,...
4408,Complete facial nerve transection
4409,Corticosteroid
4410,Facial nerve neurorrhaphy
4411,Functional recovery


## Buckets (for multiprocessing)

In [30]:
buckets = {i: [] for i in range(12)}

for path in pathlib.Path("Data 2018-2023/Project").glob("*/*"):
    buckets[path.__hash__() % 12].append(path)

# buckets