In [99]:
import numpy as np
import pandas as pd
import json

from pathlib import Path  


In [100]:
data = []

In [101]:
with open('./DATA/JSON/all_data.json') as d:
    data = json.load(d)
    
rows = []
subj = []
keywords = []
affiliations = []
funds = []

for record in data:
    try:
        title = record['abstracts-retrieval-response']['coredata']['dc:title']
        publish_year = record['abstracts-retrieval-response']['item']['ait:process-info']['ait:date-sort']['@year']
        eid = record['abstracts-retrieval-response']['coredata']['eid']
        aggregation_type = record['abstracts-retrieval-response']['coredata']['prism:aggregationType']
        authors = record['abstracts-retrieval-response']['authors']['author']
        author_count = len(authors)
        if record['abstracts-retrieval-response']['language']:
            language = record['abstracts-retrieval-response']['language']['@xml:lang']
        else: language = None
        
        rows.append({
            'Title': title,
            'Publish_year': publish_year,
            'Eid': eid,
            'Aggregation_type': aggregation_type,
            'Author_Count': author_count,
            'Language' : language
        })
        if "subject-areas" in record['abstracts-retrieval-response']:
            areas = record['abstracts-retrieval-response']['subject-areas']['subject-area']
            for area in areas:
                subj.append({
                    'Eid': eid,
                    'Subject_Area': area["$"] if "$" in area else None,
                    'Subject_Code': area["@code"] if "@code" in area else None,
                })
        
        if "author-keywords" in record['abstracts-retrieval-response']['item']['bibrecord']['head']['citation-info']:
            keyword = record['abstracts-retrieval-response']['item']['bibrecord']['head']['citation-info']['author-keywords']['author-keyword']
            for key in keyword:
                keywords.append({
                    'Eid':eid,
                    'Keyword': key["$"] if "$" in key else None,
                    'Language':key["@xml:lang"] if "@xml:lang" in key else None,
                    'Original':key["@original"] if "@original" in key else None
                })
            
        if "affiliation" in record['abstracts-retrieval-response']:
            affils = record['abstracts-retrieval-response']['affiliation']
            for affil in affils:
                affiliations.append({
                    'Eid': eid,
                    'Affil_Id':affil["@id"] if "@id" in affil else None,
                    'Affil_Name':affil["affilname"] if "affilname" in affil else None,
                    'Affil_Country':affil["affiliation-country"] if "affiliation-country" in affil else None,
                    'Affil_City':affil["affiliation-city"] if "affiliation-city" in affil else None
                })
        
        if "@has-funding-info" in record['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding-list']:
            if "xocs:funding" in record['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding-list']:
                fundings = record['abstracts-retrieval-response']['item']['xocs:meta']['xocs:funding-list']['xocs:funding']
                for funding in fundings:
                    funds.append({
                        'Eid' : eid,
                        'agency-matched-string' : funding["xocs:funding-agency-matched-string"] if "xocs:funding-agency-matched-string" in funding else None,
                        'agency-acronym': funding["xocs:funding-agency-acronym"] if "xocs:funding-agency-acronym" in funding else None,
                        'agency': funding["xocs:funding-agency"] if "xocs:funding-agency" in funding else None,
                        'agency-id': funding["xocs:funding-agency-id"] if "xocs:funding-agency-id" in funding else None,
                        'agency-country': funding["xocs:funding-agency-country"] if "xocs:funding-agency-country" in funding else None
                    })
            else:
                funds.append({
                    'Eid' : eid,
                    'agency-matched-string' : None,
                    'agency-acronym': None,
                    'agency': None,
                    'agency-id': None,
                    'agency-country': None
                })
    except:
        pass

df = pd.DataFrame(rows)

subject_df = pd.DataFrame(subj)

keyword_df = pd.DataFrame(keywords)

affil_df = pd.DataFrame(affiliations)

funds_df = pd.DataFrame(funds)

In [102]:
df = df.drop_duplicates()
subject_df = subject_df.drop_duplicates()
keyword_df = keyword_df.drop_duplicates()
affil_df = affil_df.drop_duplicates()
funds_df = funds_df.drop_duplicates()

In [103]:
df

Unnamed: 0,Title,Publish_year,Eid,Aggregation_type,Author_Count,Language
0,Effects of iron content on the microstructure ...,2018,2-s2.0-85053164279,Journal,7,eng
1,The critical factors of research and innovatio...,2018,2-s2.0-85049101440,Journal,3,eng
2,Is the occiput-wall distance valid and reliabl...,2018,2-s2.0-85054140369,Journal,8,eng
3,Comparison of soil composition between farmlan...,2018,2-s2.0-85097515350,Journal,2,eng
4,The impact of wire caliber on ERCP outcomes: a...,2018,2-s2.0-85041527766,Journal,13,eng
...,...,...,...,...,...,...
20211,A Techno-Economic Assessment of a Second-Life ...,2023,2-s2.0-85152540548,Journal,3,eng
20212,Encouraging green product purchase: Green valu...,2023,2-s2.0-85132634561,Journal,2,eng
20213,Does leukocytosis remain a predictive factor f...,2023,2-s2.0-85150789915,Journal,16,eng
20214,Administration of ketoprofen in postpartum sow...,2023,2-s2.0-85165609857,Journal,8,eng


In [104]:
df.isnull().sum()

Title                 0
Publish_year          0
Eid                   0
Aggregation_type      0
Author_Count          0
Language            120
dtype: int64

In [105]:
subject_df

Unnamed: 0,Eid,Subject_Area,Subject_Code
0,2-s2.0-85053164279,Materials Science (all),2500
1,2-s2.0-85053164279,Condensed Matter Physics,3104
2,2-s2.0-85049101440,Business and International Management,1403
3,2-s2.0-85049101440,"Economics, Econometrics and Finance (all)",2000
4,2-s2.0-85054140369,"Physical Therapy, Sports Therapy and Rehabilit...",3612
...,...,...,...
50059,2-s2.0-85165609857,Genetics,1311
50060,2-s2.0-85165609857,Veterinary (all),3400
50061,2-s2.0-85164405594,Language and Linguistics,1203
50062,2-s2.0-85164405594,Linguistics and Language,3310


In [106]:
subject_df.isnull().sum()

Eid             0
Subject_Area    0
Subject_Code    0
dtype: int64

In [107]:
keyword_df

Unnamed: 0,Eid,Keyword,Language,Original
0,2-s2.0-85053164279,EIS,eng,
1,2-s2.0-85053164279,Microstructure,eng,
2,2-s2.0-85053164279,Pitting corrosion,eng,
3,2-s2.0-85053164279,Polarization,eng,
4,2-s2.0-85053164279,Titanium alloy,eng,
...,...,...,...,...
83027,2-s2.0-85164405594,anxiety,eng,y
83028,2-s2.0-85164405594,cultural beliefs,eng,y
83029,2-s2.0-85164405594,fear of committing errors,eng,y
83030,2-s2.0-85164405594,Jordanian EFL learners,eng,y


In [108]:
keyword_df.isnull().sum()

Eid             0
Keyword         0
Language        0
Original    24797
dtype: int64

In [109]:
affil_df

Unnamed: 0,Eid,Affil_Id,Affil_Name,Affil_Country,Affil_City
0,2-s2.0-85053164279,60091507,Metallurgy and Materials Research Institute Ch...,Thailand,Bangkok
1,2-s2.0-85053164279,60018465,Yanshan University,China,Qinhuangdao
2,2-s2.0-85054140369,60017165,Khon Kaen University,Thailand,Khon Kaen
3,2-s2.0-85054140369,60092136,Rajamangala University of Technology Isan,Thailand,Nakhon Ratchasima
4,2-s2.0-85054140369,60028190,Chulalongkorn University,Thailand,Bangkok
...,...,...,...,...,...
197265,2-s2.0-85150789915,60006964,Phramongkutklao College of Medicine,Thailand,Bangkok
197266,2-s2.0-85150789915,60005255,Rajavithi Hospital,Thailand,Bangkok
197267,2-s2.0-85150789915,60002620,"Faculty of Medicine, Chulalongkorn University",Thailand,Bangkok
197268,2-s2.0-85164405594,60105841,Al-Hussein Bin Talal University,Jordan,Ma'an


In [110]:
affil_df.isnull().sum()

Eid                0
Affil_Id           0
Affil_Name         0
Affil_Country    102
Affil_City       847
dtype: int64

In [111]:
funds_df['agency-id'].head(5).to_list()

[None,
 None,
 'http://data.elsevier.com/vocabulary/SciValFunders/501100001809',
 'http://data.elsevier.com/vocabulary/SciValFunders/501100002873',
 'http://data.elsevier.com/vocabulary/SciValFunders/501100008095']

In [112]:
funds_df.head(5)

Unnamed: 0,Eid,agency-matched-string,agency-acronym,agency,agency-id,agency-country
0,2-s2.0-85053164279,Ratchadapisek Sompoch Endowment Fund,,,,
1,2-s2.0-85053164279,Surface Coatings Technology for Metals and Mat...,,,,
2,2-s2.0-85053164279,NSFC,NSFC,National Natural Science Foundation of China,http://data.elsevier.com/vocabulary/SciValFund...,http://sws.geonames.org/1814991/
3,2-s2.0-85053164279,Chulalongkorn University,CU,Chulalongkorn University,http://data.elsevier.com/vocabulary/SciValFund...,http://sws.geonames.org/1605651/
4,2-s2.0-85053164279,Yanshan University,YSU,Yanshan University,http://data.elsevier.com/vocabulary/SciValFund...,http://sws.geonames.org/1814991/


In [113]:
funding_name_list = funds_df[funds_df['agency-id'].isna()][['agency-matched-string']]['agency-matched-string'].drop_duplicates().dropna().reset_index().drop(columns='index')

In [129]:
funds_df.to_csv('aj_funding_data.csv', index=False)

In [114]:
# funding_name_list.to_csv('funding_name_list.csv', index=False)

In [115]:
funds_df.isnull().sum()

Eid                          0
agency-matched-string     3045
agency-acronym           27394
agency                   20554
agency-id                20342
agency-country           20615
dtype: int64

In [116]:
funds_df['agency-matched-string'].value_counts()

agency-matched-string
Chulalongkorn University                                4226
Thailand Research Fund                                  1405
National Research Council of Thailand                   1060
Second Century Fund                                      568
NSTDA                                                    516
                                                        ... 
Solar Energy Technology Office                             1
Villum Foundation                                          1
National Renewable Energy Laboratory                       1
United States – Israel Binational Science Foundation       1
Editor-in-Chief of Hematology                              1
Name: count, Length: 10131, dtype: int64

In [117]:
funds_df['agency-acronym'].value_counts()

agency-acronym
CU            4358
TRF           1501
NRCT          1147
สวทช           843
MOST           619
              ... 
FU Berlin        1
NSTDA;สวทช       1
GoK              1
CCH              1
SNRU             1
Name: count, Length: 1655, dtype: int64

In [118]:
funds_df['Eid'].value_counts()

Eid
2-s2.0-85123911583    162
2-s2.0-85048748130    146
2-s2.0-85072850749    141
2-s2.0-85070317489    139
2-s2.0-85072183946    138
                     ... 
2-s2.0-85145168470      1
2-s2.0-85066625830      1
2-s2.0-85140446464      1
2-s2.0-85055718889      1
2-s2.0-85117535862      1
Name: count, Length: 10535, dtype: int64

In [119]:
# Find rows where all fields except 'Eid' are None
filtered_df = funds_df[funds_df.drop(columns=['Eid']).isna().all(axis=1)]

filtered_df


Unnamed: 0,Eid,agency-matched-string,agency-acronym,agency,agency-id,agency-country
10,2-s2.0-85041527766,,,,,
125,2-s2.0-85020664359,,,,,
240,2-s2.0-85062171866,,,,,
252,2-s2.0-85049301587,,,,,
253,2-s2.0-85050807596,,,,,
...,...,...,...,...,...,...
74268,2-s2.0-85133588862,,,,,
74338,2-s2.0-85162092675,,,,,
74384,2-s2.0-85165910707,,,,,
74395,2-s2.0-85151554103,,,,,


In [120]:
subject_df['Subject_Area'].value_counts()

Subject_Area
Multidisciplinary             1088
Materials Science (all)        907
Chemistry (all)                905
Chemical Engineering (all)     755
Infectious Diseases            753
                              ... 
Museology                        1
Emergency Nursing                1
Chiropractics                    1
Optometry                        1
Family Practice                  1
Name: count, Length: 321, dtype: int64

In [121]:
subject_df['Subject_Code'].value_counts()

Subject_Code
1000    1088
2500     907
1600     905
1500     755
2725     753
        ... 
1209       1
2907       1
3602       1
3610       1
2714       1
Name: count, Length: 321, dtype: int64

In [122]:
merge_table = df.merge(subject_df,on='Eid',how='left')
merge_table

Unnamed: 0,Title,Publish_year,Eid,Aggregation_type,Author_Count,Language,Subject_Area,Subject_Code
0,Effects of iron content on the microstructure ...,2018,2-s2.0-85053164279,Journal,7,eng,Materials Science (all),2500
1,Effects of iron content on the microstructure ...,2018,2-s2.0-85053164279,Journal,7,eng,Condensed Matter Physics,3104
2,The critical factors of research and innovatio...,2018,2-s2.0-85049101440,Journal,3,eng,Business and International Management,1403
3,The critical factors of research and innovatio...,2018,2-s2.0-85049101440,Journal,3,eng,"Economics, Econometrics and Finance (all)",2000
4,Is the occiput-wall distance valid and reliabl...,2018,2-s2.0-85054140369,Journal,8,eng,"Physical Therapy, Sports Therapy and Rehabilit...",3612
...,...,...,...,...,...,...,...,...
50059,Administration of ketoprofen in postpartum sow...,2023,2-s2.0-85165609857,Journal,8,eng,Genetics,1311
50060,Administration of ketoprofen in postpartum sow...,2023,2-s2.0-85165609857,Journal,8,eng,Veterinary (all),3400
50061,Role of Anxiety in Willingness to Communicate ...,2023,2-s2.0-85164405594,Journal,3,eng,Language and Linguistics,1203
50062,Role of Anxiety in Willingness to Communicate ...,2023,2-s2.0-85164405594,Journal,3,eng,Linguistics and Language,3310


In [123]:
df.isnull().sum()

Title                 0
Publish_year          0
Eid                   0
Aggregation_type      0
Author_Count          0
Language            120
dtype: int64

In [124]:
# filepath = Path('DATA/main.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# df.to_csv(filepath, index=False) 

In [125]:
# filepath = Path('DATA/subject.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# subject_df.to_csv(filepath, index=False) 

In [126]:
# filepath = Path('DATA/keyword.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# keyword_df.to_csv(filepath, index=False) 

In [127]:
# filepath = Path('DATA/funding.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# funds_df.to_csv(filepath, index=False) 