In [1]:
import os
import json
import pandas as pd
from glob import glob

In [2]:
# Collect the path of every file stored under data/<year>/ and parse each one as JSON.
# The years are listed once here instead of one copy-pasted glob line per year;
# the order (newest first) matches the original concatenation order.
YEARS = (2023, 2022, 2021, 2020, 2019, 2018)

file_paths = []
for year in YEARS:
    year_pattern = os.path.join(f'data/{year}/', '*')
    # sorted(): glob returns matches in arbitrary, filesystem-dependent order, which
    #           would make the row order of everything built from all_data irreproducible.
    # isfile(): '*' also matches sub-directories, which open() below cannot read.
    file_paths.extend(
        path for path in sorted(glob(year_pattern)) if os.path.isfile(path)
    )

# One parsed JSON document per entry, in the same order as file_paths.
all_data = []

for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as f:
        all_data.append(json.load(f))

In [14]:
def _strip_csv_chars(text):
    """Return ``text`` as a string with commas and single/double quotes removed.

    ``None`` (a missing key or a JSON ``null``) is returned as an empty string,
    and non-string values are converted with ``str()`` first.
    """
    if text is None:
        return ''
    return str(text).replace(',', '').replace('"', '').replace("'", '')


def extract_subject_rows(records):
    """Build one ``{'title', 'code'}`` dict per (record, subject area) pair.

    Parameters
    ----------
    records : iterable of dict
        Parsed JSON documents. Each is read at
        ``['abstracts-retrieval-response']['coredata']['dc:title']`` and
        ``['abstracts-retrieval-response']['subject-areas']['subject-area'][i]['@code']``.

    Returns
    -------
    list of dict
        Rows with a lower-cased, sanitised ``title`` and a sanitised ``code``.
        A record that has no subject areas contributes no rows.
    """
    rows = []
    for record in records:
        # `or {}` rather than .get(key, {}): .get only falls back when the key is
        # absent, so a key present with a JSON null would otherwise yield None and
        # make the next .get()/.replace() raise AttributeError.
        response = record.get('abstracts-retrieval-response') or {}
        coredata = response.get('coredata') or {}
        title = _strip_csv_chars(coredata.get('dc:title')).lower()

        subject_areas = response.get('subject-areas') or {}
        areas = subject_areas.get('subject-area') or []
        # NOTE(review): presumably a record with a single subject area could carry a
        # bare object instead of a one-element list — confirm against the data source.
        # Wrapping it keeps the loop from iterating over the dict's keys.
        if isinstance(areas, dict):
            areas = [areas]

        for area in areas:
            rows.append({
                'title': title,
                'code': _strip_csv_chars(area.get('@code')),
            })
    return rows


subject_data = extract_subject_rows(all_data)

# Explicit columns keep the schema stable even when no rows were extracted.
df_subject = pd.DataFrame(subject_data, columns=['title', 'code'])
df_subject.head(10)

Unnamed: 0,title,code
0,graphene oxide-alginate hydrogel-based indicat...,1315
1,graphene oxide-alginate hydrogel-based indicat...,1303
2,graphene oxide-alginate hydrogel-based indicat...,1312
3,rare coordination behavior of triethanolamine ...,1602
4,rare coordination behavior of triethanolamine ...,1607
5,rare coordination behavior of triethanolamine ...,1605
6,rare coordination behavior of triethanolamine ...,1604
7,total ammonia nitrogen removal and microbial c...,1104
8,effects of microaeration and sludge recirculat...,2305
9,effects of microaeration and sludge recirculat...,2304


In [8]:
# (rows, columns) of the title/code table before de-duplication.
# NOTE(review): this cell's execution count (8) is lower than that of the cell that
# builds df_subject (14) — re-run the notebook top to bottom to confirm the output.
df_subject.shape

(50064, 2)

In [15]:
# Per-column summary (count, unique, top, freq) of the title and code columns.
df_subject.describe()

Unnamed: 0,title,code
count,50064,50064
unique,20127,321
top,preface,1000
freq,44,1088


In [16]:
# Drop rows that repeat an identical (title, code) pair, keeping the first occurrence.
# NOTE(review): the title is the only document identifier kept, so different documents
# that share both a title (the recorded describe() output shows 'preface' 44 times) and
# a subject code collapse into one row here — confirm that is intended.
df_subject = df_subject.drop_duplicates()
df_subject.shape

(49938, 2)

In [17]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as an extra column.
df_subject = df_subject.reset_index(drop=True)
df_subject.head(10)

Unnamed: 0,title,code
0,graphene oxide-alginate hydrogel-based indicat...,1315
1,graphene oxide-alginate hydrogel-based indicat...,1303
2,graphene oxide-alginate hydrogel-based indicat...,1312
3,rare coordination behavior of triethanolamine ...,1602
4,rare coordination behavior of triethanolamine ...,1607
5,rare coordination behavior of triethanolamine ...,1605
6,rare coordination behavior of triethanolamine ...,1604
7,total ammonia nitrogen removal and microbial c...,1104
8,effects of microaeration and sludge recirculat...,2305
9,effects of microaeration and sludge recirculat...,2304


In [18]:
# Count missing (NaN/None) values per column.
# NOTE(review): the extraction step substitutes '' for absent titles/codes, and
# empty strings are not counted by isna() — check for '' separately if that matters.
df_subject.isna().sum()

title    0
code     0
dtype: int64

In [19]:
# Write the de-duplicated title/code table to CSV; index=False omits the row index.
# The path is relative to the notebook's working directory.
df_subject.to_csv('data/title_and_subject_code.csv', index=False)