In [1]:
import json
import pandas as pd

In [2]:
def _dict_or_empty(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def work_preprocessing(file_path):
    """Extract selected fields from a JSON file of works into a DataFrame.

    The file is expected to contain a JSON array of work objects. Each work
    becomes one row holding its id, title, DOI, dates, type, open-access flag,
    primary-topic hierarchy, citation count, the ids of the first listed
    author and of that author's first institution, and a comma-separated
    string of keyword display names.

    Nested objects that are missing *or* explicitly ``null`` in the JSON
    (``open_access``, ``primary_topic`` and its ``domain``/``field``/
    ``subfield``, ``authorships``, ``author``, ``keywords``) yield ``None``
    in the corresponding columns instead of raising ``AttributeError``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded JSON file.

    Returns
    -------
    pandas.DataFrame
        One row per work, columns in the order of the ``row`` dict below.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    rows = []
    for work in json_data:
        # `dict.get(key, default)` only applies the default when the key is
        # absent; a key present with a null value returns None. `or` and
        # `_dict_or_empty` cover both cases.
        authorships = work.get("authorships") or []
        first_author = _dict_or_empty(authorships[0]) if authorships else {}
        institutions = first_author.get("institutions") or []
        first_institution = _dict_or_empty(institutions[0]) if institutions else {}

        open_access = _dict_or_empty(work.get("open_access"))
        primary_topic = _dict_or_empty(work.get("primary_topic"))
        keywords = work.get("keywords") or []

        row = {
            "work_id": work.get("id"),
            "title": work.get("title"),
            "doi": work.get("doi"),
            "publication_date": work.get("publication_date"),
            "type": work.get("type"),
            "is_oa": open_access.get("is_oa"),
            "primary_topic_id": primary_topic.get("id"),
            "domain": _dict_or_empty(primary_topic.get("domain")).get("display_name"),
            "field": _dict_or_empty(primary_topic.get("field")).get("display_name"),
            "subfield": _dict_or_empty(primary_topic.get("subfield")).get("display_name"),
            # Column name differs from the source key: it is filled from "cited_by_count".
            "citations_past_decade": work.get("cited_by_count"),
            "created_date": work.get("created_date"),
            "first_institution_id": first_institution.get("id"),
            "first_author_id": _dict_or_empty(first_author.get("author")).get("id"),
            # An empty or missing keyword list stays None (not ""); a null
            # display name contributes an empty string to the joined value.
            "keywords": (
                ", ".join(_dict_or_empty(k).get("display_name") or "" for k in keywords)
                if keywords
                else None
            ),
        }
        rows.append(row)

    return pd.DataFrame(rows)

In [3]:
# JSON input files, all located in the same directory. The resulting path
# strings are exactly the original absolute Windows paths.
_WORKS_DIR = r'D:\devcourse\session2\project'
file_list = [
    _WORKS_DIR + '\\' + 'works_' + span + '.json'
    for span in (
        '20250801_20250820',
        '20250821_20250910',
        '20250911_20250930',
        '20251001_20251022',
    )
]

In [4]:
# Build one DataFrame per input file, in the same order as file_list.
works_list = list(map(work_preprocessing, file_list))

In [5]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
works = pd.concat(
    works_list,
    axis=0,
    ignore_index=True,
)

In [6]:
# Keep only the text after the last "/" in each identifier column.
for id_column in (
    "work_id",
    "primary_topic_id",
    "first_institution_id",
    "first_author_id",
):
    works[id_column] = works[id_column].str.split("/").str[-1]


In [7]:
# Convert both date columns to datetime; values that cannot be parsed become NaT.
for date_column in ("publication_date", "created_date"):
    works[date_column] = pd.to_datetime(works[date_column], errors="coerce")


In [8]:
# Preview the first five rows of the combined table.
works.head(n=5)

Unnamed: 0,work_id,title,doi,publication_date,type,is_oa,primary_topic_id,domain,field,subfield,citations_past_decade,created_date,first_institution_id,first_author_id,keywords
0,W2419652933,A Critical Appraisal of the Utility of the Ser...,https://doi.org/10.64719/pb.4221,2025-08-12,article,False,T10211,Physical Sciences,Computer Science,Computational Theory and Mathematics,67,2016-06-24,I126307644,A5037478520,"Critical appraisal, Clinical Practice"
1,W63003660,Introduction to Digital Communications,https://doi.org/10.1016/c2023-0-01098-6,2025-08-08,book,False,T13816,Physical Sciences,Engineering,Electrical and Electronic Engineering,27,2016-06-24,,A5041929088,
2,W4413247502,GetDist: a Python package for analysing Monte ...,https://doi.org/10.1088/1475-7516/2025/08/025,2025-08-01,article,True,T10136,Physical Sciences,Mathematics,Statistics and Probability,25,2025-08-17,,A5080085506,"Python, Smoothing"
3,W3187640320,Blood donation management system,https://doi.org/10.56726/irjmets81722,2025-08-06,article,False,T12735,Social Sciences,"Business, Management and Accounting",Management of Technology and Innovation,20,2021-08-16,,A5055236967,"Blood bank, Blood collection"
4,W3094967386,Use Cases for In-Network Computing,https://doi.org/10.17487/rfc9817,2025-08-01,report,False,T10714,Physical Sciences,Computer Science,Computer Networks and Communications,17,2020-11-09,,A5057626571,"Interface (matter), Network interface"


In [9]:
# Print column dtypes and non-null counts of the combined table.
works.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147037 entries, 0 to 147036
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   work_id                147037 non-null  object        
 1   title                  147036 non-null  object        
 2   doi                    146470 non-null  object        
 3   publication_date       147037 non-null  datetime64[ns]
 4   type                   147037 non-null  object        
 5   is_oa                  147037 non-null  bool          
 6   primary_topic_id       147037 non-null  object        
 7   domain                 147037 non-null  object        
 8   field                  147037 non-null  object        
 9   subfield               147037 non-null  object        
 10  citations_past_decade  147037 non-null  int64         
 11  created_date           147037 non-null  datetime64[ns]
 12  first_institution_id   57100 non-null   obje

In [16]:
# Show the row(s) whose work_id matches one specific identifier.
lookup_id = "W4414430316"
works.loc[works["work_id"].eq(lookup_id)]


Unnamed: 0,work_id,title,doi,publication_date,type,is_oa,primary_topic_id,domain,field,subfield,citations_past_decade,created_date,first_institution_id,first_author_id,keywords
35074,W4414430316,Modality-Specific Speech Enhancement and Noise...,https://doi.org/10.21437/interspeech.2025-2581,2025-08-17,article,True,T10860,Physical Sciences,Computer Science,Signal Processing,0,2025-09-23,,A5060592139,Modality (human–computer interaction)


In [14]:
# Remove the row labelled 65461, then renumber the index from 0.
# NOTE(review): the label is hard-coded and the index is reset afterwards, so
# running this cell a second time drops whichever row now carries label 65461.
# Confirm which record this was meant to remove and select it by a stable key.
works = works.drop(index=65461).reset_index(drop=True)

In [15]:
# Write the table to works.csv without the index column; datetime values are
# written as YYYY-MM-DD.
works.to_csv(
    path_or_buf='works.csv',
    index=False,
    date_format='%Y-%m-%d',
)