In [1]:

import os
import json
import pandas as pd
from tqdm import tqdm
from pyalex import Authors

In [None]:
import pyalex

# Contact e-mail that pyalex attaches to OpenAlex requests.
# It can be overridden per environment with the OPENALEX_EMAIL variable so the
# notebook does not have to be edited (or a personal address committed) to
# change it; the original literal is kept as the fallback.
# NOTE(review): the fallback domain "gmailc.com" looks like a typo for
# "gmail.com" — confirm with the owner before changing it.
pyalex.config.email = os.environ.get("OPENALEX_EMAIL", "sosoj1552@gmailc.com")

# Work의 Author 정보를 수집한다
1. 수집된 Work 데이터를 기준으로 WorkAuthor, WorkTopic 데이터 생성
2. WorkAuthor의 Author ID를 distinct 하기


In [68]:
# Quick look at the raw work dumps: print the directory listing, then the
# joined path of every entry.
work_data_path = "./raw_json_data"
print(os.listdir(work_data_path))
for full_filename in (
    os.path.join(work_data_path, filename)
    for filename in os.listdir(work_data_path)
):
    print(full_filename)


['works_20250801_20250820.json', 'works_20250821_20250910.json', 'works_20250911_20250930.json', 'works_20251001_20251022.json']
./raw_json_data\works_20250801_20250820.json
./raw_json_data\works_20250821_20250910.json
./raw_json_data\works_20250911_20250930.json
./raw_json_data\works_20251001_20251022.json


## WorkAuthor, WorkTopic : CSV 저장

In [90]:
def _openalex_short_id(url):
    """Return the last path segment of an OpenAlex URL ('.../W123' -> 'W123')."""
    return url.split('/')[-1].strip()


def get_author_topic(data):
    """Flatten work records into work-author and work-topic rows.

    Parameters
    ----------
    data : iterable of dict
        Work records. Each record must have an ``'id'`` URL; ``'authorships'``
        and ``'topics'`` are lists of dicts and may be empty, ``None`` or
        missing altogether.

    Returns
    -------
    (wa_lst, wt_lst) : tuple of lists
        ``wa_lst`` holds ``(work_id, author_id, author_position)`` tuples and
        ``wt_lst`` holds ``(work_id, topic_id, score, order)`` tuples, where
        ``order`` is the 1-based position of the topic in the work's list.
        A work without authors (or topics) still yields exactly one row whose
        remaining fields are ``None`` so the work is not lost downstream.

    Raises
    ------
    AttributeError
        If a work has no ``'id'``.
    KeyError
        If an authorship or topic entry lacks one of the fields read here.
    """
    wa_lst = []
    wt_lst = []
    for d in data:
        work_id = _openalex_short_id(d.get('id'))

        # `or []` treats a missing key and an explicit null like an empty list
        # instead of raising KeyError / TypeError.
        authorships = d.get('authorships') or []
        topics = d.get('topics') or []

        if authorships:
            wa_lst.extend(
                (work_id, _openalex_short_id(a['author']['id']), a['author_position'])
                for a in authorships
            )
        else:
            # No author information: keep the work with a None placeholder row.
            wa_lst.append((work_id, None, None))

        if topics:
            wt_lst.extend(
                (work_id, _openalex_short_id(t['id']), t['score'], order)
                for order, t in enumerate(topics, start=1)
            )
        else:
            # No topic information: same placeholder convention as for authors.
            wt_lst.append((work_id, None, None, None))
    return wa_lst, wt_lst

In [91]:
work_data_path = "./raw_json_data"

# os.listdir returns entries in arbitrary order and also returns things that
# are not work dumps (e.g. a `.ipynb_checkpoints` folder). Keep only regular
# *.json files and sort them so the row order of the resulting tables is
# reproducible between runs and machines.
work_filenames = sorted(
    name
    for name in os.listdir(work_data_path)
    if name.endswith(".json")
    and os.path.isfile(os.path.join(work_data_path, name))
)
print(work_filenames)

all_was = []
all_wts = []
for filename in tqdm(work_filenames):
    full_filename = os.path.join(work_data_path, filename)

    with open(full_filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Extract the author and topic rows from this batch of works.
    was, wts = get_author_topic(data)

    all_was.extend(was)
    all_wts.extend(wts)

# One row per (work, author) and per (work, topic); works without authors or
# topics appear once with None in the remaining columns.
wa_df = pd.DataFrame(all_was, columns=['work_id', 'author_id', 'author_position'])
wt_df = pd.DataFrame(all_wts, columns=['work_id', 'topic_id', 'score', 'order'])

['works_20250801_20250820.json', 'works_20250821_20250910.json', 'works_20250911_20250930.json', 'works_20251001_20251022.json']


100%|██████████| 4/4 [01:08<00:00, 17.21s/it]


In [92]:
# (rows, columns) of the work-author table built above.
wa_df.shape

(556195, 3)

In [97]:
# Preview the first rows of the work-author table.
wa_df.head()

Unnamed: 0,work_id,author_id,author_position
0,W2419652933,A5037478520,first
1,W2419652933,A5037119672,middle
2,W2419652933,A5112156725,middle
3,W2419652933,A5000921316,last
4,W63003660,A5041929088,first


In [100]:
# Coverage check for the work-author table: number of distinct works, and how
# many of them only have the placeholder row (author_id is null).
wa_work_count = wa_df['work_id'].nunique()
wa_authorless_count = wa_df.loc[wa_df['author_id'].isna(), 'work_id'].nunique()
print(f"수집된 논문 수 : {wa_work_count}")
print(f"저자 정보가 없는 논문 수 : {wa_authorless_count}")

수집된 논문 수 : 147036
저자 정보가 없는 논문 수 : 4242


In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("./transform_csv_data", exist_ok=True)

# Rows that carry a real author id (False for the None placeholder rows).
has_author = wa_df['author_id'].notna()

# Save works that have author information.
wa_df[has_author]\
        .to_csv("./transform_csv_data/work_author.csv", index=False, encoding='utf-8')

# Save works without author information (placeholder rows only).
wa_df[~has_author]\
        .to_csv("./transform_csv_data/tmp_work_without_author_.csv", index=False, encoding='utf-8')

In [93]:
# (rows, columns) of the work-topic table built above.
wt_df.shape

(392206, 4)

In [96]:
# Preview the first rows of the work-topic table.
wt_df.head()

Unnamed: 0,work_id,topic_id,score,order
0,W2419652933,T10211,0.9208,1
1,W63003660,T13816,0.6802,1
2,W63003660,T13983,0.6758,2
3,W4413247502,T10136,0.9969,1
4,W4413247502,T12814,0.9965,2


In [103]:
# Coverage check for the work-topic table: number of distinct works, and how
# many of them only have the placeholder row (topic_id is null).
wt_work_count = wt_df['work_id'].nunique()
wt_topicless_count = wt_df.loc[wt_df['topic_id'].isna(), 'work_id'].nunique()
print(f"수집된 논문 수 : {wt_work_count}")
print(f"토픽 정보가 없는 논문 수 : {wt_topicless_count}")

수집된 논문 수 : 147036
토픽 정보가 없는 논문 수 : 0


In [109]:
# Save the work-topic table.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("./transform_csv_data", exist_ok=True)
wt_df\
    .to_csv("./transform_csv_data/work_topic.csv", index=False, encoding='utf-8')