In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm
from pyalex import Authors
import time

In [None]:
import os

import pyalex

# pyalex attaches this contact address to its OpenAlex requests (see pyalex docs, "polite pool").
# Allow overriding it through the OPENALEX_EMAIL environment variable so that a personal
# address does not have to be edited into the notebook; the previous value stays as the
# fallback, so behaviour is unchanged when the variable is not set.
pyalex.config.email = os.environ.get("OPENALEX_EMAIL", "sosoj1552@gmail.com")

# Extract: JSON 파일

In [4]:
# Load the work/author CSV; its `author_id` column lists the authors to fetch.
work_author_csv = "./transform_csv_data/work_author.csv"
res_wa_df = pd.read_csv(work_author_csv)

In [6]:
# Requests with more than 100 ids fail with
# "ValueError: OpenAlex does not support more than 100 ids", so ids are sent in batches of 100.
chunk_size = 100

# De-duplicate the author ids, then cut the list into consecutive slices of `chunk_size`.
unique_author = res_wa_df["author_id"].unique().tolist()
chunk_starts = range(0, len(unique_author), chunk_size)
chunked_author = [unique_author[start : start + chunk_size] for start in chunk_starts]

print(f"Total count of distinct author = {len(unique_author)}")
print(f"# chunks = {len(chunked_author)}")

Total count of distinct author = 454053
# chunks = 4541


In [8]:
# Download every author record chunk by chunk and spill the buffer to numbered
# JSON part files so that memory use stays bounded during the multi-hour run.
part_num = 1
max_retries = 5
flush_threshold = 80000  # number of buffered records that triggers writing a part file

results = []
failed_chunks = []  # indices of chunks that exhausted all retries (for a later re-run)
save_path = "./raw_json_data/authors_based_works"
os.makedirs(save_path, exist_ok=True)  # open(..., "w") does not create missing directories


def _dump_part(records, part_index):
    """Write `records` to `<save_path>/part<part_index>.json` as indented UTF-8 JSON."""
    with open(f"{save_path}/part{part_index}.json", "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)


# enumerate() has no length, so pass `total` explicitly to get a percentage/ETA from tqdm.
for i, chunk in tqdm(enumerate(chunked_author), total=len(chunked_author)):
    success = False
    retry_count = 0

    while not success and retry_count < max_retries:
        try:
            r_a = Authors()[chunk]
            results.extend(r_a)
            success = True  # stop retrying once the request succeeded
        except Exception as e:
            # Deliberately broad: any failure of the request is retried.
            retry_count += 1
            wait_time = 5 * retry_count  # wait longer after each failed attempt
            print(f"\tError 발생 ({retry_count}/{max_retries}) - {wait_time}초 대기 후 재시도...")
            print(f"\t  {type(e).__name__}: {e}")  # surface the cause instead of discarding it
            time.sleep(wait_time)

    if not success:
        print(f"[@@@] chunk {i} 처리 실패 - 다음 chunk로 넘어감.")
        failed_chunks.append(i)  # remember what is missing from the output
        continue

    if len(results) >= flush_threshold:
        _dump_part(results, part_num)
        print(f"\tSaved part {part_num} ({len(results)} records)")
        part_num += 1
        results = []
    time.sleep(1)  # fixed pause between successful requests

# Write whatever is still buffered after the last chunk.
if results:
    _dump_part(results, part_num)
    print(f"\tSaved final part {part_num} ({len(results)} records)")

if failed_chunks:
    print(f"[@@@] {len(failed_chunks)} chunk(s) failed after {max_retries} retries: {failed_chunks}")

801it [52:20,  4.15s/it]

	Saved part 1 (80054 records)


1223it [1:22:35,  4.38s/it]

	Error 발생 (1/5) - 5초 대기 후 재시도...


1307it [1:29:03,  4.27s/it]

	Error 발생 (1/5) - 5초 대기 후 재시도...


1603it [1:52:34,  5.10s/it]

	Saved part 2 (80049 records)


2405it [2:50:35,  4.18s/it]

	Saved part 3 (80063 records)


3207it [3:42:45,  3.48s/it]

	Saved part 4 (80059 records)


4009it [4:34:09,  3.55s/it]

	Saved part 5 (80031 records)


4541it [5:08:22,  4.07s/it]


	Saved final part 6 (52968 records)


# Transform: 필요한 정보만 CSV로 저장

In [2]:
# Show what is in the raw-data folder: the bare listing first,
# then each entry joined onto the folder path.
work_data_path = "./raw_json_data/authors_based_works"
dir_entries = os.listdir(work_data_path)
print(dir_entries)
for entry in dir_entries:
    print(os.path.join(work_data_path, entry))

['part1.json', 'part2.json', 'part3.json', 'part4.json', 'part5.json', 'part6.json']
./raw_json_data/authors_based_works\part1.json
./raw_json_data/authors_based_works\part2.json
./raw_json_data/authors_based_works\part3.json
./raw_json_data/authors_based_works\part4.json
./raw_json_data/authors_based_works\part5.json
./raw_json_data/authors_based_works\part6.json


In [17]:
import os
import json
import pandas as pd
from tqdm import tqdm
import ijson

In [18]:
# Folder holding the downloaded JSON part files; print its entries as a quick check.
work_data_path = "./raw_json_data/authors_based_works"
dir_entries = os.listdir(work_data_path)
print(dir_entries)

['part1.json', 'part2.json', 'part3.json', 'part4.json', 'part5.json', 'part6.json']


In [19]:
def _openalex_short_id(url):
    """Return the last '/'-separated segment of `url`, stripped of surrounding whitespace."""
    return url.split('/')[-1].strip()


def _author_row(d):
    """Flatten one author record into the 6-tuple stored in `author_df`.

    The last element is the short id of the first entry of
    `last_known_institutions`, or None when that field is missing, null or empty.
    """
    institutions = d.get('last_known_institutions')
    # `not institutions` covers [], None and a missing key; the original `== []`
    # test crashed with a TypeError on None.
    lki = _openalex_short_id(institutions[0]['id']) if institutions else None
    return (
        _openalex_short_id(d['id']),
        d['display_name'],
        d['orcid'],
        d['works_count'],
        d['cited_by_count'],
        lki,
    )


all_authors = []
# os.listdir returns entries in arbitrary order and may include non-JSON entries;
# keep only the *.json files and sort them so the row order is reproducible.
part_files = sorted(name for name in os.listdir(work_data_path) if name.endswith(".json"))
for filename in tqdm(part_files):
    full_filename = os.path.join(work_data_path, filename)

    # Binary mode: ijson parses bytes (the part files are UTF-8 encoded JSON).
    with open(full_filename, 'rb') as f:
        # "item" selects each element of the top-level array, streamed one at a time
        # so a whole part file never has to be held in memory.
        for d in ijson.items(f, "item"):
            all_authors.append(_author_row(d))

author_df = pd.DataFrame(all_authors, columns=['author_id', 'author_name', 'orcid', 'works_count', 'cited_by_count', 'institution_id'])

100%|██████████| 6/6 [01:45<00:00, 17.62s/it]


In [20]:
# Sanity check: (rows, columns) of the assembled author table.
author_df.shape

(453224, 6)

In [21]:
# Preview the first rows to eyeball the extracted columns.
author_df.head()

Unnamed: 0,author_id,author_name,orcid,works_count,cited_by_count,institution_id
0,A5022764544,Jian Ping Gong,https://orcid.org/0000-0003-2228-2750,803,41033,I4210136497
1,A5057784648,Shinya Tanaka,https://orcid.org/0000-0001-6470-3301,750,18477,I60134161
2,A5100637638,Feng Ding,https://orcid.org/0000-0002-2721-2025,719,28316,I4210105877
3,A5000921316,Bruce G. Pollock,https://orcid.org/0000-0003-0802-3998,697,29893,I1338135719
4,A5067842712,Yu Shrike Zhang,https://orcid.org/0000-0002-0045-0808,584,40019,I4210157861


In [23]:
# Persist the author table as UTF-8 CSV without the DataFrame index column.
author_csv_path = "./transform_csv_data/author.csv"
author_df.to_csv(author_csv_path, encoding='utf-8', index=False)