In [1]:
import json
import pandas as pd

with open('./DATA/all_data.json') as file:
    data = json.load(file)

In [2]:
core_rows = []

for record in data:
    try:
        core = record['abstracts-retrieval-response']['coredata']
        core_rows.append({
            'Eid': core['eid'],
            'Title': core.get('dc:title', None),
            'Publish_year': core.get('prism:coverDate', '').split('-')[0],
            'Language': record['abstracts-retrieval-response'].get('language', {}).get('@xml:lang', None),
            'Cited_by_count': core.get('citedby-count', 0)
        })
    except Exception as e:
        pass
    
core_df = pd.DataFrame(core_rows)


In [3]:
author_rows = []

for record in data:
    try:
        eid = record['abstracts-retrieval-response']['coredata']['eid']
        authors = record['abstracts-retrieval-response']['authors']['author']
        for author in authors:
            author_rows.append({
                'Eid': eid,
                'Author_name': author.get('preferred-name', {}).get('ce:indexed-name', None),
                'Affiliations': author.get('affiliation', {}).get('affilname', None)
            })
    except Exception as e:
        pass

authors_df = pd.DataFrame(author_rows)


In [4]:
subject_rows = []

for record in data:
    try:
        eid = record['abstracts-retrieval-response']['coredata']['eid']
        subjects = record['abstracts-retrieval-response']['subject-areas']['subject-area']
        for subject in subjects:
            subject_rows.append({
                'Eid': eid,
                'Subject_areas': subject.get('$', None),
                'Subject_codes': subject.get('@code', None)
            })
    except Exception as e:
        pass

subjects_df = pd.DataFrame(subject_rows)


In [5]:
keyword_rows = []

for record in data:
    try:
        eid = record['abstracts-retrieval-response']['coredata']['eid']
        keywords = record['abstracts-retrieval-response']['item']['bibrecord']['head']['citation-info']['author-keywords']['author-keyword']
        for keyword in keywords:
            keyword_rows.append({
                'Eid': eid,
                'Keywords': keyword.get('$', None),
                'Keyword_language': keyword.get('@xml:lang', None)
            })
    except Exception as e:
        pass

keywords_df = pd.DataFrame(keyword_rows)


In [6]:
core_df.head()

Unnamed: 0,Eid,Title,Publish_year,Language,Cited_by_count
0,2-s2.0-85053164279,Effects of iron content on the microstructure ...,2018,eng,8
1,2-s2.0-85049101440,The critical factors of research and innovatio...,2018,eng,33
2,2-s2.0-85054140369,Is the occiput-wall distance valid and reliabl...,2018,eng,8
3,2-s2.0-85097515350,Comparison of soil composition between farmlan...,2018,eng,0
4,2-s2.0-85041527766,The impact of wire caliber on ERCP outcomes: a...,2018,eng,12


In [7]:
authors_df.head()

Unnamed: 0,Eid,Author_name,Affiliations
0,2-s2.0-85053164279,Gao X.,
1,2-s2.0-85053164279,Xia C.,
2,2-s2.0-85053164279,Zhang X.,
3,2-s2.0-85053164279,Jing Q.,
4,2-s2.0-85053164279,Ma M.,


In [8]:
subjects_df.head()

Unnamed: 0,Eid,Subject_areas,Subject_codes
0,2-s2.0-85053164279,Materials Science (all),2500
1,2-s2.0-85053164279,Condensed Matter Physics,3104
2,2-s2.0-85049101440,Business and International Management,1403
3,2-s2.0-85049101440,"Economics, Econometrics and Finance (all)",2000
4,2-s2.0-85054140369,"Physical Therapy, Sports Therapy and Rehabilit...",3612


In [9]:
keywords_df.head()

Unnamed: 0,Eid,Keywords,Keyword_language
0,2-s2.0-85053164279,EIS,eng
1,2-s2.0-85053164279,Microstructure,eng
2,2-s2.0-85053164279,Pitting corrosion,eng
3,2-s2.0-85053164279,Polarization,eng
4,2-s2.0-85053164279,Titanium alloy,eng


In [10]:
final_df = core_df.merge(authors_df, on='Eid', how='left') \
                  .merge(subjects_df, on='Eid', how='left') \
                  .merge(keywords_df, on='Eid', how='left')


In [11]:
final_df.head()

Unnamed: 0,Eid,Title,Publish_year,Language,Cited_by_count,Author_name,Affiliations,Subject_areas,Subject_codes,Keywords,Keyword_language
0,2-s2.0-85053164279,Effects of iron content on the microstructure ...,2018,eng,8,Gao X.,,Materials Science (all),2500,EIS,eng
1,2-s2.0-85053164279,Effects of iron content on the microstructure ...,2018,eng,8,Gao X.,,Materials Science (all),2500,Microstructure,eng
2,2-s2.0-85053164279,Effects of iron content on the microstructure ...,2018,eng,8,Gao X.,,Materials Science (all),2500,Pitting corrosion,eng
3,2-s2.0-85053164279,Effects of iron content on the microstructure ...,2018,eng,8,Gao X.,,Materials Science (all),2500,Polarization,eng
4,2-s2.0-85053164279,Effects of iron content on the microstructure ...,2018,eng,8,Gao X.,,Materials Science (all),2500,Titanium alloy,eng


In [12]:
final_df.to_csv('scopus_data.csv', index=False)

In [13]:
print("จำนวนรายการในแต่ละ DataFrame:")
print(f"Core Data: {len(core_df)}")
print(f"Authors Data: {len(authors_df)}")
print(f"Subjects Data: {len(subjects_df)}")
print(f"Keywords Data: {len(keywords_df)}")

จำนวนรายการในแต่ละ DataFrame:
Core Data: 20096
Authors Data: 121711
Subjects Data: 50064
Keywords Data: 83032


In [14]:
print("\nตรวจสอบข้อมูลว่างในแต่ละ DataFrame:")
print("Missing values in core_df:\n", core_df.isnull().sum())
print("Missing values in authors_df:\n", authors_df.isnull().sum())
print("Missing values in subjects_df:\n", subjects_df.isnull().sum())
print("Missing values in keywords_df:\n", keywords_df.isnull().sum())


ตรวจสอบข้อมูลว่างในแต่ละ DataFrame:
Missing values in core_df:
 Eid               0
Title             0
Publish_year      0
Language          0
Cited_by_count    4
dtype: int64
Missing values in authors_df:
 Eid                  0
Author_name          0
Affiliations    121711
dtype: int64
Missing values in subjects_df:
 Eid              0
Subject_areas    0
Subject_codes    0
dtype: int64
Missing values in keywords_df:
 Eid                 0
Keywords            0
Keyword_language    0
dtype: int64


In [15]:
print("\nการกระจายตัวของปีที่ตีพิมพ์:")
publish_year_distribution = core_df['Publish_year'].value_counts().sort_index()
print(publish_year_distribution)


การกระจายตัวของปีที่ตีพิมพ์:
Publish_year
2018    2784
2019    3063
2020    3373
2021    3787
2022    4227
2023    2862
Name: count, dtype: int64


In [16]:
print("\nข้อมูลการถูกอ้างอิง:")
cited_by_summary = core_df['Cited_by_count'].describe()
print(cited_by_summary)


ข้อมูลการถูกอ้างอิง:
count     20092
unique      211
top           0
freq       5251
Name: Cited_by_count, dtype: object


In [17]:
print("\nจำนวนผู้แต่งที่ไม่ซ้ำกัน:")
unique_authors = authors_df['Author_name'].nunique()
print(f"มีผู้แต่งที่ไม่ซ้ำกันทั้งหมด: {unique_authors}")


จำนวนผู้แต่งที่ไม่ซ้ำกัน:
มีผู้แต่งที่ไม่ซ้ำกันทั้งหมด: 30819


In [18]:
print("\nการวิเคราะห์คีย์เวิร์ด:")
keyword_counts = keywords_df['Keywords'].value_counts().head(10)
print("Top 10 คีย์เวิร์ดที่พบบ่อย:")
print(keyword_counts)


การวิเคราะห์คีย์เวิร์ด:
Top 10 คีย์เวิร์ดที่พบบ่อย:
Keywords
Thailand                                  746
COVID-19                                  282
Hadron-Hadron scattering (experiments)    152
Inflammation                              130
SARS-CoV-2                                115
HIV                                       114
CMS                                        98
Oxidative stress                           83
Asia                                       81
Machine learning                           78
Name: count, dtype: int64


In [19]:
print("\nความหลากหลายของภาษาใน Keywords:")
keyword_language_counts = keywords_df['Keyword_language'].value_counts()
print(keyword_language_counts)


ความหลากหลายของภาษาใน Keywords:
Keyword_language
eng    83024
tha        5
bos        3
Name: count, dtype: int64


In [20]:
print("\nการวิเคราะห์สาขาวิชา:")
subject_areas_counts = subjects_df['Subject_areas'].value_counts()
print("Top สาขาวิชาที่พบบ่อย:")
print(subject_areas_counts.head(10))


การวิเคราะห์สาขาวิชา:
Top สาขาวิชาที่พบบ่อย:
Subject_areas
Multidisciplinary                                       1088
Materials Science (all)                                  907
Chemistry (all)                                          905
Chemical Engineering (all)                               755
Infectious Diseases                                      753
Computer Science Applications                            725
Electrical and Electronic Engineering                    705
Computer Networks and Communications                     699
Medicine (all)                                           680
Public Health, Environmental and Occupational Health     664
Name: count, dtype: int64
