In [74]:
import json
import pandas as pd

with open('./DATA/random_data.json') as file:
    data = json.load(file)

In [75]:
core_rows = []

for record in data:
    try:
        core = record['abstracts-retrieval-response']['coredata']
        core_rows.append({
            'Eid': core['eid'],
            'Title': core.get('dc:title', None),
            'Publish_year': core.get('prism:coverDate', '').split('-')[0],
            'Language': record['abstracts-retrieval-response'].get('language', {}).get('@xml:lang', None),
            'Cited_by_count': core.get('citedby-count', 0)
        })
    except Exception as e:
        pass
    
core_df = pd.DataFrame(core_rows)


In [76]:
author_rows = []

for record in data:
    try:
        eid = record['abstracts-retrieval-response']['coredata']['eid']
        authors = record['abstracts-retrieval-response']['authors']['author']
        for author in authors:
            author_rows.append({
                'Eid': eid,
                'Author_name': author.get('preferred-name', {}).get('ce:indexed-name', None),
                'Affiliations': author.get('affiliation', {}).get('affilname', None)
            })
    except Exception as e:
        pass

authors_df = pd.DataFrame(author_rows)


In [77]:
subject_rows = []

for record in data:
    try:
        eid = record['abstracts-retrieval-response']['coredata']['eid']
        subjects = record['abstracts-retrieval-response']['subject-areas']['subject-area']
        for subject in subjects:
            subject_rows.append({
                'Eid': eid,
                'Subject_areas': subject.get('$', None),
                'Subject_codes': subject.get('@code', None)
            })
    except Exception as e:
        pass

subjects_df = pd.DataFrame(subject_rows)


In [78]:
keyword_rows = []

for record in data:
    try:
        eid = record['abstracts-retrieval-response']['coredata']['eid']
        keywords = record['abstracts-retrieval-response']['item']['bibrecord']['head']['citation-info']['author-keywords']['author-keyword']
        for keyword in keywords:
            keyword_rows.append({
                'Eid': eid,
                'Keywords': keyword.get('$', None),
                'Keyword_language': keyword.get('@xml:lang', None)
            })
    except Exception as e:
        pass

keywords_df = pd.DataFrame(keyword_rows)


In [79]:
core_df.head()

Unnamed: 0,Eid,Title,Publish_year,Language,Cited_by_count
0,2-s2.0-85048179306,"Penicillium Imranianum, a new species from the...",2018,eng,8
1,2-s2.0-85051180271,Nephrology nurses' perceptions of discussing s...,2018,eng,8
2,2-s2.0-85056255685,Tire-suspension-steering hardware-in-the-loop ...,2018,eng,2
3,2-s2.0-85042708303,Elevated HPV16 E1 Expression Is Associated wit...,2018,eng,14
4,2-s2.0-85040707959,Characteristics of pericytes in diethylstilbes...,2018,eng,8


In [80]:
authors_df.head()

Unnamed: 0,Eid,Author_name,Affiliations
0,2-s2.0-85048179306,Ali F.,
1,2-s2.0-85048179306,Akbar A.,
2,2-s2.0-85048179306,Prasongsuk S.,
3,2-s2.0-85048179306,Permpornsakul P.,
4,2-s2.0-85048179306,Yanwisetpakdee B.,


In [81]:
subjects_df.head()

Unnamed: 0,Eid,Subject_areas,Subject_codes
0,2-s2.0-85048179306,Plant Science,1110
1,2-s2.0-85051180271,Nephrology,2727
2,2-s2.0-85051180271,Advanced and Specialized Nursing,2902
3,2-s2.0-85056255685,Engineering (all),2200
4,2-s2.0-85042708303,Virology,2406


In [82]:
keywords_df.head()

Unnamed: 0,Eid,Keywords,Keyword_language
0,2-s2.0-85048179306,Extremophiles,eng
1,2-s2.0-85048179306,Halophilic Fungi,eng
2,2-s2.0-85048179306,Hypersaline habitats,eng
3,2-s2.0-85048179306,Penicillium imranianum,eng
4,2-s2.0-85051180271,End stage kidney disease,eng


In [83]:
final_df = core_df.merge(authors_df, on='Eid', how='left') \
                  .merge(subjects_df, on='Eid', how='left') \
                  .merge(keywords_df, on='Eid', how='left')


In [84]:
final_df.head()

Unnamed: 0,Eid,Title,Publish_year,Language,Cited_by_count,Author_name,Affiliations,Subject_areas,Subject_codes,Keywords,Keyword_language
0,2-s2.0-85048179306,"Penicillium Imranianum, a new species from the...",2018,eng,8,Ali F.,,Plant Science,1110,Extremophiles,eng
1,2-s2.0-85048179306,"Penicillium Imranianum, a new species from the...",2018,eng,8,Ali F.,,Plant Science,1110,Halophilic Fungi,eng
2,2-s2.0-85048179306,"Penicillium Imranianum, a new species from the...",2018,eng,8,Ali F.,,Plant Science,1110,Hypersaline habitats,eng
3,2-s2.0-85048179306,"Penicillium Imranianum, a new species from the...",2018,eng,8,Ali F.,,Plant Science,1110,Penicillium imranianum,eng
4,2-s2.0-85048179306,"Penicillium Imranianum, a new species from the...",2018,eng,8,Akbar A.,,Plant Science,1110,Extremophiles,eng


In [85]:
final_df.to_csv('scopus_data.csv', index=False)

In [86]:
print("จำนวนรายการในแต่ละ DataFrame:")
print(f"Core Data: {len(core_df)}")
print(f"Authors Data: {len(authors_df)}")
print(f"Subjects Data: {len(subjects_df)}")
print(f"Keywords Data: {len(keywords_df)}")

จำนวนรายการในแต่ละ DataFrame:
Core Data: 200
Authors Data: 1360
Subjects Data: 476
Keywords Data: 804


In [87]:
print("\nตรวจสอบข้อมูลว่างในแต่ละ DataFrame:")
print("Missing values in core_df:\n", core_df.isnull().sum())
print("Missing values in authors_df:\n", authors_df.isnull().sum())
print("Missing values in subjects_df:\n", subjects_df.isnull().sum())
print("Missing values in keywords_df:\n", keywords_df.isnull().sum())


ตรวจสอบข้อมูลว่างในแต่ละ DataFrame:
Missing values in core_df:
 Eid               0
Title             0
Publish_year      0
Language          0
Cited_by_count    0
dtype: int64
Missing values in authors_df:
 Eid                0
Author_name        0
Affiliations    1360
dtype: int64
Missing values in subjects_df:
 Eid              0
Subject_areas    0
Subject_codes    0
dtype: int64
Missing values in keywords_df:
 Eid                 0
Keywords            0
Keyword_language    0
dtype: int64


In [88]:
print("\nการกระจายตัวของปีที่ตีพิมพ์:")
publish_year_distribution = core_df['Publish_year'].value_counts().sort_index()
print(publish_year_distribution)


การกระจายตัวของปีที่ตีพิมพ์:
Publish_year
2018    34
2019    27
2020    39
2021    29
2022    48
2023    23
Name: count, dtype: int64


In [89]:
print("\nข้อมูลการถูกอ้างอิง:")
cited_by_summary = core_df['Cited_by_count'].describe()
print(cited_by_summary)


ข้อมูลการถูกอ้างอิง:
count     200
unique     36
top         0
freq       46
Name: Cited_by_count, dtype: object


In [90]:
print("\nจำนวนผู้แต่งที่ไม่ซ้ำกัน:")
unique_authors = authors_df['Author_name'].nunique()
print(f"มีผู้แต่งที่ไม่ซ้ำกันทั้งหมด: {unique_authors}")


จำนวนผู้แต่งที่ไม่ซ้ำกัน:
มีผู้แต่งที่ไม่ซ้ำกันทั้งหมด: 1296


In [91]:
print("\nการวิเคราะห์คีย์เวิร์ด:")
keyword_counts = keywords_df['Keywords'].value_counts().head(10)
print("Top 10 คีย์เวิร์ดที่พบบ่อย:")
print(keyword_counts)


การวิเคราะห์คีย์เวิร์ด:
Top 10 คีย์เวิร์ดที่พบบ่อย:
Keywords
Thailand                  7
HIV                       3
Mechanism                 2
antiretroviral therapy    2
Wound dressing            2
Bangkok                   2
Inflammation              2
Hydrogel                  2
COVID-19                  2
Oxidative stress          2
Name: count, dtype: int64


In [92]:
print("\nความหลากหลายของภาษาใน Keywords:")
keyword_language_counts = keywords_df['Keyword_language'].value_counts()
print(keyword_language_counts)


ความหลากหลายของภาษาใน Keywords:
Keyword_language
eng    804
Name: count, dtype: int64


In [93]:
print("\nการวิเคราะห์สาขาวิชา:")
subject_areas_counts = subjects_df['Subject_areas'].value_counts()
print("Top สาขาวิชาที่พบบ่อย:")
print(subject_areas_counts.head(10))


การวิเคราะห์สาขาวิชา:
Top สาขาวิชาที่พบบ่อย:
Subject_areas
Computer Science Applications           13
Chemistry (all)                         11
Multidisciplinary                       11
Engineering (all)                       10
Medicine (all)                           9
Chemical Engineering (all)               9
Polymers and Plastics                    8
Condensed Matter Physics                 8
Materials Science (all)                  8
Computer Networks and Communications     8
Name: count, dtype: int64
