In [None]:
import json
import pandas as pd

# Load the raw export into `data`.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (JSON interchange text is UTF-8).
with open('./DATA/all_data.json', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Build `core_df`: one row per record, taken from the record's `coredata`
# section. Records that have no `coredata`/`eid` are skipped and counted
# instead of being swallowed by a blanket `except`.
core_rows = []
core_skipped = 0  # records without an 'abstracts-retrieval-response' / 'coredata' / 'eid'

for record in data:
    try:
        response = record['abstracts-retrieval-response']
        core = response['coredata']
        eid = core['eid']
    except (KeyError, TypeError):
        core_skipped += 1
        continue

    # Optional fields: a null or oddly-typed value must not discard the
    # whole record, so each one degrades to None on its own.
    cover_date = core.get('prism:coverDate')
    language = response.get('language')

    # Store the citation count as an int so the column is numeric rather
    # than a mix of raw values and the int default.
    try:
        cited_by = int(core.get('citedby-count', 0))
    except (TypeError, ValueError):
        cited_by = None

    core_rows.append({
        'Eid': eid,
        'Title': core.get('dc:title'),
        # Year is the part of the cover date before the first '-'.
        # A missing date yields None so that isnull() reports it.
        'Publish_year': (
            cover_date.split('-')[0]
            if isinstance(cover_date, str) and cover_date
            else None
        ),
        'Language': language.get('@xml:lang') if isinstance(language, dict) else None,
        'Cited_by_count': cited_by,
    })

# Explicit columns keep the schema stable even when no rows were collected.
core_df = pd.DataFrame(
    core_rows,
    columns=['Eid', 'Title', 'Publish_year', 'Language', 'Cited_by_count'],
)

if core_skipped:
    print(f"Skipped {core_skipped} record(s) without coredata/eid")


In [None]:
# Build `authors_df`: one row per (record, author).
# Records with no usable author list are counted, and a malformed author
# entry no longer aborts the remaining authors of the same record.
author_rows = []
authors_missing = 0  # records without an eid or a usable author list

for record in data:
    try:
        response = record['abstracts-retrieval-response']
        eid = response['coredata']['eid']
        authors = response['authors']['author']
    except (KeyError, TypeError):
        authors_missing += 1
        continue

    # NOTE(review): assumes a lone author may be stored as a single object
    # instead of a one-element list — confirm against the data.
    if isinstance(authors, dict):
        authors = [authors]
    elif not isinstance(authors, list):
        authors_missing += 1
        continue

    for author in authors:
        if not isinstance(author, dict):
            continue

        preferred_name = author.get('preferred-name')
        if not isinstance(preferred_name, dict):
            preferred_name = {}

        # 'affiliation' may be absent, a single object, or a list of objects.
        affiliations = author.get('affiliation')
        if isinstance(affiliations, dict):
            affiliations = [affiliations]
        elif not isinstance(affiliations, list):
            affiliations = []
        affil_names = [
            str(affil['affilname'])
            for affil in affiliations
            if isinstance(affil, dict) and affil.get('affilname')
        ]

        author_rows.append({
            'Eid': eid,
            'Author_name': preferred_name.get('ce:indexed-name'),
            # Several affiliations are joined into one cell.
            'Affiliations': '; '.join(affil_names) if affil_names else None,
        })

# Explicit columns keep 'Eid' available for the later merge even when empty.
authors_df = pd.DataFrame(author_rows, columns=['Eid', 'Author_name', 'Affiliations'])

if authors_missing:
    print(f"Records without author data: {authors_missing}")


In [None]:
# Build `subjects_df`: one row per (record, subject area).
# Records with no usable subject list are counted rather than silently ignored.
subject_rows = []
subjects_missing = 0  # records without an eid or a usable subject-area list

for record in data:
    try:
        response = record['abstracts-retrieval-response']
        eid = response['coredata']['eid']
        subjects = response['subject-areas']['subject-area']
    except (KeyError, TypeError):
        subjects_missing += 1
        continue

    # NOTE(review): assumes a lone subject area may be stored as a single
    # object instead of a one-element list — confirm against the data.
    if isinstance(subjects, dict):
        subjects = [subjects]
    elif not isinstance(subjects, list):
        subjects_missing += 1
        continue

    for subject in subjects:
        # Skip only the malformed entry; keep the rest of this record's subjects.
        if not isinstance(subject, dict):
            continue
        subject_rows.append({
            'Eid': eid,
            'Subject_areas': subject.get('$'),
            'Subject_codes': subject.get('@code'),
        })

# Explicit columns keep 'Eid' available for the later merge even when empty.
subjects_df = pd.DataFrame(subject_rows, columns=['Eid', 'Subject_areas', 'Subject_codes'])

if subjects_missing:
    print(f"Records without subject-area data: {subjects_missing}")


In [None]:
# Build `keywords_df`: one row per (record, author keyword).
# Records with no usable keyword list are counted rather than silently ignored.
keyword_rows = []
keywords_missing = 0  # records without an eid or a usable author-keyword list

for record in data:
    try:
        response = record['abstracts-retrieval-response']
        eid = response['coredata']['eid']
        citation_info = response['item']['bibrecord']['head']['citation-info']
        keywords = citation_info['author-keywords']['author-keyword']
    except (KeyError, TypeError):
        keywords_missing += 1
        continue

    # NOTE(review): assumes a lone keyword may be stored as a single object
    # instead of a one-element list — confirm against the data.
    if isinstance(keywords, dict):
        keywords = [keywords]
    elif not isinstance(keywords, list):
        keywords_missing += 1
        continue

    for keyword in keywords:
        # Skip only the malformed entry; keep the rest of this record's keywords.
        if not isinstance(keyword, dict):
            continue
        keyword_rows.append({
            'Eid': eid,
            'Keywords': keyword.get('$'),
            'Keyword_language': keyword.get('@xml:lang'),
        })

# Explicit columns keep 'Eid' available for the later merge even when empty.
keywords_df = pd.DataFrame(keyword_rows, columns=['Eid', 'Keywords', 'Keyword_language'])

if keywords_missing:
    print(f"Records without author keywords: {keywords_missing}")


In [None]:
# Preview the first five rows of the per-document table.
core_df.head(n=5)

In [None]:
# Preview the first five rows of the per-author table.
authors_df.head(n=5)

In [None]:
# Preview the first five rows of the per-subject-area table.
subjects_df.head(n=5)

In [None]:
# Preview the first five rows of the per-keyword table.
keywords_df.head(n=5)

In [None]:
# Left-join the three child tables onto the per-document table by 'Eid'.
# Each child table can hold several rows per Eid, so a document ends up with
# one row per author x subject x keyword combination.
final_df = (
    core_df
    .merge(authors_df, how='left', on='Eid')
    .merge(subjects_df, how='left', on='Eid')
    .merge(keywords_df, how='left', on='Eid')
)


In [None]:
# Preview the first five rows of the merged table.
final_df.head(n=5)

In [None]:
# Write the merged table to disk; the row index is not exported.
output_path = 'scopus_data.csv'
final_df.to_csv(output_path, index=False)

In [None]:
# Report how many rows each extracted table holds.
print("จำนวนรายการในแต่ละ DataFrame:")
for table_label, table in (
    ("Core Data", core_df),
    ("Authors Data", authors_df),
    ("Subjects Data", subjects_df),
    ("Keywords Data", keywords_df),
):
    print(f"{table_label}: {len(table)}")

In [None]:
# Report the per-column count of null values for each extracted table.
print("\nตรวจสอบข้อมูลว่างในแต่ละ DataFrame:")
for frame_name, frame in (
    ("core_df", core_df),
    ("authors_df", authors_df),
    ("subjects_df", subjects_df),
    ("keywords_df", keywords_df),
):
    print(f"Missing values in {frame_name}:\n", frame.isna().sum())

In [None]:
# Count documents per publication year, ordered by year label.
print("\nการกระจายตัวของปีที่ตีพิมพ์:")
year_counts = core_df['Publish_year'].value_counts()
publish_year_distribution = year_counts.sort_index()
print(publish_year_distribution)

In [None]:
# Summary statistics of the citation counts.
# The column is coerced to numeric first: if it holds strings (or a mix of
# strings and ints), describe() would report count/unique/top/freq rather than
# mean/std/quartiles. Values that cannot be parsed become NaN and are ignored.
print("\nข้อมูลการถูกอ้างอิง:")
cited_by_numeric = pd.to_numeric(core_df['Cited_by_count'], errors='coerce')
cited_by_summary = cited_by_numeric.describe()
print(cited_by_summary)

In [None]:
# Count distinct non-null author names across all records.
print("\nจำนวนผู้แต่งที่ไม่ซ้ำกัน:")
author_names = authors_df['Author_name']
unique_authors = author_names.nunique(dropna=True)
print(f"มีผู้แต่งที่ไม่ซ้ำกันทั้งหมด: {unique_authors}")

In [None]:
# Ten most frequent author keywords (exact-string matches).
print("\nการวิเคราะห์คีย์เวิร์ด:")
all_keyword_counts = keywords_df['Keywords'].value_counts()
keyword_counts = all_keyword_counts.iloc[:10]
print("Top 10 คีย์เวิร์ดที่พบบ่อย:")
print(keyword_counts)

In [None]:
# Frequency of each language tag attached to the author keywords.
print("\nความหลากหลายของภาษาใน Keywords:")
keyword_languages = keywords_df['Keyword_language']
keyword_language_counts = keyword_languages.value_counts()
print(keyword_language_counts)

In [None]:
# Frequency of each subject area; only the ten most common are printed.
print("\nการวิเคราะห์สาขาวิชา:")
subject_names = subjects_df['Subject_areas']
subject_areas_counts = subject_names.value_counts()
print("Top สาขาวิชาที่พบบ่อย:")
top_subject_areas = subject_areas_counts.head(n=10)
print(top_subject_areas)