# Description

In this notebook, I read and explore the Semantic Scholar dataset.

In [1]:
import os 
import time
import gc
import requests
import json
from tqdm import tqdm
import gzip
from zipfile import ZipFile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from urllib.parse import urlparse

# 1. Read Venue information

In [None]:
# Folder that holds the raw Semantic Scholar dump files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH_FOLDER_DATA = '/home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data'

# Venue dump file inside that folder (opened with gzip by the loading cell).
PATH_FILE_VENUE_INFO = os.path.join(
    PATH_FOLDER_DATA,
    '20250822_121946_00119_nihdq_f267a37e-fb99-4619-bf9c-ac33bb4ff585.gz',
)

In [None]:
# Read the venue dump: the gzipped text file is parsed as one JSON document per line.
with gzip.open(PATH_FILE_VENUE_INFO, 'rt', encoding='utf-8') as f:
    venue_data = list(map(json.loads, f))
print(f"Number of venues: {len(venue_data)}")

In [None]:
# Build a DataFrame from the parsed venue records (one row per record;
# assumes each record is a JSON object — TODO confirm), report its shape,
# and display the first row as a quick sanity check.
venue_df = pd.DataFrame(venue_data)
print(f"Shape of venue DataFrame: {venue_df.shape}")
venue_df.head(1)

In [None]:
# Filler words that are removed from venue names before comparison.
STOPWORDS = {
    "the", "a", "an", "of", "on", "in", "for", "and", "to", "at",
    "is", "are", "about", "with", "by", "from", "as", "into", "via",
}

def normalize_venue(s: str) -> str:
    """Return an order-insensitive canonical key for a venue name.

    The name is lower-cased, "&" is spelled out as "and", parenthetical
    parts and standalone 4-digit numbers (e.g. years) are removed, and the
    remaining text is split into runs of ASCII letters/digits. Stopwords
    are discarded and the unique tokens are joined in sorted order, so two
    names that differ only in word order, repetition, punctuation or filler
    words yield the same key.

    Args:
        s: Raw venue name. Any non-string value (e.g. None or NaN) is
            treated as missing.

    Returns:
        The space-joined, sorted, de-duplicated tokens; "" for non-string
        input or when no token survives.
    """
    if not isinstance(s, str):
        return ""

    text = s.lower().strip().replace("&", " and ")

    # Each pattern is blanked out with a single space, in this order.
    for pattern in (
        r"\(.*?\)",        # parenthetical parts
        r"\b\d{4}\b",      # standalone 4-digit numbers such as 2025
        r"[^a-z0-9\s]",    # everything except a-z, 0-9 and whitespace
    ):
        text = re.sub(pattern, " ", text)

    unique_tokens = set(re.findall(r"[a-z0-9]+", text))

    # Set difference drops the stopwords; sorting makes the key independent
    # of the original word order.
    return " ".join(sorted(unique_tokens - STOPWORDS))

In [None]:
# Look up the venues whose normalized name equals the normalized target name.
# (Another name to try: 'Computer Vision and Pattern Recognition'.)
name_journal = normalize_venue('International Conference on Machine Learning')

is_target_venue = venue_df['name'].apply(lambda venue_name: normalize_venue(venue_name) == name_journal)
venue_df[is_target_venue]

In [None]:
# Persist the venue table as a parquet file inside the raw-data folder
# (the DataFrame index is not written).
path_venue_parquet = os.path.join(PATH_FOLDER_DATA, 'venue_info.parquet')
venue_df.to_parquet(path_venue_parquet, index=False)

# 2. Read paper info

## 2.1. Load papers from the raw files and save them as `parquet`

In [24]:
# Folder that holds the raw Semantic Scholar dump files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH_FOLDER_DATA = '/home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data'

# A single raw paper file inside that folder.
PATH_FILE_PAPER_DATA = os.path.join(
    PATH_FOLDER_DATA,
    '20250822_070444_00023_vs6f6_06af69c5-002b-4503-9557-233755e9ad9b.gz',
)

# Folder that receives the processed parquet files.
PATH_FOLDER_DATA_PROCESSED = '/home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data_processed'

# Keys copied from every raw paper record into the processed rows.
LIST_SELECTED_FIELDS = [
    'title', 'authors', 'year', 'citationcount',
    'venue', 'publicationvenueid', 'publicationdate',
]

In [42]:
# Names of all entries in the raw-data folder, sorted so that the file
# indices used further down are stable between runs.
list_file_downloaded = sorted(os.listdir(PATH_FOLDER_DATA))
print(f"Number of downloaded files: {len(list_file_downloaded)}")

Number of downloaded files: 60


In [None]:
# Maximum number of rows written to a single parquet chunk.
CHUNK_SIZE = 500_000


def _build_paper_row(line_json):
    """Project one raw paper record onto LIST_SELECTED_FIELDS plus `semantic_id`.

    Args:
        line_json (dict): One parsed JSON record from a raw paper file.

    Returns:
        dict: The selected fields (missing keys default to "") and
        `semantic_id`, the last path segment of the record's "url"
        ("" when the record has no url).
    """
    url = line_json.get("url", "")
    paper_id = urlparse(url).path.split("/")[-1] if url else ""

    row = {k: line_json.get(k, "") for k in LIST_SELECTED_FIELDS}
    row["semantic_id"] = paper_id
    return row


def _write_chunk(rows, idx_file, chunk_label):
    """Write `rows` to `paper_file_<idx_file>_chunk_<chunk_label>.parquet` and empty the buffer."""
    output_file_parquet_name = f"paper_file_{idx_file}_chunk_{chunk_label}.parquet"
    path_output_file_parquet = os.path.join(PATH_FOLDER_DATA_PROCESSED, output_file_parquet_name)
    pd.DataFrame(rows).to_parquet(path_output_file_parquet)
    rows.clear()
    time.sleep(3)  # avoid overwhelming the system
    gc.collect()


# The export fails on the first write if the output folder is missing.
os.makedirs(PATH_FOLDER_DATA_PROCESSED, exist_ok=True)

for idx_file, file_name in enumerate(list_file_downloaded):
    if not file_name.endswith('.gz'):
        continue
    path_file = os.path.join(PATH_FOLDER_DATA, file_name)
    print(f"{idx_file}. Processing file: {path_file}")

    rows = []
    n_chunks_written = 0
    n_lines_skipped = 0
    try:
        with gzip.open(path_file, 'rt', encoding='utf-8') as f:
            for line in f:
                # Only the per-line parsing is best-effort. Write failures are
                # deliberately NOT swallowed here, otherwise a broken chunk
                # would be lost without any trace.
                try:
                    rows.append(_build_paper_row(json.loads(line)))
                except (ValueError, TypeError, AttributeError):
                    # Malformed JSON, a non-object record, or an unparsable url.
                    n_lines_skipped += 1
                    continue

                # Flush on the buffer size rather than the line index, so every
                # chunk holds exactly CHUNK_SIZE rows even when lines were skipped.
                if len(rows) >= CHUNK_SIZE:
                    n_chunks_written += 1
                    _write_chunk(rows, idx_file, n_chunks_written)

        # Save any remaining rows
        if rows:
            _write_chunk(rows, idx_file, "final")

        if n_lines_skipped:
            print(f"Skipped {n_lines_skipped} unparsable line(s) in {path_file}")
        print(f"Finished processing file: {path_file} \n")
    except (gzip.BadGzipFile, OSError, EOFError) as e:
        # EOFError is what gzip raises for a truncated (partially downloaded) file.
        # Rows buffered for this file but not yet written are discarded.
        print(f"[fatal] Cannot read {path_file}: {e}. Skipping file.")
        continue

0. Processing file: /home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data/20250822_070444_00023_vs6f6_06af69c5-002b-4503-9557-233755e9ad9b.gz
Finished processing file: /home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data/20250822_070444_00023_vs6f6_06af69c5-002b-4503-9557-233755e9ad9b.gz 

1. Processing file: /home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data/20250822_070444_00023_vs6f6_06fa50bb-9d50-41ca-b2bc-c1fcacfd4785.gz
Finished processing file: /home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data/20250822_070444_00023_vs6f6_06fa50bb-9d50-41ca-b2bc-c1fcacfd4785.gz 

2. Processing file: /home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_scholar_data/20250822_070444_00023_vs6f6_07ac81a4-bf8d-4d9e-9c53-78687adb27a7.gz
Finished processing file: /home/thaiv7/Desktop/python-project/scientific_paper/dataset/data/semantic_sch

## 2.2. Load papers and query additional data

We fetch the list of references of a paper from the Semantic Scholar Graph API.

In [27]:
# Load one processed parquet file of papers and preview its first two rows.
# NOTE(review): the export loop in section 2.1 writes files named
# 'paper_file_<idx>_chunk_<n>.parquet'; confirm that 'papers_part_1.parquet'
# actually exists in the processed folder (it may come from an earlier run).
path_file_parquet = os.path.join(PATH_FOLDER_DATA_PROCESSED, 'papers_part_1.parquet')
df = pd.read_parquet(path_file_parquet)
print(f"DataFrame shape: {df.shape}")
df.head(2)

DataFrame shape: (500001, 8)


Unnamed: 0,title,authors,year,citationcount,venue,publicationvenueid,publicationdate,semantic_id
0,Refinement of the mental model of a solution b...,"[{'authorId': '2329932387', 'name': 'S. Chalyi...",2024.0,0,Management Information System and Devises,,2024-12-04,8b6591b716d5071785ebefd57aa14ef7a3dc495e
1,Late swing or early stance? A narrative review...,"[{'authorId': '1398183854', 'name': 'Claire Ke...",2019.0,90,Scandinavian Journal of Medicine & Science in ...,5712d901-2da7-4216-814d-2f143495e778,2019-08-01,954fdea23e1970fb0b20a6c9343a0e2179ea4e83


In [33]:
def get_references(paper_id, fields="paperId", limit=100, timeout=30):
    """
    Fetch the paperIds of all references of a Semantic Scholar paper.

    The references endpoint is paginated: each response carries one page of
    results and, when more results exist, a "next" offset. Pages are
    requested until "next" is absent, so papers with more than `limit`
    references are no longer silently truncated to the first page.

    Args:
        paper_id (str): Semantic Scholar Paper ID, DOI, ArXiv ID, etc.
        fields (str): Fields to request (default: "paperId").
        limit (int): Page size, i.e. max refs per request (see the API
            documentation for the allowed maximum).
        timeout (int): Per-request timeout in seconds.

    Returns:
        list[str]: Cited paperIds in the order returned by the API. Empty
        when the paper has no usable references or the first request fails.
        If a later page fails, the ids collected so far are returned.
        Failures are reported with a printed message, never raised.
    """
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references"
    collected = []
    offset = 0

    while True:
        params = {"fields": fields, "limit": limit, "offset": offset}

        try:
            resp = requests.get(url, params=params, timeout=timeout)
            resp.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error for paper {paper_id}: {e}")
            break
        except requests.exceptions.RequestException as e:
            print(f"Network error for paper {paper_id}: {e}")
            break

        try:
            data = resp.json()
        except ValueError:
            print(f"Invalid JSON for paper {paper_id}")
            break

        if not isinstance(data, dict):
            print(f"Unexpected response for paper {paper_id}")
            break

        # "data" may be missing or null; treat both as an empty page.
        refs = data.get("data") or []
        for ref in refs:
            cited = ref.get("citedPaper") if isinstance(ref, dict) else None
            pid = cited.get("paperId") if isinstance(cited, dict) else None
            # Entries without a citedPaper or with a null paperId are skipped.
            if pid:
                collected.append(pid)

        # Stop on the last page; the `next_offset <= offset` guard prevents an
        # endless loop should the API ever return a non-advancing offset.
        next_offset = data.get("next")
        if not refs or not isinstance(next_offset, int) or next_offset <= offset:
            break
        offset = next_offset

    return collected

In [34]:
# Smoke test: fetch the references of one paper by its Semantic Scholar id
# and display the resulting list of ids.
list_ref_paper = get_references("954fdea23e1970fb0b20a6c9343a0e2179ea4e83")
list_ref_paper

[]