In [1]:
import os
import json
import pandas as pd
from rapidfuzz import process, fuzz

In [2]:
# Input locations used by the loading cell below.
# NOTE(review): data_folder is not referenced in the visible cells — confirm it is still needed.
data_folder = "raw_data"
core_csv = "CORE.csv"  # read into core_df; find_ranking uses it for "conference" venues
sjr_csv = "scimagojr 2024.csv"  # read into sjr_df; find_ranking uses it for "journal" venues

In [3]:
# Load both ranking tables into DataFrames.
core_df = pd.read_csv(core_csv)
# This file is read with ';' as the field separator.
sjr_df = pd.read_csv(sjr_csv, sep=';')

In [4]:
# Years 2017 through 2024, both ends included (range() excludes its stop value).
years = [*range(2017, 2025)]
years

[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

In [5]:
# Inspect the column names of core_df.
core_df.columns

Index(['id', 'Title', 'Acronym', 'Source', 'Rank', 'Note', 'Primarily FoR',
       'Unknown1', 'Unknown2'],
      dtype='object')

In [6]:
# Normalise the Title column of both tables in place: trim surrounding
# whitespace, then lowercase, so later title comparisons ignore case.
for _titled_frame in (core_df, sjr_df):
    _trimmed_titles = _titled_frame["Title"].str.strip()
    _titled_frame["Title"] = _trimmed_titles.str.lower()

In [7]:
# Force Title to plain strings, trimmed and lowercased, for fuzzy matching.
# Rows without a title are dropped first: astype(str) would otherwise turn a
# missing value into the literal text "nan", which would then be offered as a
# match candidate. assign() returns a fresh frame, so later column assignments
# on these frames do not operate on a slice of the original.
core_df = core_df.dropna(subset=["Title"]).assign(
    Title=lambda frame: frame["Title"].astype(str).str.strip().str.lower()
)
sjr_df = sjr_df.dropna(subset=["Title"]).assign(
    Title=lambda frame: frame["Title"].astype(str).str.strip().str.lower()
)

In [8]:
# Count how often each "SJR Best Quartile" label occurs.
sjr_df["SJR Best Quartile"].value_counts()

SJR Best Quartile
Q1    9160
Q2    7707
Q3    6928
Q4    6406
-      935
Name: count, dtype: int64

In [9]:
# Turn the "-" placeholder into a missing value.
# The dict form is used on purpose: a positional replace("-", None) depends on
# the pandas version — releases before 1.4 ignored the explicit None and
# forward-filled instead, which would hand a journal the quartile of the row
# above it.
sjr_df["SJR Best Quartile"] = sjr_df["SJR Best Quartile"].replace({"-": None})

In [10]:
# Ranks that are kept; every other value in the Rank column becomes None.
valid_ranks = ["C", "B", "A", "A*"]

core_df["Rank"] = core_df["Rank"].map(
    lambda rank_label: None if rank_label not in valid_ranks else rank_label
)

In [11]:
# Rank distribution after filtering; value_counts() leaves out missing ranks by default.
print(core_df["Rank"].value_counts())

Rank
C     362
B     221
A     117
A*     60
Name: count, dtype: int64


In [12]:
def _best_fuzzy_rank(table, rank_column, venue_name, threshold):
    """Return ``(rank, score, matched_title)`` for the best fuzzy title match in ``table``.

    Returns ``(None, None, None)`` when there is no candidate or the best
    score is below ``threshold``.
    """
    choices = table["Title"].tolist()
    best_match = process.extractOne(venue_name, choices, scorer=fuzz.token_sort_ratio)
    if not best_match or best_match[1] < threshold:
        return None, None, None

    match_title, score = best_match[0], best_match[1]
    # Same lookup as before: the first row whose title equals the matched title.
    rank = table.loc[table["Title"] == match_title, rank_column].values[0]
    # A missing rank (None/NaN) is reported as None so callers can rely on a
    # plain truthiness check; NaN on its own would count as truthy.
    if pd.isna(rank):
        rank = None
    return rank, score, match_title


def find_ranking(venue_name, venue_type, threshold=80):
    """Find a venue's ranking with RapidFuzz, based on its type and name.

    Args:
        venue_name: Venue name to look up; it is trimmed and lowercased
            before matching.
        venue_type: ``"conference"`` (matched against ``core_df`` and its
            ``Rank`` column) or ``"journal"`` (matched against ``sjr_df`` and
            its ``SJR Best Quartile`` column). Compared case-insensitively.
        threshold: Minimum ``fuzz.token_sort_ratio`` score for a match to be
            accepted.

    Returns:
        ``(rank, score, matched_title)``. ``rank`` is ``None`` when the
        matched row has no rank. All three are ``None`` when the name is
        empty, the type is not recognised, or no title reaches ``threshold``.
    """
    # str(None) would become the text "none" and be fuzzy-matched like a real name.
    if venue_name is None:
        return None, None, None
    venue_name = str(venue_name).strip().lower()
    if not venue_name:
        return None, None, None

    venue_kind = str(venue_type).strip().lower()
    if venue_kind == "conference":
        return _best_fuzzy_rank(core_df, "Rank", venue_name, threshold)
    if venue_kind == "journal":
        return _best_fuzzy_rank(sjr_df, "SJR Best Quartile", venue_name, threshold)

    return None, None, None

In [13]:
# === Walk every JSON file in the data/ folder and attach a venue ranking ===
data_folder_year = "data"
for filename in os.listdir(data_folder_year):
    if not filename.endswith(".json"):
        continue
    # These two file names are excluded explicitly.
    if filename in ("processed.json", "processing.json"):
        continue

    file_path = os.path.join(data_folder_year, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A missing or null "venue" gets an all-null placeholder. The placeholder
    # is stored on `data` and the loop carries on to the write below, so the
    # change is persisted (an early `continue` here would skip the write).
    venue = data.get("venue")
    if venue is None:
        venue = {"name": None, "type": None, "ranking": None}
        data["venue"] = venue

    name = venue.get("name")
    vtype = venue.get("type")

    if name and vtype:
        rank, score, matched_title = find_ranking(name, vtype)

        # NaN is truthy, so missing values are ruled out explicitly before the
        # truthiness check; otherwise NaN would be written out as a ranking.
        has_rank = not pd.isna(rank) and bool(rank)
        if has_rank:
            data["venue"]["ranking"] = rank
            print(f"✅ {filename}: [{vtype}] '{name}' → '{matched_title}' | Rank: {rank} (Score={score:.1f})")
        else:
            print(f"⚠️ {filename}: Không tìm thấy match phù hợp cho '{name}'",end=" ")
            if score:
                print(f"(Score={score:.1f})")
            else:
                print()
    else:
        # Without both a name and a type there is nothing to look up.
        data["venue"]["ranking"] = None

    # Serialise before opening the file for writing: if serialisation fails,
    # the existing file is left intact instead of being truncated.
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(serialized)

print("\n🎯 Hoàn tất cập nhật ranking cho các file JSON.")

✅ 1912.11745.json: [journal] 'IEEE Transactions on Parallel and Distributed Systems' → 'ieee transactions on parallel and distributed systems' | Rank: Q1 (Score=100.0)
✅ 1712.02250.json: [conference] 'IEEE International Conference on Acoustics, Speech, and Signal Processing' → 'ieee international conference on acoustics, speech and signal processing' | Rank: B (Score=99.3)
⚠️ 1912.12673.json: Không tìm thấy match phù hợp cho 'SoutheastCon' 
⚠️ 1712.07558.json: Không tìm thấy match phù hợp cho 'Neural Information Processing Systems' 
✅ 1912.12101.json: [conference] 'IEEE International Conference on Robotics and Automation' → 'ieee international conference on robotics and automation' | Rank: A* (Score=100.0)
⚠️ 1912.11494.json: Không tìm thấy match phù hợp cho 'IEEE International Symposium on Biomedical Imaging' 
✅ 1712.04228.json: [journal] 'Information Processing Letters' → 'information processing letters' | Rank: Q3 (Score=100.0)
✅ 1712.08900.json: [journal] 'Computer Vision and Image