In [1]:
import os
import json
import pandas as pd
from rapidfuzz import process, fuzz

In [2]:
# Input locations used by this notebook.
# NOTE(review): data_folder is not referenced by any cell visible here; the
# JSON update loop reads from "data/<year>" instead — confirm which is intended.
data_folder = "raw_data"
# CSV read into core_df; its "Rank" column is used for conference venues.
core_csv = "CORE.csv"
# Semicolon-separated CSV read into sjr_df; its "SJR Best Quartile" column is
# used for journal venues.
sjr_csv = "scimagojr 2024.csv"

In [3]:
# Load both reference tables. The second file is semicolon-delimited, hence
# the explicit separator.
core_df = pd.read_csv(core_csv)
sjr_df = pd.read_csv(sjr_csv, sep=';')

In [4]:
# Consecutive years 2017 through 2024 (inclusive), echoed for inspection.
# NOTE(review): the JSON update loop iterates range(2012, 2025) rather than
# this list — confirm which span is intended.
years = [*range(2017, 2025)]
years

[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

In [5]:
# Column names of core_df
core_df.columns

Index(['id', 'Title', 'Acronym', 'Source', 'Rank', 'Note', 'Primarily FoR',
       'Unknown1', 'Unknown2'],
      dtype='object')

In [6]:
# Normalise titles in both reference tables (trim surrounding whitespace,
# lowercase) so they compare directly against the lowercased venue names used
# for fuzzy matching. Missing titles stay missing here; the following cell
# repeats this normalisation with an explicit string cast.
core_df["Title"] = core_df["Title"].str.strip().str.lower()
sjr_df["Title"] = sjr_df["Title"].str.strip().str.lower()

In [7]:
# Same normalisation as above, but cast to str first so every title is a
# plain string before it is used as a fuzzy-match candidate.
# NOTE(review): astype(str) presumably renders missing titles as the literal
# text "nan", which would then be a matchable candidate — confirm whether
# either table contains missing titles.
core_df["Title"] = core_df["Title"].astype(str).str.strip().str.lower()
sjr_df["Title"] = sjr_df["Title"].astype(str).str.strip().str.lower()

In [8]:
# Distribution of quartile labels before cleaning; shows which placeholder
# values are present alongside Q1–Q4.
sjr_df["SJR Best Quartile"].value_counts()

SJR Best Quartile
Q1    9160
Q2    7707
Q3    6928
Q4    6406
-      935
Name: count, dtype: int64

In [9]:
# Treat the "-" placeholder as "no quartile" so lookups return a falsy value.
# NOTE(review): this relies on pandas treating an explicit value=None as the
# replacement value; older pandas releases handled a None value differently —
# confirm against the installed version.
sjr_df["SJR Best Quartile"] = sjr_df["SJR Best Quartile"].replace("-", None)

In [10]:
# Rank labels that are kept; every other value in the Rank column becomes None.
valid_ranks = ["C", "B", "A", "A*"]

core_df["Rank"] = core_df["Rank"].apply(
    lambda rank: None if rank not in valid_ranks else rank
)

In [11]:
# Count of each remaining rank label (None entries are not counted).
print(core_df["Rank"].value_counts())

Rank
C     362
B     221
A     117
A*     60
Name: count, dtype: int64


In [12]:
def find_ranking(venue_name, venue_type, threshold=80):
    """Look up a venue's ranking by fuzzy-matching its name with RapidFuzz.

    Conferences are matched against ``core_df`` (rank column ``"Rank"``) and
    journals against ``sjr_df`` (rank column ``"SJR Best Quartile"``). Both
    tables are read from the notebook's global namespace at call time, and
    candidates are compared with ``fuzz.token_sort_ratio``.

    Args:
        venue_name: Venue name to search for; it is stringified, stripped and
            lowercased before matching.
        venue_type: ``"conference"`` or ``"journal"`` (case and surrounding
            whitespace are ignored). Any other value yields no match.
        threshold: Minimum similarity score (0-100) a candidate must reach.

    Returns:
        A ``(rank, score, matched_title)`` tuple. ``rank`` is ``None`` when
        the matched row has no usable rank. When the venue type is unknown or
        no candidate reaches ``threshold``, the result is
        ``(None, None, None)``.
    """
    # One entry per supported venue type: (reference table, rank column).
    sources = {
        "conference": (core_df, "Rank"),
        "journal": (sjr_df, "SJR Best Quartile"),
    }
    source = sources.get(str(venue_type).strip().lower())
    if source is None:
        return None, None, None
    table, rank_column = source

    query = str(venue_name).strip().lower()
    # score_cutoff makes extractOne return None when nothing reaches the
    # threshold, so no separate score comparison is needed afterwards.
    best_match = process.extractOne(
        query,
        table["Title"].tolist(),
        scorer=fuzz.token_sort_ratio,
        score_cutoff=threshold,
    )
    if not best_match:
        return None, None, None

    # For a list of choices extractOne reports the position of the winning
    # candidate, which lines up with the table's positional row order.
    match_title, score, position = best_match
    rank = table[rank_column].iloc[position]
    # A missing rank may be stored as NaN, which is truthy; normalise it to
    # None so callers can rely on a plain truthiness check.
    if pd.isna(rank):
        rank = None
    return rank, score, match_title

In [14]:
# === Walk every JSON file under data/<year>/ and attach a venue ranking ===
# For each record the venue name/type is fuzzy-matched via find_ranking and,
# when a rank is found, stored under data["venue"]["ranking"]. Every processed
# file is written back in place. count_not_found tallies records that had no
# usable venue name/type or no rank.
count_not_found = 0
# Many records share the same venue, so each (name, type) pair is looked up
# once and the result reused instead of repeating the fuzzy search per file.
ranking_cache = {}
for year in range(2012, 2025):
    data_folder_year = f"data/{year}"
    for filename in os.listdir(data_folder_year):
        # Only JSON records are processed; other directory entries (for
        # example checkpoint folders) would otherwise fail to open or parse.
        if not filename.lower().endswith(".json"):
            continue
        file_path = os.path.join(data_folder_year, filename)

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        venue = data.get("venue", {})
        if venue is None:
            # An explicit null venue is replaced by an empty placeholder. The
            # code then falls through to the write below so the placeholder
            # is actually saved (previously this branch skipped the write).
            data["venue"] = {"name": None, "type": None, "ranking": None}
        else:
            name = venue.get("name")
            vtype = venue.get("type")

            if name and vtype:
                cache_key = (str(name), str(vtype))
                if cache_key not in ranking_cache:
                    ranking_cache[cache_key] = find_ranking(name, vtype)
                rank, score, matched_title = ranking_cache[cache_key]

                if rank:
                    data["venue"]["ranking"] = rank
                    print(f"✅ {filename}: [{vtype}] '{name}' → '{matched_title}' | Rank: {rank} (Score={score:.1f})")
                else:
                    print(f"⚠️ {filename}: Không tìm thấy match phù hợp cho '{name}'",end=" ")
                    count_not_found += 1
                    # A score without a rank means a title matched but the
                    # matched row carries no usable rank.
                    if score:
                        print(f"(Score={score:.1f})")
                    else:
                        print()
            else:
                print(f"⚠️ {filename}: Khong ton tai loai hinh cong bo")
                count_not_found += 1

        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

# Report completion once, after every year has been processed.
print("\n🎯 Hoàn tất cập nhật ranking cho các file JSON.")
print(count_not_found)

✅ 1212.2518.json: [conference] 'Conference on Uncertainty in Artificial Intelligence' → 'conference in uncertainty in artificial intelligence' | Rank: A (Score=94.2)
✅ 1212.2511.json: [conference] 'Conference on Uncertainty in Artificial Intelligence' → 'conference in uncertainty in artificial intelligence' | Rank: A (Score=94.2)
⚠️ 1211.5625.json: Khong ton tai loai hinh cong bo
✅ 1212.2480.json: [conference] 'Conference on Uncertainty in Artificial Intelligence' → 'conference in uncertainty in artificial intelligence' | Rank: A (Score=94.2)
⚠️ 1212.4674.json: Khong ton tai loai hinh cong bo
⚠️ 1212.3719.json: Khong ton tai loai hinh cong bo
✅ 1212.2500.json: [conference] 'Conference on Uncertainty in Artificial Intelligence' → 'conference in uncertainty in artificial intelligence' | Rank: A (Score=94.2)
✅ 1211.6566.json: [journal] 'IEEE Transactions on Wireless Communications' → 'ieee transactions on wireless communications' | Rank: Q1 (Score=100.0)
⚠️ 1212.1909.json: Khong ton tai l