In [6]:
import json
import pandas as pd
import os

# File paths
journal_info_path = r'C:\Users\ningji\Desktop\ai_policy\ai_policy\data\journalInfo.json'
doaj_path = r'C:\Users\ningji\Desktop\ai_policy\ai_policy\data\journalcsv__doaj_20250808_0928_utf8.csv'

# Read journal info file
print("Loading journal info data...")
with open(journal_info_path, 'r', encoding='utf-8') as f:
    journal_info = json.load(f)

# Read DOAJ data
print("Loading DOAJ data...")
doaj_df = pd.read_csv(doaj_path, encoding='utf-8')

# Extract journal titles from DOAJ for method 3
doaj_titles = set(doaj_df['Journal ISSN (print version)'].str.lower().tolist())

# Method 1: Any journal with "% of Citable OA" > 0% is OA
def method1(journals):
    oa_count = 0
    non_oa_count = 0
    
    for journal in journals:
        oa_percentage = journal.get("% of Citable OA", "0%")
        # Remove the % sign and convert to float
        try:
            oa_percentage = float(oa_percentage.strip('%'))
        except ValueError:
            oa_percentage = 0.0
        
        if oa_percentage > 0:
            oa_count += 1
        else:
            non_oa_count += 1
    
    return oa_count, non_oa_count

# Method 2: Only journals with "% of Citable OA" = 100% are OA
def method2(journals):
    oa_count = 0
    non_oa_count = 0
    
    for journal in journals:
        oa_percentage = journal.get("% of Citable OA", "0%")
        try:
            oa_percentage = float(oa_percentage.strip('%'))
        except ValueError:
            oa_percentage = 0.0
        
        if oa_percentage == 100:
            oa_count += 1
        else:
            non_oa_count += 1
    
    return oa_count, non_oa_count

# Method 3: Journals in the DOAJ dataset are OA
def method3(journals):
    oa_count = 0
    non_oa_count = 0
    
    for journal in journals:
        journal_title = journal.get("issn", "").lower()
        
        if journal_title in doaj_titles:
            oa_count += 1
        else:
            non_oa_count += 1
    
    return oa_count, non_oa_count

# Run all methods and print results
print("\n--- Method 1: Any journal with OA percentage > 0% is OA ---")
oa_count1, non_oa_count1 = method1(journal_info)
print(f"OA journals: {oa_count1}")
print(f"Non-OA journals: {non_oa_count1}")
print(f"Total: {oa_count1 + non_oa_count1}")
print(f"OA percentage: {oa_count1 / (oa_count1 + non_oa_count1) * 100:.2f}%")

print("\n--- Method 2: Only journals with 100% OA are OA ---")
oa_count2, non_oa_count2 = method2(journal_info)
print(f"OA journals: {oa_count2}")
print(f"Non-OA journals: {non_oa_count2}")
print(f"Total: {oa_count2 + non_oa_count2}")
print(f"OA percentage: {oa_count2 / (oa_count2 + non_oa_count2) * 100:.2f}%")

print("\n--- Method 3: Journals in the DOAJ dataset are OA ---")
oa_count3, non_oa_count3 = method3(journal_info)
print(f"OA journals: {oa_count3}")
print(f"Non-OA journals: {non_oa_count3}")
print(f"Total: {oa_count3 + non_oa_count3}")
print(f"OA percentage: {oa_count3 / (oa_count3 + non_oa_count3) * 100:.2f}%")

Loading journal info data...
Loading DOAJ data...

--- Method 1: Any journal with OA percentage > 0% is OA ---
OA journals: 4888
Non-OA journals: 226
Total: 5114
OA percentage: 95.58%

--- Method 2: Only journals with 100% OA are OA ---
OA journals: 252
Non-OA journals: 4862
Total: 5114
OA percentage: 4.93%

--- Method 3: Journals in the DOAJ dataset are OA ---
OA journals: 417
Non-OA journals: 4697
Total: 5114
OA percentage: 8.15%
