In [1]:
import sqlite3
import os
from collections import defaultdict

# --- Configuration ---
# Path of the SQLite database analyzed by this cell; it is the value passed to
# analyze_affiliations() in the main-execution section of the cell.
DATABASE_FILE = 'venues-iclr-2025-v2.db' # Replace with your actual database file name

# --- Helper Functions ---
def calculate_percentage(count, total):
    """Return ``count`` expressed as a percentage of ``total``.

    A ``total`` of zero yields ``0.0`` rather than raising ZeroDivisionError.
    """
    return (count / total) * 100 if total != 0 else 0.0

def format_output(label, absolute, percentage):
    """Build one report line of the form ``"<label>: <absolute> (<pct>%)"``.

    The percentage is rendered with exactly two decimal places.
    """
    pct_text = f"{percentage:.2f}%"
    return f"{label}: {absolute} ({pct_text})"

def is_affiliated(country):
    """
    Decide whether an affiliation-country value counts as a real affiliation.

    A value does NOT count when it is None, empty or whitespace-only, or the
    placeholder 'UNK' in any letter case (surrounding whitespace is ignored).

    Args:
        country (str or None): The affiliation country value from the database.
    Returns:
        bool: True if the value names a country, False otherwise.
    """
    if country is None:
        return False
    trimmed = country.strip()
    # An empty remainder means the field was blank; 'UNK' is the "unknown" marker.
    return bool(trimmed) and trimmed.upper() != 'UNK'

# --- Main Logic ---
def analyze_affiliations(db_path):
    """
    Compute affiliation statistics for the accepted papers in a SQLite database.

    Every (paper, author, affiliation_country) row of ``paper_authors`` whose
    paper has status 'accepted' (compared case-insensitively) is classified
    with ``is_affiliated`` and folded into author-level and paper-level counts.

    Args:
        db_path (str): Path to the SQLite database file.

    Returns:
        dict or None: A dictionary with the keys ``total_accepted_papers``,
        ``total_authors_on_accepted``, ``authors_with_country``,
        ``authors_without_country``, ``papers_at_least_one_unaffiliated``,
        ``papers_all_unaffiliated``, ``papers_at_least_one_affiliated`` and
        ``papers_all_affiliated``. All counts stay zero when the query returns
        no rows. ``None`` is returned (after printing a message) when the file
        does not exist or an error occurs while reading it.
    """
    if not os.path.exists(db_path):
        print(f"Error: Database file not found at {db_path}")
        return None

    results = {
        "total_accepted_papers": 0,
        "total_authors_on_accepted": 0,
        "authors_with_country": 0,
        "authors_without_country": 0,
        "papers_at_least_one_unaffiliated": 0,
        "papers_all_unaffiliated": 0,
        "papers_at_least_one_affiliated": 0,
        "papers_all_affiliated": 0,
    }

    conn = None
    try:
        conn = sqlite3.connect(db_path)

        # One row per (accepted paper, author) pair; LOWER() makes the status
        # comparison case-insensitive.
        query = """
        SELECT
            pa.paper_id,
            pa.author_id,
            pa.affiliation_country
        FROM paper_authors AS pa
        JOIN papers AS p ON pa.paper_id = p.id
        WHERE LOWER(p.status) = 'accepted';
        """
        rows = conn.execute(query).fetchall()

        if not rows:
            print("No accepted papers found in the database.")
            # Nothing to aggregate: hand back the all-zero counters.
            return results

        every_author = set()
        # Authors seen at least once with a valid country on any accepted paper.
        located_authors = set()
        # paper_id -> one boolean per author row (True = valid country).
        flags_by_paper = defaultdict(list)

        for paper_id, author_id, country in rows:
            has_country = is_affiliated(country)
            flags_by_paper[paper_id].append(has_country)
            every_author.add(author_id)
            if has_country:
                located_authors.add(author_id)

        # --- Author statistics ---
        results["total_authors_on_accepted"] = len(every_author)
        results["authors_with_country"] = len(located_authors)
        # "Without country" = never appeared with a valid country anywhere.
        results["authors_without_country"] = len(every_author) - len(located_authors)

        # --- Paper statistics ---
        results["total_accepted_papers"] = len(flags_by_paper)

        # Each list holds at least one flag, because a paper only gets an
        # entry when one of its author rows is appended above.
        for flags in flags_by_paper.values():
            some_affiliated = any(flags)
            everyone_affiliated = all(flags)

            if not everyone_affiliated:
                results["papers_at_least_one_unaffiliated"] += 1
            if not some_affiliated:
                results["papers_all_unaffiliated"] += 1
            if some_affiliated:
                results["papers_at_least_one_affiliated"] += 1
            if everyone_affiliated:
                results["papers_all_affiliated"] += 1

        return results

    except sqlite3.Error as e:
        print(f"Database error: {e}")
        return None  # Signals failure to the caller
    except Exception as e:
        print(f"An error occurred: {e}")
        return None  # Signals failure to the caller
    finally:
        if conn is not None:
            conn.close()

# --- Main Execution ---
if __name__ == "__main__":
    # Run the analysis for the configured database and print a text report.
    stats = analyze_affiliations(DATABASE_FILE)

    if stats is None:
        # analyze_affiliations has already printed the specific problem.
        print("Analysis could not be completed due to errors.")
    else:
        total_authors = stats["total_authors_on_accepted"]
        total_papers = stats["total_accepted_papers"]

        print("--- Analysis Results (Based on Accepted Papers) ---")
        print(f"\nTotal Accepted Papers: {total_papers}")
        print(f"Total Unique Authors on Accepted Papers: {total_authors}")

        # Author-level lines: percentages are relative to the unique-author total.
        print("\n--- Author Affiliation (Country - excluding NULL, empty, 'UNK') ---")
        if total_authors > 0:
            for label, stat_key in (
                ("Authors with Affiliation Country (at least once)",
                 "authors_with_country"),
                ("Authors without Affiliation Country (only invalid/missing entries)",
                 "authors_without_country"),
            ):
                print(format_output(
                    label,
                    stats[stat_key],
                    calculate_percentage(stats[stat_key], total_authors)
                ))
        else:
            print("No authors found on accepted papers to analyze.")

        # Paper-level lines: percentages are relative to the accepted-paper total.
        print("\n--- Paper Affiliation Status (Country - excluding NULL, empty, 'UNK') ---")
        if total_papers > 0:
            for label, stat_key in (
                ("Papers with at least 1 Unaffiliated Author",
                 "papers_at_least_one_unaffiliated"),
                ("Papers with ALL Authors Unaffiliated",
                 "papers_all_unaffiliated"),
                ("Papers with at least 1 Affiliated Author",
                 "papers_at_least_one_affiliated"),
                ("Papers with ALL Authors Affiliated",
                 "papers_all_affiliated"),
            ):
                print(format_output(
                    label,
                    stats[stat_key],
                    calculate_percentage(stats[stat_key], total_papers)
                ))
        else:
            print("No accepted papers found to analyze.")

--- Analysis Results (Based on Accepted Papers) ---

Total Accepted Papers: 3705
Total Unique Authors on Accepted Papers: 15066

--- Author Affiliation (Country - excluding NULL, empty, 'UNK') ---
Authors with Affiliation Country (at least once): 14908 (98.95%)
Authors without Affiliation Country (only invalid/missing entries): 158 (1.05%)

--- Paper Affiliation Status (Country - excluding NULL, empty, 'UNK') ---
Papers with at least 1 Unaffiliated Author: 135 (3.64%)
Papers with ALL Authors Unaffiliated: 2 (0.05%)
Papers with at least 1 Affiliated Author: 3703 (99.95%)
Papers with ALL Authors Affiliated: 3570 (96.36%)


In [17]:
import sqlite3
import os
from collections import defaultdict

# --- Configuration ---
# Path of the SQLite database analyzed by this cell; it is the value passed to
# analyze_affiliations() in the main-execution section of the cell.
DATABASE_FILE = 'venues-neurips-2024-v2.db' # Replace with your actual database file name

# --- Helper Functions ---
def calculate_percentage(count, total):
    """Return ``count`` as a percentage of ``total`` (``0.0`` when ``total`` is zero)."""
    if total != 0:
        ratio = count / total
        return ratio * 100
    # Avoid ZeroDivisionError for an empty population.
    return 0.0

def format_output(label, absolute, percentage):
    """Return ``"<label>: <absolute> (<percentage>%)"`` with a two-decimal percentage."""
    figures = f"{absolute} ({percentage:.2f}%)"
    return f"{label}: {figures}"

def is_affiliated(country):
    """
    Tell whether an affiliation-country value represents a usable country.

    None, empty/whitespace-only text and the placeholder 'UNK' (any letter
    case, surrounding whitespace ignored) are all treated as "no affiliation".

    Args:
        country (str or None): The affiliation country value from the database.
    Returns:
        bool: True if the country is considered valid, False otherwise.
    """
    if country is None:
        return False
    cleaned = country.strip()
    if cleaned:
        # Non-blank text counts unless it is the "unknown" marker.
        return cleaned.upper() != 'UNK'
    return False

# --- Main Logic ---
def analyze_affiliations(db_path):
    """
    Compute affiliation statistics for accepted papers, including the IDs of
    papers on which no author has a valid affiliation country.

    Every (paper, author, affiliation_country) row of ``paper_authors`` whose
    paper has status 'accepted' (compared case-insensitively) is classified
    with ``is_affiliated`` and folded into author-level and paper-level counts.

    Args:
        db_path (str): Path to the SQLite database file.

    Returns:
        dict or None: A dictionary with the count keys
        ``total_accepted_papers``, ``total_authors_on_accepted``,
        ``authors_with_country``, ``authors_without_country``,
        ``papers_at_least_one_unaffiliated``, ``papers_all_unaffiliated``,
        ``papers_at_least_one_affiliated``, ``papers_all_affiliated`` and the
        sorted list ``paper_ids_all_unaffiliated``. Counts stay zero and the
        list stays empty when the query returns no rows. ``None`` is returned
        (after printing a message) when the file does not exist or an error
        occurs while reading it.
    """
    if not os.path.exists(db_path):
        print(f"Error: Database file not found at {db_path}")
        return None

    results = {
        "total_accepted_papers": 0,
        "total_authors_on_accepted": 0,
        "authors_with_country": 0,
        "authors_without_country": 0,
        "papers_at_least_one_unaffiliated": 0,
        "papers_all_unaffiliated": 0,
        "papers_at_least_one_affiliated": 0,
        "papers_all_affiliated": 0,
        "paper_ids_all_unaffiliated": [],
    }

    conn = None
    try:
        conn = sqlite3.connect(db_path)

        # One row per (accepted paper, author) pair; LOWER() makes the status
        # comparison case-insensitive. The title is selected as well, but no
        # statistic below uses it.
        query = """
        SELECT
            pa.paper_id,
            p.title,
            pa.author_id,
            pa.affiliation_country
        FROM paper_authors AS pa
        JOIN papers AS p ON pa.paper_id = p.id
        WHERE LOWER(p.status) = 'accepted';
        """
        rows = conn.execute(query).fetchall()

        if not rows:
            print("No accepted papers found in the database.")
            return results  # All-zero counters and an empty ID list

        every_author = set()
        # Authors seen at least once with a valid country on any accepted paper.
        located_authors = set()
        # paper_id -> one boolean per author row (True = valid country).
        flags_by_paper = defaultdict(list)

        # To report titles too, map paper_id -> title in this loop and add a
        # details list to ``results``; only IDs are returned at present.
        for paper_id, _title, author_id, country in rows:
            has_country = is_affiliated(country)
            flags_by_paper[paper_id].append(has_country)
            every_author.add(author_id)
            if has_country:
                located_authors.add(author_id)

        # --- Author statistics ---
        results["total_authors_on_accepted"] = len(every_author)
        results["authors_with_country"] = len(located_authors)
        results["authors_without_country"] = len(every_author) - len(located_authors)

        # --- Paper statistics ---
        results["total_accepted_papers"] = len(flags_by_paper)

        # Each list holds at least one flag, because a paper only gets an
        # entry when one of its author rows is appended above.
        fully_unaffiliated_ids = []
        for paper_id, flags in flags_by_paper.items():
            some_affiliated = any(flags)
            everyone_affiliated = all(flags)

            if not everyone_affiliated:
                results["papers_at_least_one_unaffiliated"] += 1
            if some_affiliated:
                results["papers_at_least_one_affiliated"] += 1
            else:
                results["papers_all_unaffiliated"] += 1
                fully_unaffiliated_ids.append(paper_id)
            if everyone_affiliated:
                results["papers_all_affiliated"] += 1

        # Sorted so the printed list is stable from run to run.
        results["paper_ids_all_unaffiliated"] = sorted(fully_unaffiliated_ids)

        return results

    except sqlite3.Error as e:
        print(f"Database error: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    finally:
        if conn is not None:
            conn.close()

# --- Main Execution ---
if __name__ == "__main__":
    # Run the analysis for the configured database and print a text report.
    # (In a notebook the guard is optional; the call can also be made directly.)
    stats = analyze_affiliations(DATABASE_FILE)

    if stats is None:
        # analyze_affiliations has already printed the specific problem.
        print("Analysis could not be completed due to errors.")
    else:
        total_authors = stats["total_authors_on_accepted"]
        total_papers = stats["total_accepted_papers"]

        print("--- Analysis Results (Based on Accepted Papers) ---")
        print(f"\nTotal Accepted Papers: {total_papers}")
        print(f"Total Unique Authors on Accepted Papers: {total_authors}")

        # Author-level lines: percentages are relative to the unique-author total.
        print("\n--- Author Affiliation (Country - excluding NULL, empty, 'UNK') ---")
        if total_authors > 0:
            for label, stat_key in (
                ("Authors with Affiliation Country (at least once)",
                 "authors_with_country"),
                ("Authors without Affiliation Country (only invalid/missing entries)",
                 "authors_without_country"),
            ):
                print(format_output(
                    label,
                    stats[stat_key],
                    calculate_percentage(stats[stat_key], total_authors)
                ))
        else:
            print("No authors found on accepted papers to analyze.")

        # Paper-level lines: percentages are relative to the accepted-paper total.
        print("\n--- Paper Affiliation Status (Country - excluding NULL, empty, 'UNK') ---")
        if total_papers > 0:
            for label, stat_key in (
                ("Papers with at least 1 Unaffiliated Author",
                 "papers_at_least_one_unaffiliated"),
                ("Papers with ALL Authors Unaffiliated",
                 "papers_all_unaffiliated"),
                ("Papers with at least 1 Affiliated Author",
                 "papers_at_least_one_affiliated"),
                ("Papers with ALL Authors Affiliated",
                 "papers_all_affiliated"),
            ):
                print(format_output(
                    label,
                    stats[stat_key],
                    calculate_percentage(stats[stat_key], total_papers)
                ))
        else:
            print("No accepted papers found to analyze.")

        # List the papers on which no author has a valid affiliation country.
        # Only IDs are printed; showing titles would require
        # analyze_affiliations to return them as well.
        print("\n--- Papers with ALL Authors Unaffiliated ---")
        all_unaffiliated_ids = stats["paper_ids_all_unaffiliated"]
        if not all_unaffiliated_ids:
            print("No papers found where all authors are unaffiliated.")
        else:
            print(f"Found {len(all_unaffiliated_ids)} paper(s):")
            for paper_id in all_unaffiliated_ids:
                print(f"  - ID: {paper_id}")

--- Analysis Results (Based on Accepted Papers) ---

Total Accepted Papers: 4035
Total Unique Authors on Accepted Papers: 14821

--- Author Affiliation (Country - excluding NULL, empty, 'UNK') ---
Authors with Affiliation Country (at least once): 14745 (99.49%)
Authors without Affiliation Country (only invalid/missing entries): 76 (0.51%)

--- Paper Affiliation Status (Country - excluding NULL, empty, 'UNK') ---
Papers with at least 1 Unaffiliated Author: 84 (2.08%)
Papers with ALL Authors Unaffiliated: 1 (0.02%)
Papers with at least 1 Affiliated Author: 4034 (99.98%)
Papers with ALL Authors Affiliated: 3951 (97.92%)

--- Papers with ALL Authors Unaffiliated ---
Found 1 paper(s):
  - ID: 0VeSCjRDBy


--- Affiliation Analysis Results by Venue ---


=== Venue: ICLR 2024 - Track: Conference ===

Total Accepted Papers: 2260
Total Unique Authors on Accepted Papers: 8697

--- Author Affiliation (Country - excluding NULL, empty, 'UNK') ---
Authors with Affiliation Country (at least once): 8151 (93.72%)
Authors without Affiliation Country (only invalid/missing entries): 546 (6.28%)

--- Paper Affiliation Status (Country - excluding NULL, empty, 'UNK') ---
Papers with at least 1 Unaffiliated Author: 441 (19.51%)
Papers with ALL Authors Unaffiliated: 10 (0.44%)
Papers with at least 1 Affiliated Author: 2250 (99.56%)
Papers with ALL Authors Affiliated: 1819 (80.49%)

--- Papers with ALL Authors Unaffiliated ---
Found 10 paper(s):
  IDs: AgM3MzT99c, GIUjLsDP4Z, PoDkdFQIu3, Sy8upuD6Bw, aN4Jf6Cx69, lrQlLqQase, oDdzXQzP2F, p34fRKp8qA, vY9nzQmQBw, ym0ubZrsmm

--- End of Analysis ---
