In [2]:
%pip install pandas matplotlib

/Users/elvish/Documents/PersonalCODING/indiaml-tracker/eda/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import matplotlib
import sqlite3
import numpy as np

import pandas as pd
import plotly.express as px
import pycountry

In [2]:
conn = sqlite3.Connection("venues-iclr-2025-v2.db")

In [3]:
def get_df_by_conference(conf):
    df = pd.read_sql_query(f"SELECT author_id, full_name, openreview_id, affiliation_country FROM (SELECT * FROM paper_authors JOIN authors on author_id = authors.id JOIN papers on paper_id = papers.id JOIN venue_infos on papers.venue_info_id = venue_infos.id) where conference = '{conf}' and affiliation_country != 'UNK'", conn)
    return df


# Function to convert country names to ISO Alpha-3 codes
def country_to_iso(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except:
        return None
    
    
def plt_author_df(country_counts, title, threshold = 6000):

    # # Count authors per country
    # country_counts = df['affiliation_country'].value_counts().reset_index()
    country_counts.columns = ['country', 'author_count']


    # Add ISO codes to the DataFrame
    country_counts['iso_code'] = country_counts['country'].apply(country_to_iso)

    # Drop rows with missing ISO codes (optional: investigate these manually)
    country_counts.dropna(subset=['iso_code'], inplace=True)


    max_count = country_counts['author_count'].max()  # e.g., 6700

    # Custom transformation: log below threshold, linear above
    def transform_count(x):
        if x <= threshold:
            return np.log10(x + 1)  # Use log10 for interpretability
        else:
            return np.log10(threshold + 1) + (x - threshold) / (max_count - threshold)

    # Apply the transformation and normalize to [0, 1]
    country_counts['transformed'] = country_counts['author_count'].apply(transform_count)
    t_min = np.log10(1)  # Minimum transformed value (log10(0+1)=0)
    t_max = np.log10(threshold + 1) + 1  # Max transformed value (log10(6001) + 1)
    country_counts['normalized'] = (country_counts['transformed'] - t_min) / (t_max - t_min)

    # Define key ticks for the color bar (log below 6000, linear above)
    tick_values = [1, 10, 100, 1000, threshold, max_count]
    tick_transformed = [transform_count(v) for v in tick_values]
    tick_normalized = [(t - t_min) / (t_max - t_min) for t in tick_transformed]


    # Create the choropleth map
    fig = px.choropleth(
        country_counts,
        locations="iso_code",
        color="normalized",
        hover_name="country",
        hover_data={"author_count": True, "normalized": False},  # Hide normalized values in hover
        color_continuous_scale=px.colors.sequential.Mint,
        title=title,
        labels={'author_count': 'Authors'}
    )

    fig.update_coloraxes(
        colorbar=dict(
            title="Number of Authors",
            tickvals=tick_normalized,
            ticktext=[str(v) for v in tick_values]
        )
    )

    # Customize layout
    fig.update_layout(
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='natural earth'
        ),
        margin={"r":0, "t":40, "l":0, "b":0}
    )
    return fig


## India at ICLR 2025 Conference

In [None]:
df = get_df_by_conference('ICLR')
country_counts = df['affiliation_country'].value_counts().reset_index()


plt_author_df(country_counts, "ICLR 2025 Conference, Number of Authors by Country", threshold=6000)

In [5]:
def get_paper_counts_by_conference(conf):
    query = f"""
        SELECT affiliation_country, COUNT(DISTINCT paper_id) as paper_count 
        FROM (
            SELECT * 
            FROM paper_authors 
            JOIN authors ON author_id = authors.id 
            JOIN papers ON paper_id = papers.id 
            JOIN venue_infos ON papers.venue_info_id = venue_infos.id
        ) 
        WHERE conference = '{conf}' AND affiliation_country != 'UNK'
        GROUP BY affiliation_country
    """
    df = pd.read_sql_query(query, conn)
    return df

def plt_paper_df(paper_counts, title, threshold=1000):
    # Rename columns for consistency
    paper_counts = paper_counts.rename(columns={'affiliation_country': 'country', 'paper_count': 'count'})
    
    # Convert country names to ISO codes
    paper_counts['iso_code'] = paper_counts['country'].apply(country_to_iso)
    paper_counts.dropna(subset=['iso_code'], inplace=True)
    
    # Apply log-linear transformation
    max_count = paper_counts['count'].max()
    def transform_count(x):
        if x <= threshold:
            return np.log10(x + 1)
        else:
            return np.log10(threshold + 1) + (x - threshold) / (max_count - threshold)
    
    paper_counts['transformed'] = paper_counts['count'].apply(transform_count)
    t_min = np.log10(1)
    t_max = np.log10(threshold + 1) + 1
    paper_counts['normalized'] = (paper_counts['transformed'] - t_min) / (t_max - t_min)
    
    # Generate ticks for color bar
    tick_values = [1, 10, 100, threshold, max_count]
    valid_ticks = [v for v in tick_values if v <= max_count]
    if max_count not in valid_ticks:
        valid_ticks.append(max_count)
    valid_ticks = sorted(valid_ticks)
    
    tick_transformed = [transform_count(v) for v in valid_ticks]
    tick_normalized = [(t - t_min) / (t_max - t_min) for t in tick_transformed]
    
    # Create choropleth map
    fig = px.choropleth(
        paper_counts,
        locations="iso_code",
        color="normalized",
        hover_name="country",
        hover_data={"count": True, "normalized": False},
        color_continuous_scale=px.colors.sequential.Mint,
        title=title,
        labels={'count': 'Papers'}
    )
    
    fig.update_coloraxes(
        colorbar=dict(
            title="Number of Papers",
            tickvals=tick_normalized,
            ticktext=[str(v) for v in valid_ticks]
        )
    )
    
    fig.update_layout(
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='natural earth'
        ),
        margin={"r":0, "t":40, "l":0, "b":0}
    )
    return fig

In [13]:
conf = 'ICLR'
paper_counts_df = get_paper_counts_by_conference(conf)

paper_counts_df.to_json("neurips-2024-cc-papers.json", orient="records")

# Generate and show the plot
fig = plt_paper_df(paper_counts_df, f'Paper Distribution by Country at {conf} 2025', threshold=1000)
fig.show()

In [14]:
squeel = """SELECT title, paper_id, pdf_url, full_name, position, affiliation_name, affiliation_domain, affiliation_country FROM ( SELECT * 
FROM paper_authors 
JOIN authors ON author_id = authors.id 
JOIN papers ON paper_id = papers.id 
JOIN venue_infos ON papers.venue_info_id = venue_infos.id)

WHERE affiliation_country = 'IN'
and conference = 'NeurIPS'
and year = '2024'
and track = 'Conference';
"""


ddf = pd.read_sql_query(squeel, conn)
top_indian_authors = ddf[(ddf['position'] == 0) & (ddf['affiliation_country'] == 'IN')]

papers_with_top_indian_authors = top_indian_authors[['title', 'paper_id', 'pdf_url', 'full_name']]
papers_with_top_indian_authors.loc[:, "pdf_url"] = "https://openreview.net" + papers_with_top_indian_authors["pdf_url"]

In [15]:
def process_papers(ddf, cc='IN'):
    filtered_papers = []
    
    # Group by paper_id to process each paper individually
    grouped = ddf.groupby('paper_id')
    
    for paper_id, group in grouped:
        paper_title = group['title'].iloc[0]
        pdf_url = group['pdf_url'].iloc[0]
        paper_id = group['paper_id'].iloc[0]
        authors = group[['full_name', 'openreview_id', 'affiliation_name', 'affiliation_domain', 'affiliation_country', 'position']].drop_duplicates()
        
        # Filter authors from India
        indian_authors = authors[authors['affiliation_country'] == cc]
        
        if not indian_authors.empty:
            # Create author list
            author_list = authors.apply(lambda row: {
                'name': row['full_name'],
                'openreview_id': row['openreview_id'],
                'affiliation_name': row['affiliation_name'],
                'affiliation_domain': row['affiliation_domain'],
                'affiliation_country': row['affiliation_country']
            }, axis=1).tolist()
            
            # Check if top author is from India
            top_author_from_india = (authors.sort_values(by='position').iloc[0]['affiliation_country'] == cc)
            
            # Check if majority authors are from India
            majority_authors_from_india = len(indian_authors) / len(authors) >= 0.5
            
            filtered_papers.append({
                'paper_title': paper_title,
                'paper_id': paper_id,
                'pdf_url': pdf_url,
                'author_list': author_list,
                'top_author_from_india': top_author_from_india,
                'majority_authors_from_india': majority_authors_from_india
            })
    
    return pd.DataFrame(filtered_papers)

In [10]:
squeel = """SELECT title, paper_id, pdf_url, full_name, openreview_id, position, affiliation_name, affiliation_domain, affiliation_country FROM ( SELECT * 
FROM paper_authors 
JOIN authors ON author_id = authors.id 
JOIN papers ON paper_id = papers.id 
JOIN venue_infos ON papers.venue_info_id = venue_infos.id)

WHERE conference = 'NeurIPS'
and year = '2024'
and track = 'Conference';
"""

ddf = pd.read_sql_query(squeel, conn)

neurips_df = process_papers(ddf)

neurips_df["pdf_url"] = "http://openreview.net"+ neurips_df["pdf_url"]

neurips_df.to_json("iclr-2025-india.json", orient="records")

neurips_df

KeyError: 'pdf_url'