In [2]:
%pip install pandas matplotlib

/Users/elvish/Documents/PersonalCODING/indiaml-tracker/eda/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import matplotlib
import sqlite3
import numpy as np

import pandas as pd
import plotly.express as px
import pycountry

In [3]:
conn = sqlite3.Connection("venues-final.db")

In [4]:
def get_df_by_conference(conf):
    df = pd.read_sql_query(f"SELECT author_id, full_name, openreview_id, affiliation_country FROM (SELECT * FROM paper_authors JOIN authors on author_id = authors.id JOIN papers on paper_id = papers.id JOIN venue_infos on papers.venue_info_id = venue_infos.id) where conference = '{conf}' and affiliation_country != 'UNK'", conn)
    return df


# Function to convert country names to ISO Alpha-3 codes
def country_to_iso(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except:
        return None
    
    
def plt_author_df(country_counts, title, threshold = 6000):

    # # Count authors per country
    # country_counts = df['affiliation_country'].value_counts().reset_index()
    country_counts.columns = ['country', 'author_count']


    # Add ISO codes to the DataFrame
    country_counts['iso_code'] = country_counts['country'].apply(country_to_iso)

    # Drop rows with missing ISO codes (optional: investigate these manually)
    country_counts.dropna(subset=['iso_code'], inplace=True)


    max_count = country_counts['author_count'].max()  # e.g., 6700

    # Custom transformation: log below threshold, linear above
    def transform_count(x):
        if x <= threshold:
            return np.log10(x + 1)  # Use log10 for interpretability
        else:
            return np.log10(threshold + 1) + (x - threshold) / (max_count - threshold)

    # Apply the transformation and normalize to [0, 1]
    country_counts['transformed'] = country_counts['author_count'].apply(transform_count)
    t_min = np.log10(1)  # Minimum transformed value (log10(0+1)=0)
    t_max = np.log10(threshold + 1) + 1  # Max transformed value (log10(6001) + 1)
    country_counts['normalized'] = (country_counts['transformed'] - t_min) / (t_max - t_min)

    # Define key ticks for the color bar (log below 6000, linear above)
    tick_values = [1, 10, 100, 1000, threshold, max_count]
    tick_transformed = [transform_count(v) for v in tick_values]
    tick_normalized = [(t - t_min) / (t_max - t_min) for t in tick_transformed]


    # Create the choropleth map
    fig = px.choropleth(
        country_counts,
        locations="iso_code",
        color="normalized",
        hover_name="country",
        hover_data={"author_count": True, "normalized": False},  # Hide normalized values in hover
        color_continuous_scale=px.colors.sequential.Mint,
        title=title,
        labels={'author_count': 'Authors'}
    )

    fig.update_coloraxes(
        colorbar=dict(
            title="Number of Authors",
            tickvals=tick_normalized,
            ticktext=[str(v) for v in tick_values]
        )
    )

    # Customize layout
    fig.update_layout(
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='natural earth'
        ),
        margin={"r":0, "t":40, "l":0, "b":0}
    )
    return fig


## India at NeurIPS 2024 Conference

In [5]:
df = get_df_by_conference('NeurIPS')
country_counts = df['affiliation_country'].value_counts().reset_index()


plt_author_df(country_counts, "NeurIPS 2024 Conference, Number of Authors by Country", threshold=6000)

In [None]:
def get_paper_counts_by_conference(conf):
    query = f"""
        SELECT affiliation_country, COUNT(DISTINCT paper_id) as paper_count 
        FROM (
            SELECT * 
            FROM paper_authors 
            JOIN authors ON author_id = authors.id 
            JOIN papers ON paper_id = papers.id 
            JOIN venue_infos ON papers.venue_info_id = venue_infos.id
        ) 
        WHERE conference = '{conf}' AND affiliation_country != 'UNK'
        GROUP BY affiliation_country
    """
    df = pd.read_sql_query(query, conn)
    return df

def plt_paper_df(paper_counts, title, threshold=1000):
    # Rename columns for consistency
    paper_counts = paper_counts.rename(columns={'affiliation_country': 'country', 'paper_count': 'count'})
    
    # Convert country names to ISO codes
    paper_counts['iso_code'] = paper_counts['country'].apply(country_to_iso)
    paper_counts.dropna(subset=['iso_code'], inplace=True)
    
    # Apply log-linear transformation
    max_count = paper_counts['count'].max()
    def transform_count(x):
        if x <= threshold:
            return np.log10(x + 1)
        else:
            return np.log10(threshold + 1) + (x - threshold) / (max_count - threshold)
    
    paper_counts['transformed'] = paper_counts['count'].apply(transform_count)
    t_min = np.log10(1)
    t_max = np.log10(threshold + 1) + 1
    paper_counts['normalized'] = (paper_counts['transformed'] - t_min) / (t_max - t_min)
    
    # Generate ticks for color bar
    tick_values = [1, 10, 100, threshold, max_count]
    valid_ticks = [v for v in tick_values if v <= max_count]
    if max_count not in valid_ticks:
        valid_ticks.append(max_count)
    valid_ticks = sorted(valid_ticks)
    
    tick_transformed = [transform_count(v) for v in valid_ticks]
    tick_normalized = [(t - t_min) / (t_max - t_min) for t in tick_transformed]
    
    # Create choropleth map
    fig = px.choropleth(
        paper_counts,
        locations="iso_code",
        color="normalized",
        hover_name="country",
        hover_data={"count": True, "normalized": False},
        color_continuous_scale=px.colors.sequential.Mint,
        title=title,
        labels={'count': 'Papers'}
    )
    
    fig.update_coloraxes(
        colorbar=dict(
            title="Number of Papers",
            tickvals=tick_normalized,
            ticktext=[str(v) for v in valid_ticks]
        )
    )
    
    fig.update_layout(
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='natural earth'
        ),
        margin={"r":0, "t":40, "l":0, "b":0}
    )
    return fig

In [None]:
conf = 'NeurIPS'
paper_counts_df = get_paper_counts_by_conference(conf)

# Generate and show the plot
fig = plt_paper_df(paper_counts_df, f'Paper Distribution by Country at {conf}', threshold=1000)
fig.show()

In [None]:
squeel = """SELECT title, paper_id, pdf_url, full_name, position, affiliation_name, affiliation_domain, affiliation_country FROM ( SELECT * 
FROM paper_authors 
JOIN authors ON author_id = authors.id 
JOIN papers ON paper_id = papers.id 
JOIN venue_infos ON papers.venue_info_id = venue_infos.id)

WHERE affiliation_country = 'IN'
and conference = 'NeurIPS'
and year = '2024'
and track = 'Conference';
"""


ddf = pd.read_sql_query(squeel, conn)
top_indian_authors = ddf[(ddf['position'] == 0) & (ddf['affiliation_country'] == 'IN')]

papers_with_top_indian_authors = top_indian_authors[['title', 'paper_id', 'pdf_url', 'full_name']]
papers_with_top_indian_authors["pdf_url"] = "https://openreview.net" + papers_with_top_indian_authors["pdf_url"]
papers_with_top_indian_authors



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,title,paper_id,pdf_url,full_name
3,Iteratively Refined Early Interaction Alignmen...,udTwwF7tks,https://openreview.net/pdf/4ae7afbdb746c97e2bf...,Ashwin Ramachandran
12,DisCEdit: Model Editing by Identifying Discrim...,tuiqq1G8I5,https://openreview.net/pdf/c7df4a98dbe1cbf0192...,Chiranjib Bhattacharyya
14,Time-Reversal Provides Unsupervised Feedback t...,nY0BrZdqLt,https://openreview.net/pdf/08ee7a3ea3b3fd8e7bf...,Yerram Varun
18,UGC: Universal Graph Coarsening,nN6NSd1Qds,https://openreview.net/pdf/7835e160109c7dd1023...,Mohit Kataria
21,GraphTrail: Translating GNN Predictions into H...,fzlMza6dRZ,https://openreview.net/pdf/476e9d9dcd1fe989b9e...,Burouj Armgaan
26,Optimal Top-Two Method for Best Arm Identifica...,YXQW4qQe2U,https://openreview.net/pdf/4c086ee463f48a315e6...,Agniv Bandyopadhyay
32,Optimal Algorithms for Online Convex Optimizat...,TxffvJMnBy,https://openreview.net/pdf/269618651cc4c3886e1...,Abhishek Sinha
38,FUGAL: Feature-fortified Unrestricted Graph Al...,SdLOs1FR4h,https://openreview.net/pdf/17aef8ff1d289ba15e7...,Aditya Bommakanti
41,Accuracy is Not All You Need,QVG7j29Sta,https://openreview.net/pdf/7ecf82af167db2fa1a4...,Abhinav Dutta
46,Mitigating Biases in Blackbox Feature Extracto...,HwO1mNluoL,https://openreview.net/pdf/3ccdc8010ed3b8d6389...,Abhipsa Basu


In [None]:
def process_papers(ddf, cc='IN'):
    filtered_papers = []
    
    # Group by paper_id to process each paper individually
    grouped = ddf.groupby('paper_id')
    
    for paper_id, group in grouped:
        paper_title = group['title'].iloc[0]
        pdf_url = group['pdf_url'].iloc[0]
        authors = group[['full_name', 'affiliation_country', 'position']].drop_duplicates()
        
        # Filter authors from India
        indian_authors = authors[authors['affiliation_country'] == cc]
        
        if not indian_authors.empty:
            # Create author list
            author_list = authors['full_name'].tolist()
            
            # Check if top author is from India
            top_author_from_india = (authors.sort_values(by='position').iloc[0]['affiliation_country'] == cc)
            
            # Check if majority authors are from India
            majority_authors_from_india = len(indian_authors) / len(authors) >= 0.5
            
            filtered_papers.append({
                'paper_title': paper_title,
                'pdf_url': pdf_url,
                'author_list': author_list,
                'top_author_from_india': top_author_from_india,
                'majority_authors_from_india': majority_authors_from_india
            })
    
    return pd.DataFrame(filtered_papers)

In [None]:
squeel = """SELECT title, paper_id, pdf_url, full_name, position, affiliation_name, affiliation_domain, affiliation_country FROM ( SELECT * 
FROM paper_authors 
JOIN authors ON author_id = authors.id 
JOIN papers ON paper_id = papers.id 
JOIN venue_infos ON papers.venue_info_id = venue_infos.id)

WHERE conference = 'NeurIPS'
and year = '2024'
and track = 'Conference';
"""

ddf = pd.read_sql_query(squeel, conn)


kdf = process_papers(ddf)

kdf["pdf_url"] = "http://openreview.net"+ kdf["pdf_url"]

kdf

Unnamed: 0,paper_title,pdf_url,author_list,top_author_from_india,majority_authors_from_india
0,Sample-Efficient Constrained Reinforcement Lea...,http://openreview.net/pdf/b178b4957c9f40578fc6...,"[Washim Uddin Mondal, Vaneet Aggarwal]",True,True
1,Tiny Time Mixers (TTMs): Fast Pre-trained Mode...,http://openreview.net/pdf/c1a7cea36450273599d6...,"[Vijay Ekambaram, Arindam Jati, , , , , , ]",False,False
2,Learning General Parameterized Policies for In...,http://openreview.net/pdf/500b125b905014d27b7d...,"[Qinbo Bai, Washim Uddin Mondal, Vaneet Aggarwal]",False,False
3,Text2CAD: Generating Sequential CAD Designs fr...,http://openreview.net/pdf/9f3142a99be0b0075f8f...,"[Mohammad Sadil Khan, Sankalp Sinha, Sheikh Ta...",False,False
4,COLD: Causal reasOning in cLosed Daily activities,http://openreview.net/pdf/015330ba1dcf5481f996...,"[Abhinav Joshi, Areeb Ahmad, Ashutosh Modi]",True,True
5,Near-Optimal Streaming Heavy-Tailed Statistica...,http://openreview.net/pdf/492b971c4dfd9caecfce...,"[Aniket Das, Dheeraj Mysore Nagaraj, Soumyabra...",False,False
6,Generalized Linear Bandits with Limited Adapti...,http://openreview.net/pdf/e2e8732933515749d301...,"[Ayush Sawarni, Nirjhar Das, , Gaurav Sinha]",False,True
7,Mixture of Nested Experts: Adaptive Processing...,http://openreview.net/pdf/34a8b1b90acda923db44...,"[Gagan Jain, Nidhi Hegde, Aditya Kusupati, Ars...",True,False
8,Mitigating Biases in Blackbox Feature Extracto...,http://openreview.net/pdf/3ccdc8010ed3b8d63897...,"[Abhipsa Basu, Venkatesh Babu Radhakrishnan]",True,True
9,Quantifying the Gain in Weak-to-Strong General...,http://openreview.net/pdf/36318fbff6dff7b5a38a...,"[Moses Charikar, , Kirankumar Shiragur]",False,False


## India at ICML 2024 Conference

In [6]:
df = get_df_by_conference('ICML')
country_counts = df['affiliation_country'].value_counts().reset_index()


plt_author_df(country_counts, "ICML 2024 Conference, Number of Authors by Country", threshold=2900)

In [None]:
conf = 'ICML'
paper_counts_df = get_paper_counts_by_conference(conf)

# Generate and show the plot
fig = plt_paper_df(paper_counts_df, f'Paper Distribution by Country at {conf}', threshold=1000)
fig.show()

In [None]:
squeel = """SELECT title, paper_id, pdf_url, full_name, position, affiliation_name, affiliation_domain, affiliation_country FROM ( SELECT * 
FROM paper_authors 
JOIN authors ON author_id = authors.id 
JOIN papers ON paper_id = papers.id 
JOIN venue_infos ON papers.venue_info_id = venue_infos.id)

WHERE affiliation_country = 'IN'
and conference = 'ICML'
and year = '2024'
and track = 'Conference';
"""


ddf = pd.read_sql_query(squeel, conn)
top_indian_authors = ddf[(ddf['position'] == 0) & (ddf['affiliation_country'] == 'IN')]

papers_with_top_indian_authors = top_indian_authors[['title', 'paper_id', 'pdf_url', 'full_name']]
papers_with_top_indian_authors["pdf_url"] = "https://openreview.net" + papers_with_top_indian_authors["pdf_url"]
papers_with_top_indian_authors



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,title,paper_id,pdf_url,full_name
0,Provably Robust DPO: Aligning Language Models ...,yhpDKSw7yA,https://openreview.net/pdf/3f1185c2cb457015bf0...,Sayak Ray Chowdhury
4,Finite Time Logarithmic Regret Bounds for Self...,tTtSnpH4fc,https://openreview.net/pdf/a7f969be6c3977786d1...,Rahul Singh
9,PairNet: Training with Observed Pairs to Estim...,o5SVr80Rgg,https://openreview.net/pdf/8542c0b81c30deb10c2...,Lokesh Nagalapatti
13,On Online Experimentation without Device Ident...,merZTLSdC9,https://openreview.net/pdf/0615d619c3602e2d636...,
14,PIPER: Primitive-Informed Preference-based Hie...,l6Hef6FVd0,https://openreview.net/pdf/486d2e3f8b1d6cd84cc...,Utsav Singh
16,Agnostic Learning of Mixed Linear Regressions ...,eo88noTbb5,https://openreview.net/pdf/17a75d2d4ceb5ac28e3...,Avishek Ghosh
17,Submodular framework for structured-sparse opt...,bfQCO9Vqhk,https://openreview.net/pdf/15e9e2ce1a2a6304834...,
21,Testing the Feasibility of Linear Programs wit...,TfwGtfPkhV,https://openreview.net/pdf/a89b112bea8fde3d1e9...,Aditya Gopalan
22,Tandem Transformers for Inference Efficient LLMs,TN3fi7dwPo,https://openreview.net/pdf/c14bbb8bb8638326ffa...,Aishwarya P S
23,How Far Can Fairness Constraints Help Recover ...,RfQT6vJt8b,https://openreview.net/pdf/409965515edccbc49d3...,mohit sharma


In [None]:
squeel = """SELECT title, paper_id, pdf_url, full_name, position, affiliation_name, affiliation_domain, affiliation_country FROM ( SELECT * 
FROM paper_authors 
JOIN authors ON author_id = authors.id 
JOIN papers ON paper_id = papers.id 
JOIN venue_infos ON papers.venue_info_id = venue_infos.id)

WHERE conference = 'ICML'
and year = '2024'
and track = 'Conference';
"""

ddf = pd.read_sql_query(squeel, conn)


kdf = process_papers(ddf)

kdf["pdf_url"] = "http://openreview.net"+ kdf["pdf_url"]

kdf

Unnamed: 0,paper_title,pdf_url,author_list,top_author_from_india,majority_authors_from_india
0,Total Variation Distance Meets Probabilistic I...,http://openreview.net/pdf/e6ffa80506c26dd3909c...,"[Arnab Bhattacharyya, Sutanu Gayen, Kuldeep S....",False,False
1,Online Matrix Completion: A Collaborative Appr...,http://openreview.net/pdf/a47b0150343c512aed14...,"[Dheeraj Baby, Soumyabrata Pal]",False,True
2,Risk Estimation in a Markov Cost Process: Lowe...,http://openreview.net/pdf/61c68b6da42e9185837b...,"[, Prashanth L. A., ]",True,True
3,WISER: Weak Supervision and Supervised Represe...,http://openreview.net/pdf/07f74f6818e7261951b0...,"[Kumar Shubham, Aishwarya Jayagopal, Syed Moha...",True,True
4,Centralized Selection with Preferences in the ...,http://openreview.net/pdf/a277d91287831c2afa5e...,"[L. Elisa Celis, Amit Kumar, Nisheeth K. Vishnoi]",False,False
5,OAK: Enriching Document Representations using ...,http://openreview.net/pdf/c0530913b6c22d453b23...,"[Shikhar Mohan, Deepak Saini, Anshul Mittal, S...",True,True
6,Weakly Convex Regularisers for Inverse Problem...,http://openreview.net/pdf/b0eb7cb8f2998339c2df...,"[Zakhar Shumaylov, Jeremy Budd, Subhadip Mukhe...",False,False
7,Representation Surgery: Theory and Practice of...,http://openreview.net/pdf/b913a137f582939ceef4...,"[Shashwat Singh, , Jonathan Herzig, Roee Aharo...",True,False
8,Causal Discovery with Fewer Conditional Indepe...,http://openreview.net/pdf/23331b236f09db1c4293...,"[Kirankumar Shiragur, Jiaqi Zhang, ]",True,False
9,Enhancing Trajectory Prediction through Self-S...,http://openreview.net/pdf/6f5199a867ae97c4ee3f...,[Pravendra Singh],True,True
