In [1]:
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Load the tab-separated papers table; the path is relative to the working directory.
df = pd.read_csv('papers.tsv', sep='\t')

In [2]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,Heading,Paper,Paper-Link,Tweet-Link,Other-Links
0,Top ML Papers of the Week (June 23 - June 29) ...,Ultra-Fast Diffusion-based Language Models Th...,https://arxiv.org/abs/2506.17298,https://x.com/omarsar0/status/1937600372430045494,
1,Top ML Papers of the Week (June 23 - June 29) ...,"MEM1 This work introduces MEM1, an RL framewo...",https://arxiv.org/abs/2506.15841,https://x.com/omarsar0/status/1937252072954691813,
2,Top ML Papers of the Week (June 23 - June 29) ...,Towards AI Search Paradigm Proposes a modular...,https://arxiv.org/abs/2506.17188,https://x.com/omarsar0/status/1937161765604692400,
3,Top ML Papers of the Week (June 23 - June 29) ...,Reinforcement-Learned Teachers of Test Time Sc...,https://www.arxiv.org/abs/2506.08388,https://x.com/SakanaAILabs/status/193696584118...,
4,Top ML Papers of the Week (June 23 - June 29) ...,"DeepRare Introduces DeepRare, a modular agent...",https://arxiv.org/abs/2506.20430,https://x.com/omarsar0/status/1938256196626153624,
...,...,...,...,...,...
1295,Top ML Papers of the Week (Jan 1-8),Large Language Models as Corporate Lobbyists,https://arxiv.org/abs/2301.01181,https://twitter.com/dair_ai/status/16121531063...,"{""Code"": ""https://github.com/JohnNay/llm-lobby..."
1296,Top ML Papers of the Week (Jan 1-8),"Superposition, Memorization, and Double Descent",https://transformer-circuits.pub/2023/toy-doub...,https://twitter.com/dair_ai/status/16121531084...,
1297,Top ML Papers of the Week (Jan 1-8),StitchNet: Composing Neural Networks from Pre-...,https://arxiv.org/abs/2301.01947,https://twitter.com/dair_ai/status/16121531104...,
1298,Top ML Papers of the Week (Jan 1-8),Iterated Decomposition: Improving Science Q\&A...,https://arxiv.org/abs/2301.01751,https://twitter.com/dair_ai/status/16121531126...,"{""Code"": ""https://github.com/oughtinc/ice""}"


In [3]:
import re
from datetime import datetime

def extract_end_date(heading, default_year=2023):
    """Return the end date of the week named in a heading as ``YYYY-MM-DD``.

    The heading is expected to contain a parenthesised date range such as
    ``(June 23 - June 29)`` or ``(Jan 1-8)``, optionally followed by a
    trailing ``- YYYY`` year suffix.

    Args:
        heading: Heading text. Non-string values (for example NaN coming from
            a missing cell) yield ``None`` instead of raising.
        default_year: Year assumed when the heading carries no trailing
            ``- YYYY`` suffix.

    Returns:
        The end date formatted as ``YYYY-MM-DD``, or ``None`` when the heading
        has no parenthesised two-part range or the end date cannot be parsed.
    """
    # Guard against NaN/None so Series.apply does not blow up on missing cells.
    if not isinstance(heading, str):
        return None

    # Trailing "- YYYY" suffix. Whitespace around the year is tolerated so a
    # stray trailing space does not silently fall back to the default year.
    year_match = re.search(r'[-–—]\s*(\d{4})\s*$', heading)
    year = int(year_match.group(1)) if year_match else default_year

    # First parenthesised group holds the "start - end" range.
    date_range_match = re.search(r'\((.*?)\)', heading)
    if not date_range_match:
        return None

    # Accept hyphen, en dash or em dash as the range separator.
    parts = re.split(r'[-–—]', date_range_match.group(1))
    if len(parts) != 2:
        return None
    start, end = (part.strip() for part in parts)

    # "Jan 1-8": the end is a bare day number, so borrow the month from start.
    if re.fullmatch(r'\d+', end):
        start_month = re.match(r'[A-Za-z]+', start)
        if start_month:
            end = f"{start_month.group(0)} {end}"

    # Try the full month name first, then the abbreviated one.
    for fmt in ("%B %d %Y", "%b %d %Y"):
        try:
            return datetime.strptime(f"{end} {year}", fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None

# Derive each row's "Week" from its "Heading" text via extract_end_date
# (a YYYY-MM-DD string, or None when the heading cannot be parsed).
# NOTE: this reads the "Heading" column, so it must run before the cell that drops it.
df["Week"] = df["Heading"].apply(extract_end_date)
df

Unnamed: 0,Heading,Paper,Paper-Link,Tweet-Link,Other-Links,Week
0,Top ML Papers of the Week (June 23 - June 29) ...,Ultra-Fast Diffusion-based Language Models Th...,https://arxiv.org/abs/2506.17298,https://x.com/omarsar0/status/1937600372430045494,,2025-06-29
1,Top ML Papers of the Week (June 23 - June 29) ...,"MEM1 This work introduces MEM1, an RL framewo...",https://arxiv.org/abs/2506.15841,https://x.com/omarsar0/status/1937252072954691813,,2025-06-29
2,Top ML Papers of the Week (June 23 - June 29) ...,Towards AI Search Paradigm Proposes a modular...,https://arxiv.org/abs/2506.17188,https://x.com/omarsar0/status/1937161765604692400,,2025-06-29
3,Top ML Papers of the Week (June 23 - June 29) ...,Reinforcement-Learned Teachers of Test Time Sc...,https://www.arxiv.org/abs/2506.08388,https://x.com/SakanaAILabs/status/193696584118...,,2025-06-29
4,Top ML Papers of the Week (June 23 - June 29) ...,"DeepRare Introduces DeepRare, a modular agent...",https://arxiv.org/abs/2506.20430,https://x.com/omarsar0/status/1938256196626153624,,2025-06-29
...,...,...,...,...,...,...
1295,Top ML Papers of the Week (Jan 1-8),Large Language Models as Corporate Lobbyists,https://arxiv.org/abs/2301.01181,https://twitter.com/dair_ai/status/16121531063...,"{""Code"": ""https://github.com/JohnNay/llm-lobby...",2023-01-08
1296,Top ML Papers of the Week (Jan 1-8),"Superposition, Memorization, and Double Descent",https://transformer-circuits.pub/2023/toy-doub...,https://twitter.com/dair_ai/status/16121531084...,,2023-01-08
1297,Top ML Papers of the Week (Jan 1-8),StitchNet: Composing Neural Networks from Pre-...,https://arxiv.org/abs/2301.01947,https://twitter.com/dair_ai/status/16121531104...,,2023-01-08
1298,Top ML Papers of the Week (Jan 1-8),Iterated Decomposition: Improving Science Q\&A...,https://arxiv.org/abs/2301.01751,https://twitter.com/dair_ai/status/16121531126...,"{""Code"": ""https://github.com/oughtinc/ice""}",2023-01-08


In [4]:
# Remove the raw "Heading" column. Reassignment is used instead of
# inplace=True; errors='ignore' keeps the cell safe to re-run once the
# column is already gone.
df = df.drop(columns=['Heading'], errors='ignore')

In [5]:
# Preview only the rows whose "Paper-Link" contains "arxiv" (case-insensitive
# substring match); rows with a missing link are treated as non-matching (na=False).
df[df["Paper-Link"].str.contains("arxiv", case=False, na=False)]

Unnamed: 0,Paper,Paper-Link,Tweet-Link,Other-Links,Week
0,Ultra-Fast Diffusion-based Language Models Th...,https://arxiv.org/abs/2506.17298,https://x.com/omarsar0/status/1937600372430045494,,2025-06-29
1,"MEM1 This work introduces MEM1, an RL framewo...",https://arxiv.org/abs/2506.15841,https://x.com/omarsar0/status/1937252072954691813,,2025-06-29
2,Towards AI Search Paradigm Proposes a modular...,https://arxiv.org/abs/2506.17188,https://x.com/omarsar0/status/1937161765604692400,,2025-06-29
3,Reinforcement-Learned Teachers of Test Time Sc...,https://www.arxiv.org/abs/2506.08388,https://x.com/SakanaAILabs/status/193696584118...,,2025-06-29
4,"DeepRare Introduces DeepRare, a modular agent...",https://arxiv.org/abs/2506.20430,https://x.com/omarsar0/status/1938256196626153624,,2025-06-29
...,...,...,...,...,...
1294,ConvNeXt V2: Co-designing and Scaling ConvNets...,https://arxiv.org/abs/2301.00808,https://twitter.com/dair_ai/status/16121531043...,"{""Code"": ""https://github.com/facebookresearch/...",2023-01-08
1295,Large Language Models as Corporate Lobbyists,https://arxiv.org/abs/2301.01181,https://twitter.com/dair_ai/status/16121531063...,"{""Code"": ""https://github.com/JohnNay/llm-lobby...",2023-01-08
1297,StitchNet: Composing Neural Networks from Pre-...,https://arxiv.org/abs/2301.01947,https://twitter.com/dair_ai/status/16121531104...,,2023-01-08
1298,Iterated Decomposition: Improving Science Q\&A...,https://arxiv.org/abs/2301.01751,https://twitter.com/dair_ai/status/16121531126...,"{""Code"": ""https://github.com/oughtinc/ice""}",2023-01-08


In [7]:
# Boolean flag: True when "Paper-Link" contains "arxiv" (case-insensitive
# substring match); a missing link counts as False (na=False).
df["has_arxiv"] = df["Paper-Link"].str.contains("arxiv", case=False, na=False)

In [9]:
# Write the current frame as a tab-separated file, without the index column.
df.to_csv('papers_clean.tsv', sep='\t', index=False)

In [8]:
# Count papers per "Week", split by whether the paper link points at arXiv.
# Rows whose "Week" is missing are left out by groupby's default dropna behaviour.
arxiv_summary = (
    df.groupby(["Week", "has_arxiv"])
    .size()
    .unstack(fill_value=0)
    # Guarantee both columns exist in a fixed order even when every paper
    # (or none) has an arXiv link; otherwise the positional rename below
    # would raise on a length mismatch.
    .reindex(columns=[False, True], fill_value=0)
)
arxiv_summary.columns = ["no_arxiv", "has_arxiv"]
arxiv_summary.head(20)

Unnamed: 0_level_0,no_arxiv,has_arxiv
Week,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-08,2,8
2023-01-15,2,8
2023-01-22,1,9
2023-01-29,1,9
2023-02-05,0,10
2023-02-12,0,10
2023-02-19,1,9
2023-02-26,1,9
2023-03-05,3,7
2023-03-12,1,9
