In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
def list_submissions():
    root_dir = Path('../submissions')
    data = []

    for file_path in root_dir.rglob('*.csv'):
        # Expecting path: submissions/source/disease/model/file.csv
        try:
            source = file_path.parts[2]
            disease = file_path.parts[3]
            model = file_path.parts[4]
            filename = file_path.name

            data.append({
                'source': source,
                'disease': disease,
                'model': model,
                'filename': filename
            })
        except IndexError:
            print(f"Skipping malformed path: {file_path}")

    df = pd.DataFrame(data)

    df['date'] = df.filename.apply(lambda x: x[:10])
    df = df[df.date.between('2024-10-17', '2025-03-27')]
    df = df[~df.date.isin(['2024-12-26', '2025-01-02'])]
    df = df[~((df.model == 'MPIDS-PS_embedding') & (df.source == 'survstat'))]
    
    return df

In [3]:
df = list_submissions()

In [4]:
df

Unnamed: 0,source,disease,model,filename,date
0,agi,are,MPIDS-PS_embedding,2025-02-13-agi-are-MPIDS-PS_embedding.csv,2025-02-13
1,agi,are,MPIDS-PS_embedding,2025-03-13-agi-are-MPIDS-PS_embedding.csv,2025-03-13
2,agi,are,MPIDS-PS_embedding,2024-10-17-agi-are-MPIDS-PS_embedding.csv,2024-10-17
3,agi,are,MPIDS-PS_embedding,2024-11-07-agi-are-MPIDS-PS_embedding.csv,2024-11-07
4,agi,are,MPIDS-PS_embedding,2024-12-19-agi-are-MPIDS-PS_embedding.csv,2024-12-19
...,...,...,...,...,...
911,survstat,influenza,KIT-epinowcast,2024-11-28-survstat-influenza-KIT-epinowcast.csv,2024-11-28
913,survstat,influenza,KIT-epinowcast,2025-02-06-survstat-influenza-KIT-epinowcast.csv,2025-02-06
914,survstat,influenza,KIT-epinowcast,2025-01-09-survstat-influenza-KIT-epinowcast.csv,2025-01-09
915,survstat,influenza,KIT-epinowcast,2025-03-13-survstat-influenza-KIT-epinowcast.csv,2025-03-13


In [5]:
df.groupby(['source', 'disease', 'model'])['date'].nunique().reset_index(name='n_dates')

Unnamed: 0,source,disease,model,n_dates
0,agi,are,KIT-LightGBM,22
1,agi,are,KIT-TSMixer,22
2,agi,are,KIT-hhh4,22
3,agi,are,KIT-simple_nowcast,22
4,agi,are,MPIDS-PS_embedding,22
5,icosari,sari,KIT-LightGBM,22
6,icosari,sari,KIT-MeanEnsemble,22
7,icosari,sari,KIT-TSMixer,22
8,icosari,sari,KIT-epinowcast,22
9,icosari,sari,KIT-hhh4,22


In [6]:
# influenza: KIT-epinowcast, RIVM-GAM: 2024-11-21 
# rsv: KIT-simple_nowcast: 2025-01-30

In [7]:
def combine_submissions():
    df_submissions = list_submissions()
      
    dfs = []
    for _, row in df_submissions.iterrows():
        file_path = f"../submissions/{row['source']}/{row['disease']}/{row['model']}/{row['filename']}"
        
        if os.path.exists(file_path):
            try:
                df_temp = pd.read_csv(file_path)
                df_temp['source'] = row['source']
                df_temp['disease'] = row['disease']
                df_temp['model'] = row['model']
                dfs.append(df_temp)
            except Exception as e:
                print(f"Failed to load {file_path}: {e}")
        else:
            print(f"File not found: {file_path}")

    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

In [8]:
def determine_level(row):
    if row['location'] == 'DE' and row['age_group'] == '00+':
        return 'national'
    elif row['location'] != 'DE':
        return 'states'
    elif row['location'] == 'DE' and row['age_group'] != '00+':
        return 'age'
    else:
        return 'unknown'

In [9]:
df = combine_submissions()

In [10]:
df['level'] = df.apply(determine_level, axis=1)

In [11]:
df.to_csv('../data/submissions.csv', index=False)

In [12]:
df[(df.location == 'DE') & (df.age_group == '00+') & (df.type == 'quantile')].groupby(['source', 'disease', 'model']).size().reset_index(name='size')

Unnamed: 0,source,disease,model,size
0,agi,are,KIT-LightGBM,616
1,agi,are,KIT-TSMixer,616
2,agi,are,KIT-hhh4,616
3,agi,are,KIT-simple_nowcast,1768
4,agi,are,MPIDS-PS_embedding,616
5,icosari,sari,KIT-LightGBM,616
6,icosari,sari,KIT-MeanEnsemble,616
7,icosari,sari,KIT-TSMixer,616
8,icosari,sari,KIT-epinowcast,616
9,icosari,sari,KIT-hhh4,616


In [13]:
df[(df.location == 'DE') & (df.type == 'quantile')].groupby(['source', 'disease', 'model']).size().reset_index(name='size')

Unnamed: 0,source,disease,model,size
0,agi,are,KIT-LightGBM,3696
1,agi,are,KIT-TSMixer,3696
2,agi,are,KIT-hhh4,3696
3,agi,are,KIT-simple_nowcast,10608
4,agi,are,MPIDS-PS_embedding,616
5,icosari,sari,KIT-LightGBM,4312
6,icosari,sari,KIT-MeanEnsemble,4312
7,icosari,sari,KIT-TSMixer,4312
8,icosari,sari,KIT-epinowcast,4312
9,icosari,sari,KIT-hhh4,4312


In [14]:
df[(df.source == 'icosari') & (df.location == 'DE') & (df.model == 'KIT-MeanEnsemble')].groupby('forecast_date').size()

forecast_date
2024-10-17    196
2024-10-24    196
2024-10-31    196
2024-11-07    196
2024-11-14    196
2024-11-21    196
2024-11-28    196
2024-12-05    196
2024-12-12    196
2024-12-19    196
2025-01-09    196
2025-01-16    196
2025-01-23    196
2025-01-30    196
2025-02-06    196
2025-02-13    196
2025-02-20    196
2025-02-27    196
2025-03-06    196
2025-03-13    196
2025-03-20    196
2025-03-27    196
dtype: int64

In [13]:
df[(df.source == 'icosari') & (df.location == 'DE') & (df.model == 'KIT-MeanEnsemble')].groupby('forecast_date').size()

forecast_date
2024-10-17    196
2024-10-24    196
2024-10-31    196
2024-11-07    196
2024-11-14    196
2024-11-21    196
2024-11-28    196
2024-12-05    196
2024-12-12    196
2024-12-19    196
2025-01-09    196
2025-01-16    196
2025-01-23    196
2025-01-30    196
2025-02-06    196
2025-02-13    196
2025-02-20    196
2025-02-27    196
2025-03-06    392
2025-03-13    196
2025-03-20    196
2025-03-27    196
dtype: int64