In [1]:
import pandas as pd
from pathlib import Path
import sys
import os

start_year = 2013
end_year = 2023
max_delay = 40

sys.path.append(os.path.abspath('../src'))
project_dir = project_dir = Path.cwd().parent

base_folder_path = project_dir / "data" / "raw" / "counts"
files = [f for f in os.listdir(base_folder_path) if f.endswith('.csv')]

df_list = []
for file in files:
    df = pd.read_csv(os.path.join(base_folder_path, file))
    df = df[['DT_SIN_PRI', 'DT_NOTIFIC']].dropna()
    df.columns = ['Collection date' ,'Submission date']
    df = df.apply(pd.to_datetime, errors='coerce')
    print(df['Collection date'].max())
    
    df = df[
    (df["Collection date"].dt.year >= start_year) & 
    (df["Collection date"].dt.year <= end_year)
    ]
    df = df.sort_values(by='Collection date')
    df['Delay'] = (df['Submission date'] - df['Collection date']).dt.days
    df_list.append(df)

deng_df = pd.concat(df_list, ignore_index=True)
print(deng_df)





  df = pd.read_csv(os.path.join(base_folder_path, file))


2022-01-01 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2021-01-02 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2022-12-31 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2023-12-30 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2018-12-29 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2019-12-28 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2015-01-03 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2016-03-13 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2017-12-30 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2106-05-09 00:00:00


  df = pd.read_csv(os.path.join(base_folder_path, file))


2013-12-28 00:00:00
         Collection date Submission date  Delay
0             2021-01-03      2021-02-05     33
1             2021-01-03      2021-01-20     17
2             2021-01-03      2021-01-06      3
3             2021-01-03      2021-01-13     10
4             2021-01-03      2021-01-05      2
...                  ...             ...    ...
16322086      2013-12-28      2013-12-28      0
16322087      2013-12-28      2013-12-28      0
16322088      2013-12-28      2013-12-28      0
16322089      2013-12-28      2013-12-28      0
16322090      2013-12-28      2013-12-28      0

[16322091 rows x 3 columns]


In [2]:
# Filter out rows with delays greater than max_delay
deng_df = deng_df[
    (deng_df['Delay'] < max_delay) &
    (deng_df['Delay'] >= 0)
    ]

# Now want to create contingency table for every symptom date for each delay 
deng_delays = pd.crosstab(deng_df['Collection date'], deng_df['Delay'])

# Check for any NAs across all (Delay,Symptom combinations)
(deng_delays.isna().sum() > 0).sum()

# Ensure all days between start_year and end_year are present
all_days = pd.DataFrame({
    "Collection date": pd.date_range(
        start=f"{start_year}-01-01", end=f"{end_year}-12-31")
})
print(all_days)
deng_delays = all_days.merge(deng_delays, on="Collection date", how="left")
deng_delays = deng_delays.fillna(0)

# Rename cols for clarity
deng_delays.columns = [deng_delays.columns[0]] + [f"delay_{col}" for col in deng_delays.columns[1:]]
deng_delays

     Collection date
0         2013-01-01
1         2013-01-02
2         2013-01-03
3         2013-01-04
4         2013-01-05
...              ...
4012      2023-12-27
4013      2023-12-28
4014      2023-12-29
4015      2023-12-30
4016      2023-12-31

[4017 rows x 1 columns]


Unnamed: 0,Collection date,delay_0,delay_1,delay_2,delay_3,delay_4,delay_5,delay_6,delay_7,delay_8,...,delay_30,delay_31,delay_32,delay_33,delay_34,delay_35,delay_36,delay_37,delay_38,delay_39
0,2013-01-01,229.0,592.0,717.0,621.0,230.0,104.0,442.0,268.0,186.0,...,7.0,13.0,9.0,5.0,14.0,13.0,4.0,6.0,5.0,0.0
1,2013-01-02,407.0,640.0,546.0,283.0,169.0,478.0,268.0,252.0,177.0,...,4.0,8.0,13.0,21.0,17.0,10.0,6.0,11.0,1.0,0.0
2,2013-01-03,431.0,686.0,315.0,239.0,578.0,392.0,294.0,222.0,133.0,...,0.0,1.0,32.0,17.0,3.0,7.0,5.0,2.0,0.0,1.0
3,2013-01-04,570.0,412.0,272.0,801.0,523.0,429.0,318.0,206.0,41.0,...,0.0,24.0,24.0,6.0,4.0,4.0,4.0,2.0,3.0,0.0
4,2013-01-05,568.0,368.0,779.0,671.0,538.0,406.0,285.0,72.0,35.0,...,16.0,6.0,16.0,9.0,5.0,1.0,0.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,465.0,979.0,757.0,402.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4013,2023-12-28,574.0,835.0,409.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4014,2023-12-29,503.0,606.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4015,2023-12-30,443.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
base_folder_path = project_dir / "data" / "transformed" / "DENG_delays.csv"
deng_delays.to_csv(base_folder_path, index=False)