In [1]:
import pandas as pd
from pathlib import Path
import sys
import os
import warnings

start_year = 2012
end_year = 2023
max_delay = 40

sys.path.append(os.path.abspath('../src'))
project_dir = project_dir = Path.cwd().parent

base_folder_path = project_dir / "data" / "raw" / "counts"
files = [f for f in os.listdir(base_folder_path) if f.endswith('.csv')]

df_list = []
for file in files:
    print(f"Loading: {file}")
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
        df = pd.read_csv(os.path.join(base_folder_path, file))
        df = pd.read_csv(os.path.join(base_folder_path, file))
    df = df[['DT_SIN_PRI', 'DT_NOTIFIC']].dropna()
    df.columns = ['Collection date' ,'Submission date']
    df = df.apply(pd.to_datetime, errors='coerce')
   
    
    df = df[
    (df["Collection date"].dt.year >= start_year) & 
    (df["Collection date"].dt.year <= end_year)
    ].copy()
    df = df.sort_values(by='Collection date')
    df['Delay'] = (df['Submission date'] - df['Collection date']).dt.days
    df_list.append(df)

deng_df = pd.concat(df_list, ignore_index=True)
deng_df.sort_values("Collection date")
print(deng_df)





Loading: DENGBR21.csv
Loading: DENGBR20.csv
Loading: DENGBR22.csv
Loading: DENGBR23.csv
Loading: DENGBR18.csv
Loading: DENGBR19.csv
Loading: DENGBR14.csv
Loading: DENGBR15.csv
Loading: DENGBR17.csv
Loading: DENGBR16.csv
Loading: DENGBR12.csv
Loading: DENGBR13.csv
         Collection date Submission date  Delay
0             2021-01-03      2021-02-05     33
1             2021-01-03      2021-01-20     17
2             2021-01-03      2021-01-06      3
3             2021-01-03      2021-01-13     10
4             2021-01-03      2021-01-05      2
...                  ...             ...    ...
17278703      2013-12-28      2013-12-28      0
17278704      2013-12-28      2013-12-28      0
17278705      2013-12-28      2013-12-28      0
17278706      2013-12-28      2013-12-28      0
17278707      2013-12-28      2013-12-28      0

[17278708 rows x 3 columns]


In [2]:
# Filter out rows with delays greater than max_delay and nonsensical data
deng_df = deng_df[
    (deng_df['Delay'] < max_delay) &
    (deng_df['Delay'] >= 0)
    ]

# Now want to create contingency table for every symptom date for each delay 
deng_delays = pd.crosstab(deng_df['Collection date'], deng_df['Delay'])

# Check for any NAs across all (Delay,Symptom combinations)
(deng_delays.isna().sum() > 0).sum()

# Ensure all days between start_year and end_year are present
all_days = pd.DataFrame({
    "Collection date": pd.date_range(
        start=f"{start_year}-01-01", end=f"{end_year}-12-31")
})
print(all_days)
deng_delays = all_days.merge(deng_delays, on="Collection date", how="left")
deng_delays = deng_delays.fillna(0)

# Rename cols for clarity
deng_delays.columns = [deng_delays.columns[0]] + [f"delay_{col}" for col in deng_delays.columns[1:]]
deng_delays

     Collection date
0         2012-01-01
1         2012-01-02
2         2012-01-03
3         2012-01-04
4         2012-01-05
...              ...
4378      2023-12-27
4379      2023-12-28
4380      2023-12-29
4381      2023-12-30
4382      2023-12-31

[4383 rows x 1 columns]


Unnamed: 0,Collection date,delay_0,delay_1,delay_2,delay_3,delay_4,delay_5,delay_6,delay_7,delay_8,...,delay_30,delay_31,delay_32,delay_33,delay_34,delay_35,delay_36,delay_37,delay_38,delay_39
0,2012-01-01,129.0,268.0,318.0,341.0,278.0,207.0,47.0,33.0,126.0,...,8.0,6.0,10.0,4.0,1.0,1.0,5.0,5.0,2.0,2.0
1,2012-01-02,243.0,406.0,370.0,394.0,270.0,60.0,22.0,238.0,114.0,...,0.0,6.0,9.0,1.0,1.0,7.0,6.0,4.0,2.0,3.0
2,2012-01-03,295.0,446.0,446.0,333.0,82.0,49.0,220.0,135.0,90.0,...,3.0,4.0,1.0,1.0,4.0,7.0,1.0,0.0,2.0,0.0
3,2012-01-04,282.0,520.0,375.0,144.0,68.0,231.0,158.0,120.0,77.0,...,0.0,0.0,0.0,3.0,3.0,0.0,0.0,1.0,1.0,1.0
4,2012-01-05,302.0,439.0,147.0,102.0,311.0,186.0,144.0,109.0,93.0,...,0.0,1.0,13.0,2.0,5.0,2.0,5.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378,2023-12-27,465.0,979.0,757.0,402.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4379,2023-12-28,574.0,835.0,409.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4380,2023-12-29,503.0,606.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4381,2023-12-30,443.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
base_folder_path = project_dir / "data" / "transformed" / "DENG_delays.csv"
deng_delays.to_csv(base_folder_path, index=False)