In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset

# Expose the project's ``src`` directory (resolved relative to the current
# working directory) to the import system. The membership guard keeps re-runs
# of this cell from piling duplicate entries onto sys.path.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Parent of the current working directory; later cells build data paths from it.
project_dir = Path.cwd().parent


In [2]:
# Serotype labels and one weight per label (the weights sum to 1).
sero_all = ["DENV-1", "DENV-2", "DENV-3", "DENV-4"]
p_sero = np.array([0.4, 0.3, 0.2, 0.1])

# Both input tables are read from <project_dir>/data/transformed.
transformed_dir = project_dir / "data" / "transformed"

# Delay table, indexed by parsed collection date.
delays_df = pd.read_csv(transformed_dir / "DENG_delays.csv").set_index("Collection date")
delays_df.index = pd.to_datetime(delays_df.index)

# Sum every row across its columns, drop the last two rows, then add the
# remaining totals up per calendar month.
row_totals = delays_df.sum(axis=1)[:-2]
month_of_row = row_totals.index.to_period("M")
y_true_df = row_totals.groupby(month_of_row).sum()
y_true = np.array(y_true_df)

# Record-level table; only rows whose Delay is below 60 are kept.
denv_df = pd.read_csv(transformed_dir / "denv_df.csv")
below_cutoff = denv_df['Delay'] < 60
denv_df = denv_df[below_cutoff]



In [3]:
# Parse the collection dates (a no-op if the column is already datetime).
denv_df['Collection date'] = pd.to_datetime(denv_df['Collection date'])

# Month-start dates (datetime, not Period) spanning the range of delays_df's index.
start_month = pd.to_datetime(delays_df.index.min())
end_month = pd.to_datetime(delays_df.index.max())
month_starts = pd.date_range(start=start_month, end=end_month, freq='MS')
dates = pd.DataFrame({"Collection date": month_starts})

# Count DENV-1 records per (collection date, delay), then spread the delay
# values into columns: one row per collection date, one column per delay.
is_denv1 = denv_df['Sero'] == "DENV-1"
df = (
    denv_df[is_denv1]
    .groupby(['Sero', 'Collection date', 'Delay'])
    .size()
    .reset_index(name='count')
    .pivot(index='Collection date', columns='Delay', values='count')
)

# Keep the pivot index as datetime.
df.index = pd.to_datetime(df.index)

# Per-delay mean count (missing date/delay combinations count as 0),
# normalised so the entries sum to 1.
mean_per_delay = df.fillna(0).mean(0)
p_delay = np.array(mean_per_delay / mean_per_delay.sum())
p_delay

array([0.00124127, 0.0189294 , 0.03827256, 0.04246186, 0.03847944,
       0.03796224, 0.02741143, 0.02063615, 0.01960176, 0.02348073,
       0.01913628, 0.01427463, 0.0160331 , 0.01649858, 0.0157745 ,
       0.02203258, 0.03165244, 0.02192914, 0.02704939, 0.0223429 ,
       0.01794673, 0.02751487, 0.01520559, 0.01013706, 0.01391259,
       0.02963538, 0.03098009, 0.01784329, 0.01742953, 0.01815361,
       0.01820533, 0.01225756, 0.01525731, 0.01417119, 0.01499871,
       0.01530903, 0.01406775, 0.0194466 , 0.01417119, 0.00863719,
       0.01184381, 0.006827  , 0.00713732, 0.0065684 , 0.00806827,
       0.01773985, 0.00796483, 0.00755107, 0.00941298, 0.0063098 ,
       0.02161883, 0.01453323, 0.00470649, 0.00491337, 0.00796483,
       0.00739591, 0.00651668, 0.00884407, 0.00641324, 0.00718904])

In [4]:
# Give each factor its own axis so the three broadcast together as (S, T, D).
# reshape is used instead of indexing with np.newaxis so that this cell is
# idempotent: re-running it leaves the shapes unchanged, whereas np.newaxis
# would keep stacking extra singleton axes onto the already-reshaped arrays.
p_sero = p_sero.reshape(-1, 1, 1)    # shape (S,1,1)
y_true = y_true.reshape(1, -1, 1)    # shape (1,T,1)
p_delay = p_delay.reshape(1, 1, -1)  # shape (1,1,D)

# multiply to get (S,T,D), then round every entry to a whole number
sero_tensor = p_sero * y_true * p_delay
sero_tensor = sero_tensor.round()

print(sero_tensor.shape)  # (S, T, D)

(4, 144, 60)


In [5]:

# Write one CSV per serotype: each file holds that serotype's 2-D slice of
# sero_tensor, with rows labelled by the index of y_true_df.
sero_out_dir = project_dir / "data" / "model" / "sero_dfs"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (needed on a fresh checkout).
sero_out_dir.mkdir(parents=True, exist_ok=True)

for s, sero in enumerate(sero_all):
    sero_df = pd.DataFrame(sero_tensor[s, :, :])
    sero_df.index = y_true_df.index
    sero_df.to_csv(sero_out_dir / f"{sero}.csv", index=True)
    print("Saving sero: ", sero)

Saving sero:  DENV-1
Saving sero:  DENV-2
Saving sero:  DENV-3
Saving sero:  DENV-4
