In [5]:
import numpy as np
import pandas as pd

import git
import sys
import os

# Make the project-local `modules` package importable: add the git working-tree
# root, located by searching upward from the current directory, to sys.path.
sys.path.append(git.Repo(".", search_parent_directories=True).working_tree_dir)
# Also add the parent of the working directory. `__file__` is not defined in a
# notebook kernel, so the path is anchored on os.getcwd() rather than on the
# location of a script file.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from modules.utils import SYMPTOM_NAMES, SYMPTOM_COLUMNS

In [14]:
# Load the combined dataset; the path is relative to the notebook's directory.
rsv_df = pd.read_csv(filepath_or_buffer="../datasets/combined.csv")

In [16]:
# Load the 2021 media count ratios; the path is relative to the notebook's directory.
media_df = pd.read_csv(filepath_or_buffer="../datasets/media_count_ratio_all_2021.csv")

In [17]:
# Source: https://www.cdc.gov/mmwr/volumes/69/wr/pdfs/mm6928a2-H.pdf
# Each weight is the fraction of people who reported the symptom.
# The report does not give numerical data for every symptom, so the values for
# anosmia, ageusia, nasal congestion, and nausea were read off its chart.
symptom_weights = dict(
    [
        ("symptom:fever", 0.8),
        ("symptom:chills", 0.63),
        ("symptom:cough", 0.84),
        ("symptom:shortness of breath", 0.57),
        ("symptom:sore throat", 0.40),
        ("symptom:headache", 0.59),
        ("symptom:fatigue", 0.62),
        ("symptom:muscle weakness", 0.63),
        ("symptom:anosmia", 0.20),
        ("symptom:ageusia", 0.20),
        ("symptom:nasal congestion", 0.10),
        ("symptom:nausea", 0.34),
        ("symptom:vomiting", 0.13),
        ("symptom:diarrhea", 0.38),
    ]
)

In [18]:
# Methodology: the combined value for each row is the weighted sum of its
# symptom columns, i.e. each RSV value multiplied by the corresponding symptom
# weight and then added up.
symptom_cols = list(SYMPTOM_COLUMNS)
# Weights in the same order as the selected columns; a symptom that has no
# entry in symptom_weights raises KeyError here.
weight_vector = np.array([symptom_weights[symptom] for symptom in symptom_cols])
# One matrix-vector product computes every row at once instead of looping with
# iterrows(). A NaN in any symptom column still yields NaN for that row.
rsv_df["symptom:combined"] = rsv_df[symptom_cols].to_numpy(dtype=float) @ weight_vector

In [19]:
# Render rsv_df for inspection (the cell's last expression is displayed).
rsv_df

Unnamed: 0,date,symptom:ageusia,symptom:anosmia,symptom:chills,symptom:cough,symptom:diarrhea,symptom:fatigue,symptom:fever,symptom:headache,symptom:muscle weakness,...,symptom:nausea,symptom:shortness of breath,symptom:sore throat,symptom:vomiting,daily_new_positives,cumulative_positives,num_daily_tests,cumulative_tests,test_positivity_rate,symptom:combined
0,2021-01-01,0.32,0.37,0.30,3.75,3.50,3.72,3.49,4.49,0.16,...,2.12,0.65,1.40,2.85,15074,1005785,202446,25706759,0.0745,14.8431
1,2021-01-02,0.36,0.42,0.30,3.90,3.38,3.95,3.65,4.08,0.16,...,1.90,0.69,1.43,2.40,11368,1017153,142345,25849104,0.0799,14.8717
2,2021-01-03,0.34,0.46,0.29,3.88,3.39,4.03,3.69,3.83,0.17,...,1.84,0.70,1.40,2.37,11209,1028362,134360,25983464,0.0834,14.7662
3,2021-01-04,0.28,0.34,0.31,3.93,3.37,4.34,4.00,3.93,0.22,...,1.85,0.77,1.42,2.27,12666,1041028,152402,26135866,0.0831,15.3282
4,2021-01-05,0.27,0.32,0.33,3.98,3.38,4.42,4.11,4.31,0.22,...,1.81,0.74,1.40,2.14,16648,1057676,197816,26333682,0.0842,15.6898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2021-12-27,0.30,0.41,0.59,11.15,4.21,4.14,8.29,4.26,0.19,...,2.26,0.80,4.22,3.26,40780,3251712,210996,87447878,0.1933,27.0476
361,2021-12-28,0.31,0.43,0.63,11.92,4.18,4.41,9.06,4.57,0.21,...,2.27,0.82,4.36,3.19,67090,3318802,362594,87810472,0.1850,28.7918
362,2021-12-29,0.33,0.43,0.58,11.88,4.15,4.29,8.78,4.48,0.20,...,2.29,0.83,4.36,3.15,74207,3393009,336469,88146941,0.2205,28.3798
363,2021-12-30,0.34,0.43,0.53,11.46,4.12,4.11,8.28,4.46,0.20,...,2.25,0.83,4.22,3.06,76555,3469564,339853,88486794,0.2253,27.3794


In [20]:
# Methodology: the combined value for each row is the weighted sum of its
# symptom columns, i.e. each media count ratio multiplied by the corresponding
# symptom weight and then added up.
symptom_cols = list(SYMPTOM_COLUMNS)
# Weights in the same order as the selected columns; a symptom that has no
# entry in symptom_weights raises KeyError here.
weight_vector = np.array([symptom_weights[symptom] for symptom in symptom_cols])
# One matrix-vector product computes every row at once instead of looping with
# iterrows(). A NaN in any symptom column still yields NaN for that row.
media_df["symptom:combined"] = media_df[symptom_cols].to_numpy(dtype=float) @ weight_vector

In [21]:
# Attach the daily positive counts to the media frame. Rows are matched on the
# "date" column rather than on row position, so a difference in row order or
# length between the two files cannot silently pair counts with the wrong day.
# Dates in media_df with no match in rsv_df get NaN; the lookup requires the
# dates in rsv_df to be unique.
positives_by_date = rsv_df.set_index("date")["daily_new_positives"]
media_df["daily_new_positives"] = media_df["date"].map(positives_by_date)
media_df

Unnamed: 0,date,symptom:ageusia,symptom:anosmia,symptom:chills,symptom:cough,symptom:diarrhea,symptom:fatigue,symptom:fever,symptom:headache,symptom:loss of smell,symptom:loss of taste,symptom:muscle weakness,symptom:nasal congestion,symptom:nausea,symptom:shortness of breath,symptom:sore throat,symptom:vomiting,symptom:combined,daily_new_positives
0,2021-01-01,0.0,0.000000,0.001309,0.002618,0.001745,0.006981,0.010471,0.002182,0.000000,0.000873,0.0,0.0,0.000000,0.002182,0.000436,0.002618,0.019437,15074
1,2021-01-02,0.0,0.000604,0.000000,0.004831,0.000604,0.019928,0.003019,0.000000,0.001208,0.000604,0.0,0.0,0.000000,0.000604,0.000000,0.001208,0.019680,11368
2,2021-01-03,0.0,0.000000,0.002271,0.002839,0.000000,0.005679,0.007382,0.002839,0.000568,0.001136,0.0,0.0,0.000000,0.001136,0.000000,0.000000,0.015565,11209
3,2021-01-04,0.0,0.000000,0.002441,0.002197,0.000244,0.004150,0.005371,0.002441,0.000000,0.000488,0.0,0.0,0.001465,0.001465,0.000488,0.000244,0.013347,12666
4,2021-01-05,0.0,0.000000,0.001320,0.001320,0.000660,0.007038,0.004838,0.001539,0.001539,0.000660,0.0,0.0,0.001539,0.001320,0.001320,0.000880,0.013250,16648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2021-12-27,0.0,0.000000,0.039474,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.024868,40780
361,2021-12-28,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,67090
362,2021-12-29,0.0,0.000000,0.000000,0.025641,0.000000,0.000000,0.025641,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.025641,0.000000,0.000000,0.056667,74207
363,2021-12-30,0.0,0.000000,0.000000,0.000000,0.007143,0.007143,0.028571,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.007143,0.007143,0.033786,76555


In [23]:
# Write both frames back to the CSV files they were loaded from, now including
# the added columns; the DataFrame index is not written out.
for frame, destination in (
    (rsv_df, "../datasets/combined.csv"),
    (media_df, "../datasets/media_count_ratio_all_2021.csv"),
):
    frame.to_csv(destination, index=False)