In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import multiprocessing as mp
import pickle as pkl
import os
import re

In [3]:
# Detect whether the notebook is running on Kaggle and locate the yearly CSVs.
kaggle = os.path.exists("/kaggle/input")
if kaggle:
    files = glob("../input/google-symptom-trends-as-of-october-1st-2022/datasets/20??_country_daily_20??_US_daily_symptoms_dataset.csv")
else:
    files = glob("datasets/20??_country_daily_20??_US_daily_symptoms_dataset.csv")
    # Imported only for local runs; on Kaggle this name stays undefined.
    from EDAModule.RegionVis import generalRegionVisualiztion

# glob() gives no ordering guarantee; sort so the concatenated row order is
# the same on every run and every machine.
files = sorted(files)
if not files:
    # Fail with an actionable message instead of pandas' opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError("No symptom CSV files matched the expected dataset path pattern.")

# Feed the frames to concat lazily so no temporary list has to be deleted.
df = pd.concat((pd.read_csv(file) for file in files), ignore_index=True)

In [None]:
# Data Stratification based on regions
# Drop missing region labels first: a NaN label never compares equal to
# anything, so it would only produce an empty frame (and later a "nan" folder).
regions = df["sub_region_1"].dropna().unique()
dfs = [df[df["sub_region_1"] == region] for region in regions]

# Change data resolution to weekly
def weekly(df):
    """Downsample a frame with a ``date`` column to weekly means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per week (pandas ``"W"`` bins) with ``date`` restored as a
        regular column and the mean of every numeric column. Non-numeric
        columns are dropped because they cannot be averaged.
    """
    return (
        # assign() works on a copy, so the caller's frame (often a slice of
        # the full dataset) is never written to.
        df.assign(date=pd.to_datetime(df["date"]))
        .set_index("date")
        .resample("W")
        # numeric_only=True is required on pandas >= 2, where averaging
        # string columns raises TypeError instead of silently dropping them.
        .mean(numeric_only=True)
        .reset_index()
    )

# Convert every per-region frame to weekly resolution.
dfsweekly = [weekly(region_df) for region_df in dfs]
del dfs

# Store each region's weekly dataframe in its own pickle file
for region, region_weekly in zip(regions, dfsweekly):
    # This must be an f-string: without the prefix a literal "{region}" folder
    # is created and the open() below fails for the real region path.
    region_dir = f"./datasets/weekly/{region}"
    # exist_ok=True makes re-running the cell safe without a try/except.
    os.makedirs(region_dir, exist_ok=True)
    with open(f"{region_dir}/dataset.pkl", "wb") as f:
        pkl.dump(region_weekly, f)

del dfsweekly


### Set Case: Georgia 

We will be using Georgia as our case study.

In [None]:
# Load the Georgia weekly dataset; the context manager closes the file handle
# (the pickle is one this notebook wrote itself, not untrusted input).
with open("./datasets/weekly/Georgia/dataset.pkl", "rb") as f:
    df = pkl.load(f)
# Every column whose name contains 'symptom' is treated as a symptom series.
symptoms = [col for col in df.columns if 'symptom' in col]

In [None]:
# Plot a missing data Seaborn heatmap for the Georgia dataset

# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(20, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(df.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing values in the Georgia weekly dataset")
plt.show()

The dataset looks clean, and actually better than expected.

The exceptions are one symptom and one missing column. With the main dataset chosen from October, we get a total of 192 weeks' worth of data.

In [None]:
# Plot a distribution Seaborn heatmap of every symptom column for Georgia

# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(20, 10))
# Rows are the weekly records, columns are the symptom series.
sns.heatmap(df[symptoms], cmap="viridis", cbar=True, ax=ax)
ax.set_title("Weekly symptom values for Georgia")
plt.show()

In [None]:
# Plot a correlation Seaborn heatmap of the symptom columns for Georgia

# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(20, 10))
# DataFrame.corr() gives the pairwise correlation matrix of the symptom columns.
sns.heatmap(df[symptoms].corr(), cmap="viridis", ax=ax)
ax.set_title("Pairwise correlation between symptoms (Georgia)")
plt.show()

In [None]:
# generalRegionVisualiztion is only imported for local (non-Kaggle) runs.
# Checking the flag directly, rather than catching NameError, keeps a genuine
# NameError raised inside the helper from being silently swallowed.
if not kaggle:
    generalRegionVisualiztion(df, "./datasets/weekly/Georgia/")

### Missing Data checkpoint

After looking at the missing-data heatmaps for every region, it is clear that handling the gaps is going to take real work.

States with the most missing data: Alaska, Delaware (I thought Biden was from here), District of Columbia, Hawaii, Idaho, Maine, Mississippi, Montana, Nebraska, New Hampshire, New Mexico, North Dakota, Rhode Island, South Dakota, Utah, Vermont, West Virginia, Wyoming.

States with a bearable amount of missing data: Alabama, Arkansas, Connecticut, Iowa, Kansas, Kentucky, Louisiana, Minnesota, Missouri, Nevada, Oregon, Oklahoma, South Carolina, Wisconsin.

#### Given these findings, the best approach would be to train the model on states with more complete data, such as Florida, California, Georgia, Texas, New York, and others.

The popularity of the term would be conserved even if the differential privacy threshold doesn't hold. 

##### Using EDDI from Project-Azua to check the missing data 


In [None]:
# Impute 0 to missing data

for region in regions:
    region_path = f"./datasets/weekly/{region}/dataset.pkl"
    # Read and close the pickle before reopening the same path for writing;
    # previously the read handle was left open and never closed.
    with open(region_path, "rb") as f:
        df = pkl.load(f)
    df = df.fillna(0)
    with open(region_path, "wb") as f:
        pkl.dump(df, f)

### Trend Visualization 

Let's see if the data joining worked and if we need to make adjustments for that in the early steps.

In [None]:
# Plot cough, fever, hypoxemia symptoms for Georgia dataset
searchsymptoms = ["cough", "fever", "hypoxemia"]

# Load Georgia dataset (the context manager closes the file handle)
with open("./datasets/weekly/Georgia/dataset.pkl", "rb") as f:
    df = pkl.load(f)

# Find columns whose name matches any of the search terms (case-insensitive regex search)
symptoms = [col for col in df.columns if any(re.search(search, col, re.IGNORECASE) for search in searchsymptoms)]

# Plot the symptoms
# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(20, 10))
for symptom in symptoms:
    ax.plot(df["date"], df[symptom], label=symptom)
ax.set_xlabel("date")
ax.set_title("Weekly symptom trends for Georgia")
# The labels passed to plot() are only shown once a legend is drawn; skip it
# when nothing matched to avoid matplotlib's "no labelled artists" warning.
if symptoms:
    ax.legend()
plt.show()

## Canonical Correlation Analysis

### CCA

CCA is a multivariate analysis technique that is used to find linear relationships between two sets of variables. It is a generalization of the Pearson correlation coefficient, which measures the linear relationship between two individual variables.


In [None]:
if kaggle:
    df = pd.read_csv("../input/cdc-covid-tracker-dataset-for-us/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
else:
    df = pd.read_csv("./datasets/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")

# Parse the submission date once on the full frame, so the per-state slices
# below are only read, never written to (avoids SettingWithCopyWarning).
df['date'] = pd.to_datetime(df['submission_date'])

# Stratify the data by state
states = df['state'].unique()
dfs = [df[df['state'] == region] for region in states]

# Aggregate the data by week. The results are kept in a list aligned with
# `states`; previously each weekly frame was assigned to the loop variable and
# thrown away on the next iteration. numeric_only=True keeps string columns
# from being concatenated (or raising) during the sum.
dfsweekly = [state_df.resample('W', on='date').sum(numeric_only=True) for state_df in dfs]

# TODO: select the columns we want from each weekly frame.
# NOTE(review): if the source has cumulative-total columns, summing them per
# week is not meaningful -- confirm which columns to keep.


In [None]:
# Correlation analysis of the symptoms 