In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import multiprocessing as mp
import pickle as pkl
import os
from EDAModule.RegionVis import generalRegionVisualiztion

In [3]:
# Locate the raw CSV files: use the Kaggle dataset path when /kaggle/input
# exists, otherwise fall back to the local ./datasets folder.
if os.path.exists("/kaggle/input"):
    files = glob("../input/google-symptom-trends-as-of-october-1st-2022/datasets/*.csv")
else:
    files = glob("datasets/*.csv")

# glob() returns matches in arbitrary, OS-dependent order; sort them so the
# concatenated frame has the same row order on every run.
files = sorted(files)
if not files:
    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in the expected datasets folder")

# Read each CSV and stack them into a single frame with a fresh 0..n-1 index.
# Feeding a generator avoids keeping a separate named list of per-file frames.
df = pd.concat((pd.read_csv(file) for file in files), ignore_index=True)

In [None]:
# Data stratification based on regions.
# Drop NaN before taking the unique values: `df["sub_region_1"] == NaN` never
# matches any row, so a NaN "region" would only produce an empty frame (and
# later a directory literally named "nan").
regions = df["sub_region_1"].dropna().unique()
# One sub-frame per region, in the same order as `regions`.
dfs = [df[df["sub_region_1"] == region] for region in regions]

# Change data resolution to weekly
def weekly(df):
    """Return a weekly-averaged copy of ``df``.

    The ``date`` column is parsed with ``pd.to_datetime``, used as the index
    for ``resample("W")``, and the mean of every numeric column is taken per
    weekly bin. The bin label comes back as a regular ``date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column parseable by ``pd.to_datetime``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per weekly bin with ``date`` plus the numeric columns of ``df``.
    """
    weekly_means = (
        # assign() works on a copy, so the caller's frame (often a slice of a
        # bigger frame) is left untouched and no SettingWithCopyWarning fires.
        df.assign(date=pd.to_datetime(df["date"]))
        .set_index("date")
        .resample("W")
        # Text columns cannot be averaged; pandas >= 2.0 raises TypeError
        # unless they are excluded explicitly.
        .mean(numeric_only=True)
        .reset_index()
    )
    return weekly_means

# Resample every regional frame to weekly resolution, then release the
# per-region source frames, which are not used again in this cell.
dfsweekly = [weekly(region_df) for region_df in dfs]
del dfs

# Store each weekly dataframe in a separate pickle file, one directory per region
for region, region_weekly in zip(regions, dfsweekly):
    region_dir = f"./datasets/weekly/{region}"
    # makedirs creates missing parents (e.g. "weekly") and, with exist_ok,
    # does not fail when the notebook is re-run.
    os.makedirs(region_dir, exist_ok=True)
    with open(f"{region_dir}/dataset.pkl", "wb") as f:
        pkl.dump(region_weekly, f)

del dfsweekly


### Set Case: Georgia 

We will be using Georgia as our case study.

In [None]:
# Load the pickled Georgia frame; the context manager closes the file handle
# as soon as the frame is loaded.
# NOTE: pickle.load can execute arbitrary code — only load pickle files that
# were produced by a trusted source.
with open("./datasets/weekly/Georgia/dataset.pkl", "rb") as f:
    df = pkl.load(f)
# Every column whose name contains the substring "symptom".
symptoms = [col for col in df.columns if 'symptom' in col]

In [None]:
# Plot a missing-data Seaborn heatmap for the Georgia dataset

# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(20, 10))
# Each cell is coloured by df.isnull(): True where the value is missing.
sns.heatmap(df.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Georgia: missing values per column")
plt.show()

Surprisingly, the dataset looks clean — noticeably better than expected.

The only exceptions are one symptom and one missing column. With the main dataset chosen from October, we get a total of 192 weeks' worth of data.

In [None]:
# Plot a distribution Seaborn heatmap of every symptom column for Georgia

# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(20, 10))
# Rows are the rows of df, columns are the symptom columns; colour encodes the value.
sns.heatmap(df[symptoms], cmap="viridis", cbar=True, ax=ax)
ax.set_title("Georgia: symptom values")
plt.show()

In [None]:
# Plot a correlation Seaborn heatmap across the symptom columns for Georgia

# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(20, 10))
# DataFrame.corr() yields the pairwise correlation matrix of the symptom columns.
sns.heatmap(df[symptoms].corr(), cmap="viridis", ax=ax)
ax.set_title("Georgia: correlation between symptoms")
plt.show()

In [None]:
# Run the project's region visualisation helper (EDAModule.RegionVis) on the Georgia frame.
# NOTE(review): the second argument looks like the region's dataset/output directory — confirm against the helper.
generalRegionVisualiztion(df, "./datasets/weekly/Georgia/")