In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import multiprocessing as mp
import pickle as pkl
import os
import re
from tqdm import tqdm
tqdm.pandas()

# Deep Learning Imports 
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [None]:
# Detect the Kaggle runtime; the dataset lives under a different root there.
kaggle = os.path.exists("/kaggle/input")
if kaggle:
    files = glob("../input/google-symptom-trends-as-of-october-1st-2022/datasets/20??_country_daily_20??_US_daily_symptoms_dataset.csv")
else:
    files = glob("datasets/20??_country_daily_20??_US_daily_symptoms_dataset.csv")
    # Local-only helper: on Kaggle this name stays undefined, so its call site must be guarded.
    from EDAModule.RegionVis import generalRegionVisualiztion

# glob() returns matches in arbitrary order; sort so the concatenated frame is reproducible.
files = sorted(files)
if not files:
    # Fail here with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No yearly symptom CSV files matched the dataset pattern")

# Read the yearly files one at a time and stack them into a single frame.
df = pd.concat((pd.read_csv(file) for file in files), ignore_index=True)

In [None]:
# Data Stratification based on regions
# Rows without a sub_region_1 value are excluded explicitly, rather than by deleting
# whichever value happens to come first in unique().
# NOTE(review): presumably those rows are the country-level aggregate - confirm.
regions = df["sub_region_1"].dropna().unique()
# One daily frame per region, in the same order as `regions`.
dfs = [df[df["sub_region_1"] == region] for region in regions]

# Change data resolution to weekly
def weekly(df):
    """Down-sample a daily frame to weekly means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with one row per "W" resampling bin, ``date`` restored as a
        regular column, and the mean of every numeric column. Non-numeric
        columns are dropped.
    """
    # assign() builds a new frame, so the caller's frame (often a slice of a
    # larger frame) is never written to.
    daily = df.assign(date=pd.to_datetime(df["date"]))
    # numeric_only=True: text columns cannot be averaged and raise on pandas >= 2.0.
    return daily.set_index("date").resample("W").mean(numeric_only=True).reset_index()

dfsweekly = [weekly(df) for df in dfs]
del dfs

# Store each weekly dataframe in its own per-region pickle file
for region, weekly_df in zip(regions, dfsweekly):
    region_dir = f"./datasets/weekly/{region}"
    # exist_ok=True makes re-running the cell safe without swallowing exceptions
    os.makedirs(region_dir, exist_ok=True)
    with open(f"{region_dir}/dataset.pkl", "wb") as f:
        pkl.dump(weekly_df, f)

del dfsweekly


### Set Case: Georgia 

We will be using Georgia as our case study.

In [None]:
# Load the weekly Georgia frame; the context manager closes the file handle.
with open("./datasets/weekly/Georgia/dataset.pkl", "rb") as f:
    df = pkl.load(f)
# Every column whose name contains 'symptom'
symptoms = [col for col in df.columns if 'symptom' in col]

In [None]:
# Plot a missing data Seaborn heatmap for Georgia dataset

# plt.subplots returns (figure, axes) in that order
fig, ax = plt.subplots(figsize=(20, 10))
# Draw on the axes created above instead of relying on the implicit current axes
sns.heatmap(df.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing values in the weekly Georgia dataset")
plt.show()

The Georgia dataset looks clean, and actually better than expected.

The exceptions are one symptom with missing values and one missing column. With the main dataset snapshot taken in October, we get a total of 192 weeks' worth of data.

In [None]:
# Plot a distribution Seaborn heatmap for each symptom dataset for Georgia

# plt.subplots returns (figure, axes) in that order
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df[symptoms], cmap="viridis", cbar=True, ax=ax)
ax.set_title("Weekly symptom values for Georgia")
plt.show()

In [None]:
# Plot a correlation Seaborn heatmap for each symptom dataset for Georgia

# plt.subplots returns (figure, axes) in that order
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df[symptoms].corr(), cmap="viridis", ax=ax)
ax.set_title("Correlation between symptoms for Georgia")
plt.show()

In [None]:
# generalRegionVisualiztion is only imported when running outside Kaggle.
# Checking for the name up front (instead of catching NameError around the call)
# keeps a NameError raised inside the helper itself from being silently swallowed.
if "generalRegionVisualiztion" in globals():
    generalRegionVisualiztion(df, "./datasets/weekly/Georgia/")

In [None]:
# Missing Data Analysis

for region in regions:
    # Context manager closes each pickle; a dedicated name keeps the Georgia `df` intact
    with open(f"./datasets/weekly/{region}/dataset.pkl", "rb") as f:
        region_df = pkl.load(f)
    print(f"{region} has {region_df.isnull().sum().sum()} missing values")

### Missing Data checkpoint

Just saw the missing data heatmaps. Many states have large gaps, so imputation is going to be a substantial piece of work.

States with the most missing data: Alaska, Delaware (I thought Biden was from here), District of Columbia, Hawaii, Idaho, Maine, Mississippi, Montana, Nebraska, New Hampshire, New Mexico, North Dakota, Rhode Island, South Dakota, Utah, Vermont, West Virginia, Wyoming.

States with a bearable amount of missing data: Alabama, Arkansas, Connecticut, Iowa, Kansas, Kentucky, Louisiana, Minnesota, Missouri, Nevada, Oregon, Oklahoma, South Carolina, Wisconsin.

Given this, the best approach would be to train a model on the states that have more complete data, such as Florida, California, Georgia, Texas, New York, and others.

The popularity of the term would be conserved even if the differential privacy threshold doesn't hold. 
Using STRATS to impute the missing data from the other states would be a good idea.


In [None]:
# Impute 0 to missing data

# for region in regions:
#     f = open(f"./datasets/weekly/{region}/dataset.pkl", "rb")
#     df = pkl.load(f)
#     df = df.fillna(0)
#     with open(f"./datasets/weekly/{region}/dataset.pkl", "wb") as f:
#         pkl.dump(df, f)

# Train a model for California, New York, Texas, Florida, Georgia, Illinois, Indiana, Maryland, Massachusetts, Michigan, New Jersey, North Carolina, Ohio, Pennsylvania, Tennessee, Virginia, Washington, Wisconsin to impute missing data for Arkansas

sym = 'symptom:Hemolysis'


def _load_symptom_series(region, symptom):
    """Return one region's weekly values of `symptom` as a Series indexed by date."""
    # Context manager so the pickle file handle is closed after loading
    with open(f"./datasets/weekly/{region}/dataset.pkl", "rb") as f:
        region_df = pkl.load(f)
    return region_df.set_index('date')[symptom]


# Load training data
trainingRegions = ["California", "New York", "Texas", "Florida", "Georgia", "Illinois", "Indiana", "Maryland", "Massachusetts", "Michigan", "New Jersey", "North Carolina", "Ohio", "Pennsylvania", "Tennessee", "Virginia", "Washington", "Wisconsin"]
# Side-by-side concat labels one column per region; the transpose then gives
# one row per region and one column per week.
trainingData = pd.concat(
    [_load_symptom_series(region, sym) for region in trainingRegions],
    axis = 1,
    keys = trainingRegions,
).transpose()



# Load testing data
with open("./datasets/weekly/Arkansas/dataset.pkl", "rb") as f:
    testingData = pkl.load(f).loc[:,['date',sym]]
# Fraction of Arkansas weeks that have no value for the symptom
masking_rate = testingData[sym].isnull().sum()/len(testingData)


# Plot the training and testing data
# plt.subplots returns (figure, axes) in that order
fig, ax = plt.subplots(figsize=(20, 10))
for region in trainingRegions:
    sns.lineplot(data=trainingData.loc[region], label=region, ax=ax)
# sns.lineplot(data=testingData, x="date", y=sym, label="Testing Data", hue = testingData[sym].isna().cumsum(), palette=["orange"]*sum(testingData[sym].isna()) + ["blue"]*(len(testingData) - sum(testingData[sym].isna())), legend=False, markers=True)
ax.set_title(f"{sym}: weekly values for the training regions")
plt.show()

In [None]:
# Making basic encoder and decoder model architecture 

class AE(nn.Module):
    """Fully connected encoder/decoder pair with a sampled latent code.

    The encoder narrows ``input_size`` through four hidden widths down to
    ``latent_size``. Two linear heads turn that into a mean and a
    log-variance, a latent sample is drawn from them, and the decoder mirrors
    the encoder back up to ``input_size``.
    """

    def __init__(self, input_size, hidden_size_1, hidden_size_2, hidden_size_3, hidden_size_4, latent_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size_1 = hidden_size_1
        self.hidden_size_2 = hidden_size_2
        self.hidden_size_3 = hidden_size_3
        self.hidden_size_4 = hidden_size_4
        self.latent_size = latent_size

        outer_widths = (input_size, hidden_size_1, hidden_size_2, hidden_size_3, hidden_size_4)
        self.encoder = nn.Sequential(*self._mlp(outer_widths + (latent_size,)))
        self.encoder_mu = nn.Linear(latent_size, latent_size)
        self.encoder_logvar = nn.Linear(latent_size, latent_size)
        # The decoder walks the same widths in reverse order
        self.decoder = nn.Sequential(*self._mlp((latent_size,) + outer_widths[::-1]))

    @staticmethod
    def _mlp(widths):
        """Build Linear layers for consecutive widths, with a ReLU after every layer but the last."""
        stack = []
        n_links = len(widths) - 1
        for idx in range(n_links):
            stack.append(nn.Linear(widths[idx], widths[idx + 1]))
            if idx + 1 < n_links:
                stack.append(nn.ReLU())
        return stack

    def _encode(self, x):
        """Return ``(mu, logvar, z)`` for ``x``, where ``z`` is sampled from ``mu`` and ``logvar``."""
        hidden = self.encoder(x)
        mu = self.encoder_mu(hidden)
        logvar = self.encoder_logvar(hidden)
        return mu, logvar, self.reparameterize(mu, logvar)

    def forward(self, x):
        """Encode ``x``, draw a latent sample and return its decoding."""
        _, _, z = self._encode(x)
        return self.decoder(z)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``std = exp(0.5 * logvar)`` and standard-normal ``eps``."""
        std = torch.exp(0.5 * logvar)
        noise = torch.randn_like(std)
        return noise * std + mu

    def supervised_forward(self, x):
        """Return the latent statistics and sample ``(mu, logvar, z)`` without decoding."""
        return self._encode(x)

    def unsupervised_forward(self, x):
        """Return the reconstruction of ``x``; same computation as ``forward``."""
        return self.forward(x)

In [None]:
# Making a simple Non-linear Autoencoder model
# Layer width at both ends of the stack built below: the number of columns of trainingData
input_size = trainingData.shape[1]

class FClayer(nn.Module):
    """Hand-written dense layer: ``activation(x @ weight + bias)``.

    ``weight`` has shape ``(input_size, output_size)`` and ``bias`` has shape
    ``(1, output_size)``; both are Xavier-uniform initialised. Passing a falsy
    ``activation`` (e.g. ``None``) makes the layer purely affine.
    """

    def __init__(self, input_size, output_size, activation = nn.ReLU()):
        super().__init__()
        self.input_size, self.output_size = input_size, output_size
        self.activation = activation

        # The zero / one fills are placeholders; both tensors are re-initialised right below
        self.weight = nn.Parameter(torch.zeros(input_size, output_size))
        self.bias = nn.Parameter(torch.ones(1, output_size))
        for param in (self.weight, self.bias):
            torch.nn.init.xavier_uniform_(param)

    def forward(self, x):
        """Apply the affine map, then the activation when one is configured."""
        affine = x @ self.weight + self.bias
        return self.activation(affine) if self.activation else affine

# Layers
# Widths of the symmetric 12-layer stack: input -> 128 -> ... -> 4 -> ... -> 128 -> input.
_layer_widths = [input_size, 128, 64, 32, 16, 8, 4, 8, 16, 32, 64, 128, input_size]

# Layers alternate between ReLU (odd-numbered layers) and no activation (even-numbered layers).
(
    hidden_layer_1, hidden_layer_2, hidden_layer_3, hidden_layer_4,
    hidden_layer_5, hidden_layer_6, hidden_layer_7, hidden_layer_8,
    hidden_layer_9, hidden_layer_10, hidden_layer_11, hidden_layer_12,
) = [
    FClayer(input_size = n_in, output_size = n_out, activation = nn.ReLU() if idx % 2 == 0 else None)
    for idx, (n_in, n_out) in enumerate(zip(_layer_widths[:-1], _layer_widths[1:]))
]

In [None]:
# Source: https://github.com/shobrook/sequitur/blob/master/sequitur/models/lstm_ae.py

class EncoderRNN(nn.Module):
    """Stack of single-layer LSTMs that compresses a sequence into one latent vector.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    latent_size : int
        Length of the returned latent vector.
    hidden_size : sequence of int
        Output widths of the intermediate LSTM layers (may be empty).
    hidden_activation : callable or None
        Applied to the output of every LSTM except the last.
    latent_activation : callable or None
        Applied to the final hidden state before it is returned.
    """

    def __init__(self, input_size, latent_size, hidden_size, hidden_activation, latent_activation):
        super().__init__()
        self.input_size = input_size
        self.latent_size = latent_size
        self.hidden_size = hidden_size

        self.hidden_activation = hidden_activation
        self.latent_activation = latent_activation

        layer_size = [input_size] + list(hidden_size) + [latent_size]
        self.num_layers = len(layer_size) - 1
        self.layers = nn.ModuleList([
            nn.LSTM(n_in, n_out, num_layers = 1, batch_first = True)
            for n_in, n_out in zip(layer_size[:-1], layer_size[1:])
        ])

    def forward(self, x):
        """Return the last LSTM's final hidden state as a 1-D tensor of length ``latent_size``."""
        x = x.unsqueeze(0)
        for i, layer in enumerate(self.layers):
            x, (h_n, c_n) = layer(x)
            if self.hidden_activation is not None and i < self.num_layers - 1:
                x = self.hidden_activation(x)

        if self.latent_activation is not None:
            h_n = self.latent_activation(h_n)
        # reshape(-1) rather than squeeze(): the result stays 1-D even when latent_size == 1
        return h_n.reshape(-1)


class DecoderRNN(nn.Module):
    """Stack of single-layer LSTMs that expands a latent vector into a sequence.

    Note the argument naming: ``input_size`` is the length of the incoming
    latent vector and ``latent_size`` is the number of features produced per
    time step (the column count of ``dense_matrix``).
    """

    def __init__(self, input_size, latent_size, hidden_size, hidden_activation, latent_activation):
        super().__init__()
        self.input_size = input_size
        self.latent_size = latent_size
        self.hidden_size = hidden_size

        self.hidden_activation = hidden_activation
        self.latent_activation = latent_activation

        hidden_widths = list(hidden_size)
        # The last LSTM keeps the width of the final hidden layer. Without hidden
        # layers there is no such width, so fall back to the input width instead
        # of raising IndexError on hidden_size[-1].
        last_width = hidden_widths[-1] if hidden_widths else input_size
        layer_size = [input_size] + hidden_widths + [last_width]
        self.num_layers = len(layer_size) - 1
        self.layers = nn.ModuleList([
            nn.LSTM(n_in, n_out, num_layers = 1, batch_first = True)
            for n_in, n_out in zip(layer_size[:-1], layer_size[1:])
        ])

        # Learned projection from the last LSTM width to the output feature count
        self.dense_matrix = nn.Parameter(torch.rand((layer_size[-1], latent_size), dtype = torch.float), requires_grad = True)

    def forward(self, x, seq_len):
        """Repeat latent vector ``x`` for ``seq_len`` steps and decode to ``(seq_len, latent_size)``."""
        x = x.repeat(seq_len, 1).unsqueeze(0)
        for i, layer in enumerate(self.layers):
            x, (h_n, c_n) = layer(x)

            if self.hidden_activation is not None and i < self.num_layers - 1:
                x = self.hidden_activation(x)

        # Drop only the batch axis: a bare squeeze() would also drop a sequence or
        # feature axis of length 1 and hand torch.mm a 1-D tensor.
        return torch.mm(x.squeeze(0), self.dense_matrix)


class LSTM_AE(nn.Module):
    """LSTM autoencoder: ``EncoderRNN`` to a latent vector, ``DecoderRNN`` back to a sequence.

    ``forward`` accepts a ``(seq_len, input_size)`` tensor and returns a
    tensor of the same shape. A 1-D tensor of length ``input_size`` is treated
    as a sequence with a single time step and a 1-D tensor is returned.
    """

    def __init__(self, input_size, latent_size, hidden_size = [], hidden_activation = nn.Sigmoid(), latent_activation = nn.Tanh()):
        super().__init__()
        # Private copy: avoids aliasing the shared default list and accepts any sequence (e.g. a tuple)
        hidden_size = list(hidden_size)
        self.input_size = input_size
        self.latent_size = latent_size
        self.hidden_size = hidden_size

        self.hidden_activation = hidden_activation
        self.latent_activation = latent_activation

        self.encoder = EncoderRNN(input_size, latent_size, hidden_size, hidden_activation, latent_activation)
        # The decoder mirrors the encoder, so the hidden widths are reversed
        self.decoder = DecoderRNN(latent_size, input_size, hidden_size[::-1], hidden_activation, latent_activation)

    def forward(self, x):
        """Reconstruct ``x``; the output has the same shape as the input."""
        # The encoder reads a 1-D input as ONE time step with len(x) features, so it
        # must also be decoded as one step. Taking seq_len from the 1-D shape would
        # yield a (len(x), len(x)) output.
        single_step = x.dim() == 1
        if single_step:
            x = x.unsqueeze(0)
        seq_len = x.shape[0]
        latent = self.encoder(x)
        reconstruction = self.decoder(latent, seq_len)
        return reconstruction.squeeze(0) if single_step else reconstruction


In [None]:
# Model Initialization

input_size = trainingData.shape[1]
hidden_size_1, hidden_size_2, hidden_size_3, hidden_size_4 = 128, 64, 32, 16
latent_size = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# AElinear = AE(input_size, hidden_size_1, hidden_size_2, hidden_size_3, hidden_size_4, latent_size).to(device)
# AEFC = nn.Sequential(*[hidden_layer_1, hidden_layer_2, hidden_layer_3, hidden_layer_4, hidden_layer_5, hidden_layer_6, hidden_layer_7, hidden_layer_8, hidden_layer_9, hidden_layer_10, hidden_layer_10, hidden_layer_11, hidden_layer_12])
AE_LSTM = LSTM_AE(input_size, latent_size, [hidden_size_1, hidden_size_2, hidden_size_3, hidden_size_4]).to(device)
lr = 1e-3
# nn.Module has no .copy() method (the call raised AttributeError); train the instance directly.
model = AE_LSTM
# reduction="sum" is the supported spelling of the deprecated size_average=False
criterion = nn.MSELoss(reduction = "sum")
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

In [None]:
# Training the model
num_epochs = 1000
for epoch in tqdm(range(num_epochs)):
    model.train()
    # Reduces learning rate every 50 epochs
    if not epoch % 50:
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr * (0.993 ** epoch)
    # Visit the regions in a fresh random order each epoch. Shuffling the rows of
    # trainingData had no effect, because rows are looked up by label below.
    for region in np.random.permutation(trainingRegions):
        data = torch.tensor(trainingData.loc[region].values).float()
        data = data.to(device)

        # Weeks that actually have a value; only these can serve as targets
        observed = ~torch.isnan(data)
        if not observed.any():
            continue

        # Hide a random subset of weeks, at the rate at which the test region is missing.
        # Hidden and genuinely missing weeks are fed as 0: a NaN input would turn every
        # output, the loss and all gradients into NaN.
        hidden = (torch.rand_like(data) < masking_rate) | ~observed
        masked_data = data.masked_fill(hidden, 0.0)

        # Forward pass
        output = model(masked_data)

        # Reconstruction loss against the true values of every observed week,
        # including the ones that were hidden from the model. Indexing the last
        # axis keeps the selection aligned with the week positions.
        loss = criterion(output[..., observed], data[observed])

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    # Reports the loss of the last region visited in this epoch
    if (epoch + 1) % 50 == 0:
        print(f"Epoch :{epoch + 1}, Loss: {loss.item():.4f}")


In [None]:
# Plot the predicted and actual data
model.eval()
# Missing weeks are fed to the model as 0
inputs = torch.tensor(testingData[sym].fillna(0).values).float().to(device)
# Inference only: no autograd graph is needed
with torch.no_grad():
    output = model(inputs)
# .cpu() is required before .numpy() when the model lives on a CUDA device
output = output.cpu().numpy()
assert output.shape == (len(testingData),), f"expected one prediction per week, got shape {output.shape}"

testingData['predicted'] = output

# Substituting non nan values with actual values
observed = testingData[sym].notna()
testingData.loc[observed, 'predicted'] = testingData.loc[observed, sym]

# Plain assignment: fillna(inplace=True) on a selected column is chained assignment
# and is not guaranteed to update the frame.
testingData[sym] = testingData[sym].fillna(testingData[sym].mean())

# plt.subplots returns (figure, axes) in that order
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=testingData, x="date", y=sym, label="Actual Data", markers=True, ax=ax)
sns.lineplot(data=testingData, x="date", y='predicted', label="Predicted Data", markers=True, ax=ax)
ax.set_title(f"{sym}: actual vs. imputed values")
# Show the legend so the two lines can be told apart
ax.legend()
plt.show()

from sklearn.metrics import mean_squared_error
testingData = testingData.dropna()
# NOTE(review): at this point missing actual values have been replaced by the column
# mean and observed predictions by the actual values, so this compares the imputed
# values with the mean - confirm this is the intended metric.
print(mean_squared_error(testingData[sym], testingData['predicted']))

### Latent Learning Two Cents

There are a lot of considerations we should keep in mind here.

1. Lack of training data 
2. Handling of masking and what we need to optimize
    > Elaboration on this: We need to predict missing data while my model is taking in entire masked data and then predicting actual data. 
    > Need help on this as to how this will work. 
3. I have used a Linear Autoencoder and a Fully Connected Autoencoder, neither of which worked as expected. The Fully Connected Autoencoder has better accuracy, but that can be attributed to the fact that it has more layers. The gain in accuracy was marginal.
4. Need to implement RNN techniques and most of it involves looking at the Attention Layer and understanding how it would help us. 
5. Masking is done with 0. We can preprocess the data with a varying masking rate and then impute it with the `ffill` and `bfill` methods.

The imputation is now a two-fronted problem: the comparison of the approaches above, and using an LSTM to predict the missing data.

### Trend Visualization 

Let's see if the data joining worked and if we need to make adjustments for that in the early steps.

In [None]:
# Plot cough, fever, hypoxemia symptoms for Georgia dataset
searchsymptoms = ["cough", "fever", "hypoxemia"]

# Load Georgia dataset; the context manager closes the file handle
with open("./datasets/weekly/Georgia/dataset.pkl", "rb") as f:
    df = pkl.load(f)

# Find columns whose name matches any of the search terms (case-insensitive regex search)
symptoms = [col for col in df.columns if any(re.search(search, col, re.IGNORECASE) for search in searchsymptoms)]

# Plot the symptoms
# plt.subplots returns (figure, axes) in that order
fig, ax = plt.subplots(figsize=(20, 10))
for symptom in symptoms:
    ax.plot(df["date"], df[symptom], label=symptom)
ax.set_title("Weekly symptom trends for Georgia")
# The labels are only visible once a legend is drawn
if symptoms:
    ax.legend()
plt.show()

## Canonical Correlation Analysis

### CCA

CCA is a multivariate analysis technique that is used to find linear relationships between two sets of variables. It is a generalization of the Pearson correlation coefficient, which measures the linear relationship between two individual variables.


In [None]:
if kaggle:
    df = pd.read_csv("../input/cdc-covid-tracker-dataset-for-us/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
else:
    df = pd.read_csv("./datasets/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")

# Parse the date once on the full frame, so no per-state slice has to be written to
df['date'] = pd.to_datetime(df['submission_date'])

# Stratify the data by state
dfs = [df[df['state'] == region] for region in df['state'].unique()]

# Aggregate each state's data by week and keep the result, keyed by state.
# The earlier loop rebound its loop variable and discarded every aggregate.
# numeric_only=True: text columns cannot be summed meaningfully.
# NOTE(review): summing is only meaningful for per-day counts; if the file also
# contains cumulative totals they should not be summed - confirm per column.
covidWeekly = {
    state_df['state'].iloc[0]: state_df.resample('W', on='date').sum(numeric_only=True)
    for state_df in dfs
}
# TODO: select the columns we want


In [None]:
# Correlation analysis of the symptoms 