In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from EDAModule.Initialize import *
from EDAModule.State import State
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
rng = np.random.default_rng(789)

In [3]:
# regions = initialize()

In [4]:
# states = [State(region) for region in regions]

In [None]:
# # Prepping the dataset
# from multiprocessing import Process

# def task(state):
#     state.correlation_analysis()
#     # Make the symptom column a tuple with the symptom name and state name
#     # state.results['symptom'] = state.results['symptom'].apply(lambda x: (x, state.name))
#     # Easier to have two columns (symptom, state) rather than a single tuple column
#     state.results['state'] = state.name

# processes = []
# for state in states:
#     processes.append(Process(target=task, args=(state,)))

# for process in processes:
#     process.start()

# for process in processes:
#     process.join()

In [None]:
# # Somehow the results are not being saved to the state objects
# # (presumably because each Process works on its own copy of the state),
# # so we have to load them from the pickle files instead
# for state in states:
#     state.results = pd.read_pickle(f"datasets/features/{state.name}/correlation_features.pkl")
#     state.results['state'] = state.name

# # Concatenate the results
# results = pd.concat([state.results for state in states], ignore_index=True)
# pkl.dump(results, open("datasets/allstates_correlation_features.pkl", "wb"))

In [None]:
# results = pkl.load(open("datasets/allstates_correlation_features.pkl", "rb"))
# results.to_csv("datasets/allstates_correlation_features.csv", index=False)

In [None]:
results = pd.read_csv('datasets/allstates_correlation_features.csv')

# Goal: check whether the features are conserved across symptoms and regions
backup = []
# NOTE(review): 'grangerCasualityFtest' is spelled differently from 'grangerCausalityPVal'
# used below -- confirm the spelling against the CSV header.
features = ['grangerCasualityFtest', 'abscorrelation', 'ccfauc', 'ccf_lag1','mmcorrelation', 'time_lag']

# Standardize each feature column (z-score: subtract the column mean, divide by the column std).
# The resulting values are NOT confined to [0, 1].
feature_values = results[features]
results[features] = (feature_values - feature_values.mean()) / feature_values.std()

# Flag symptoms that have a standardized abscorrelation below 0 in at least 50 rows.
# NOTE(review): because the column was standardized above, "< 0" means "below the
# overall mean", not "negative raw correlation" -- confirm this is the intent.
low_correlation = results['abscorrelation'] < 0
mask = results.loc[low_correlation, 'symptom'].value_counts() >= 50
remove_symptoms = mask[mask].index

# Also flag symptoms whose Granger-causality p-value exceeds 0.05 in at least 50 rows
not_significant = results['grangerCausalityPVal'] > 0.05
mask = results.loc[not_significant, 'symptom'].value_counts() >= 50
remove_symptoms = remove_symptoms.append(mask[mask].index)

# Drop every flagged symptom and renumber the rows from 0
keep_rows = ~results['symptom'].isin(remove_symptoms)
results = results[keep_rows].reset_index(drop=True)

In [None]:
# For each feature, draw a 2D heatmap of that feature with symptoms as rows and states as columns
for feature in features:

    # One cell per (symptom, state) pair; pivot_table averages duplicates by default
    pivoted = results.pivot_table(index='symptom', columns='state', values=feature)

    # Order the symptoms (rows) by their mean value across states, highest first
    symptom_order = pivoted.mean(axis=1).sort_values(ascending=False).index
    pivoted = pivoted.reindex(symptom_order)

    # Optional: order the states (columns) by their mean value across symptoms
    # pivoted = pivoted.reindex(pivoted.mean(axis=0).sort_values(ascending=False).index, axis=1)

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(pivoted, cmap='viridis', ax=ax)
    ax.set_title(feature)
    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

OK, there is a lot to deal with. The heatmaps give a rough picture of the data's inherent structure.
Based on that, a few useful observations:
1. `grangerCausalityPVal` distorts the picture badly, and I feel it should be removed from the clustering altogether. We need to explore what other metrics can be added.
2. `mmcorrelation`, `ccf_lag1`, `ccfauc`: the distribution is largely driven by the regions. This is a red flag, because ideally the symptoms should be the causal factor, yet here their correlation with the new cases depends on the region. Not all is lost, though: there is still some conservation across the symptoms. There are also visible groups of symptoms that could be clustered together and that do not perform at all. All three metrics show this to varying degrees.
3. `abscorrelation`: the distribution depends only on the symptoms. That has to do with the normalization technique applied in the correlation function. I wish there were a way to give this metric more weight in the clustering, but I don't know how I would do that.
4. `time_lag` is absolute garbage. It shows some patterns of conservation across the symptoms, but then again it is a metric derived from the cross-correlation function. Maybe we can include it.
5. `grangerCausalityFtest` was the better choice for measuring causality. Its distribution is poor, with very large outliers, and it does not depend on the regions.

In [None]:
# Scatter matrix (plotly) of the feature columns, coloured by Granger-causality significance.
# grangerselected = -log(p-value): the smaller the p-value, the larger the value.
# NOTE(review): a p-value of exactly 0 maps to +inf here.
results['grangerselected'] = -np.log(results['grangerCausalityPVal'])

# `results` has no 'x'/'y' columns at this point in the notebook (those names are only
# used for the UMAP embedding built in a later cell), so the pairwise plot is drawn
# over the feature columns instead.
# NOTE(review): confirm this is the intended set of dimensions.
fig = px.scatter_matrix(results, dimensions=features, color='grangerselected', height=800, width=800)
fig.show()

In [7]:
# results = pd.read_csv("datasets/allstates_correlation_features.csv")
# df = {'state': [], 'grangersymptoms':[], 'correlationsymptoms':[]}
# number_of_symptoms = 20
# for region in results['state'].unique():
#     df['state'].append(region)
#     df['grangersymptoms'].append(results[results['state'] == region].sort_values(by='grangerCausalityPVal', ascending=True).head(number_of_symptoms)['symptom'].values)
#     df['correlationsymptoms'].append(results[results['state'] == region].sort_values(by='abscorrelation',  ascending=True).head(number_of_symptoms)['symptom'].values)

# df = pd.DataFrame(df)
# df.head()

Unnamed: 0,state,grangersymptoms,correlationsymptoms
0,AL,"[symptom:Eye pain, symptom:Cardiac arrest, sym...","[symptom:Rectal pain, symptom:Uterine contract..."
1,AK,"[symptom:Low-grade fever, symptom:Angina pecto...","[symptom:Photodermatitis, symptom:Shallow brea..."
2,AZ,"[symptom:Eye pain, symptom:Hypoxemia, symptom:...","[symptom:Angular cheilitis, symptom:Amenorrhea..."
3,AR,"[symptom:Throat irritation, symptom:Eye pain, ...","[symptom:Dyspareunia, symptom:Night terror, sy..."
4,CA,"[symptom:Eye pain, symptom:Throat irritation, ...","[symptom:Neck pain, symptom:Myocardial infarct..."


In [None]:
from umap import UMAP

# Names of the two embedding dimensions
coordinates = ['x', 'y']

# NOTE(review): n_neighbors is set to the total number of rows in `results`;
# confirm this is intended (UMAP expects it to be smaller than the dataset size).
nn = len(results)
umap = UMAP(n_neighbors=nn, min_dist=0, n_components=2, random_state=789, n_epochs=5000)

# Project the standardized feature columns down to two dimensions
embedding = umap.fit_transform(results[features])
embedding = pd.DataFrame(embedding, columns=coordinates)

# Carry the identifying / colouring columns over from `results` (rows align on the
# 0..n-1 index). 'grangerselected' is the -log(p-value) column added to `results`
# in the scatter-matrix cell; the previous spelling 'grangerselcted' raised a KeyError.
for column in ['state', 'symptom', 'grangerCausalityPVal', 'abscorrelation', 'grangerselected']:
    embedding[column] = results[column]

In [None]:
# OPTICS clustering, run twice with the same settings
from sklearn.cluster import OPTICS

optics_settings = dict(min_samples=20, n_jobs=-1)

# 1) Cluster on the two embedding coordinates and store the labels on `embedding`
optics1 = OPTICS(**optics_settings).fit(embedding[coordinates])
embedding['optics'] = optics1.labels_

# 2) Cluster on the feature columns and store the labels on `results`
optics2 = OPTICS(**optics_settings).fit(results[features])
results['optics'] = optics2.labels_