In [1]:
from data import loader
from helpers import heatmap
import pdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import matplotlib as mpl

In [4]:
mpl.rcParams['figure.dpi'] = 1000
d1 = loader.get_global_case_and_deaths_time_series_data()
continent_list = loader.get_available_and_supported_continents()  # In case you want to see continents supported to pass into the call above
# d = loader.get_united_states_case_and_death_time_series_data(county=True)  # True if you want State + County, and False if you want only State and not county level.
print(continent_list)

['Africa', 'Asia', 'Europe', 'NorthAmerica', 'SouthAmerica']


#### Helper functions

In [3]:
def plot_sim_scores(sim_matrix, terr_names, data_type, continent="Africa"):
    """
    Plot the similarity scores between given country
    and save them to "results/similarity scores/"
    """
    if len(terr_names) > 35:
        label_size = 5
    else:
        label_size = 6

    plt.imshow(sim_matrix, vmin=0, vmax=1)
    plt.colorbar()

    ax = plt.gca()

    ax.set_xticks([i for i in range(len(terr_names))])
    ax.set_xticklabels(terr_names, Rotation=90)

    ax.set_yticks([i for i in range(len(terr_names))])
    ax.set_yticklabels(terr_names)
    ax.tick_params(axis='both', which='major', labelsize=label_size)
    plt.title(data_type + " cosine similarity near " + terr_names[0])
    plt.savefig("results/similarity scores/" + continent + "/" + data_type + "_" + terr_names[0] + ".png")
    plt.clf()

In [6]:
def compute_cos_scores_with_neighbors(continent):
    
    d2 = loader.get_continent_specific_case_and_deaths_time_series_data(continent=continent)

    # Get countries and territories names in the continent
    with open("data/territory_names/" + continent + "_countries.txt", "r") as f:
        country_names = [line.rstrip() for line in f]

    neighbors = pd.read_csv("data/neighbor_map/neighbors_world.csv")

    data_names = ["Case Count", "Death Count"]

    # For each country, generate the plot in the form of country - neighbors - casecount 0
    for df_idx, df in enumerate(d2):
        total_days = np.array(["Day" in x for x in df.columns]).sum()
        columns = ["Day " + str(day) for day in range(total_days)]
        for base_country in df["Admin0"]:
            # Create a matrix to store processed data [country + neighbors, columns]
            if type(neighbors.loc[neighbors["Country or territory"]==base_country]["neighbor list"].tolist()[0]) is str:
                list_of_neighbors = neighbors.loc[neighbors["Country or territory"]==base_country]["neighbor list"].tolist()[0].split(",")
            else:
                continue
            list_of_neighbors.insert(0, base_country)
            # Remove the neighbors not in the list and leading white space
            actual_list = []
            for idx in range(len(list_of_neighbors)):
                list_of_neighbors[idx] = list_of_neighbors[idx].lstrip()
                if list_of_neighbors[idx] in country_names:
                    actual_list.append(list_of_neighbors[idx])
            list_of_neighbors = actual_list

            processed_count = np.zeros([len(list_of_neighbors) + 1, len(columns)])
            for ctry_idx, country in enumerate(list_of_neighbors):
                for idx, col in enumerate(columns):
                    processed_count[ctry_idx, idx] = df[df["Admin0"]==country][col].values

            sim_matrix = np.zeros([len(list_of_neighbors), len(list_of_neighbors)])
            for i in range(len(list_of_neighbors)):
                for j in range(len(list_of_neighbors)):
                    sim_matrix[i, j] = processed_count[i, :].dot(processed_count[j, :])/ (np.linalg.norm(processed_count[i, :]) * np.linalg.norm(processed_count[j, :]))

            plot_sim_scores(sim_matrix, list_of_neighbors, data_names[df_idx], continent)

In [13]:
def compute_cos_scores_by_bucket(bucket_length):
    
    d1 = loader.get_global_case_and_deaths_time_series_data()
    data_names = ["Case Count", "Death Count"]

    # For each country, generate the plot in the form of country - neighbors - casecount 0
    for df_idx, df in enumerate(d1):
        # Begin with 3-22-20 instead of 1-22-20
        total_days = np.array(["Day" in x for x in df.columns]).sum()
        columns = ["Day " + str(day) for day in range(60, total_days)]
        
        columns_bucket = ["Bucket " + str(bucket) for bucket in range(int((total_days-60)/bucket_length) + 2)]
        groups = []
        for bucket in range(len(columns_bucket)):
            groups.append(["Day " + str(day) for day in range(bucket * bucket_length, (bucket + 1) * bucket_length)])
        
        # Compute the bucket values
        for bucket_idx, group in enumerate(groups):
            df[columns_bucket[bucket_idx]] = df[group].sum(axis=1)
        
        # For each country, create a corresponding time series vector
        buckets = df[columns_bucket].to_numpy() # num_countries * num_buckets
        country_list = df["Admin0"].to_list()

        cosine_sims = {}
        
        # Compute the cosine similarities for each country
        for country_idx, country in enumerate(country_list):
            country_cos_sim = []
            for target_country_idx, target_country in enumerate(country_list):
                country_cos_sim.append(buckets[country_idx].dot(buckets[target_country_idx]) / (np.linalg.norm(buckets[country_idx]) * np.linalg.norm(buckets[target_country_idx])))
            cosine_sims[country] = country_cos_sim
    return pd.DataFrame(cosine_sims, columns=country_list)
print(compute_cos_scores_by_bucket(7))

  country_cos_sim.append(buckets[country_idx].dot(buckets[target_country_idx]) / (np.linalg.norm(buckets[country_idx]) * np.linalg.norm(buckets[target_country_idx])))


     Afghanistan   Albania   Algeria   Andorra    Angola  Antigua and Barbuda  \
0       1.000000  0.179438  0.539415  0.105299  0.459774             0.064464   
1       0.179438  1.000000  0.471871  0.380684  0.494683             0.551917   
2       0.539415  0.471871  1.000000  0.394163  0.749506             0.289432   
3       0.105299  0.380684  0.394163  1.000000  0.205284             0.121173   
4       0.459774  0.494683  0.749506  0.205284  1.000000             0.666442   
..           ...       ...       ...       ...       ...                  ...   
189     0.187942  0.571285  0.430138  0.714249  0.283457             0.218434   
190     0.280130  0.602343  0.499637  0.668499  0.389094             0.279676   
191     0.012585  0.019601  0.097015  0.233630  0.006909             0.024150   
192     0.392913  0.269086  0.736357  0.060959  0.855290             0.535567   
193     0.285205  0.812115  0.638423  0.605300  0.599942             0.484910   

     Argentina   Armenia   