In [7]:
import folium
import pandas as pd
import pickle
import os
import h3
from label_samples_time_hexa import label_samples
from vizualize import draw_hexagons, draw_migration_for_time_bin, draw_hexagons_with_values, draw_barriers
from func import rename_time_bins, calc_dist_time_bin, normalize_distances, get_time_bin_hexagons, get_min_max_dist

In [8]:
matrix = pd.read_pickle("/home/jaro/BINP29/Project_Eran/1_dist_matrix/eucl_dist.pkl")

In [9]:
time_bins = 18
resolution = 3
same_age_range = True
df = label_samples("/home/jaro/BINP29/Project_Eran/", time_bins, resolution, same_age_range)
time_bins = rename_time_bins(df)
time_bins_hexagons = get_time_bin_hexagons(df)

In [37]:
import numpy as np
# function that calculates the average distance between two groups of samples
def calc_avg_dist(samples_hex1, samples_hex2, dist_matrix):
    return np.mean(dist_matrix.loc[samples_hex1, samples_hex2].values.flatten())

def calc_neighbor_dist(hexagons, dist_matrix, time_bin_df, hex_col, k_neighbors = 1, allow_k_distance=False, scale_by_distance=False):
    # get the samples in each hexagon
    samples_in_hex = time_bin_df.groupby(hex_col)['ID'].apply(list).to_dict()
    # create a list of all values in samples_in_hex
    all_samples = [sample for samples in samples_in_hex.values() for sample in samples]
    # create a submatrix of the distance matrix for the samples in the hexagons
    dist_matrix = dist_matrix.loc[all_samples, all_samples]
    # initialize the dictionary to store the average distances between neighboring hexagons
    averages = {}
    # get the set of hexagons
    hexagons_set = set(hexagons)
    # initialize the cache for the average distances
    avg_dist_cache = {}
    # initialize the cache for the k-ring distances
    k_ring_distances_cache = {}

    for hexagon in hexagons:
        neighbors = dict()
        # get the neighbors of the hexagon in k distance
        for k in range(1, k_neighbors+1):
            # if the neighbors have not been calculated yet, calculate them
            if (hexagon, k) not in k_ring_distances_cache:
                k_ring_distances_cache[(hexagon, k)] = set(h for h in h3.k_ring_distances(hexagon, k)[k] if h in hexagons_set)
            neighbors[k] = k_ring_distances_cache[(hexagon, k)]
        # if there are no neighbors in k distance, and the user allows for more than k distance, get the neighbors in 20 distance
        if allow_k_distance and [len(neighbors[k]) for k in neighbors].count(0) == len(neighbors):
            k = k_neighbors + 1
            while all(len(neighbors[k]) == 0 for k in neighbors) and k < 20:
                # if the neighbors have not been calculated yet, calculate them
                if (hexagon, k) not in k_ring_distances_cache:
                    k_ring_distances_cache[(hexagon, k)] = set(h for h in h3.k_ring_distances(hexagon, k)[k] if h in hexagons_set)
                neighbors[k] = k_ring_distances_cache[(hexagon, k)]
                k += 1
                
        # calculate the average distance between the hexagon and its neighbors
        for k in neighbors.keys():
            for neighbor in neighbors[k]:
                Ids_in_hexagon = samples_in_hex.get(hexagon, [])
                Ids_in_neighbor = samples_in_hex.get(neighbor, [])
                # get the pair of hexagons
                pair = frozenset([hexagon, neighbor])

                # check if the average distance has already been calculated
                if pair not in avg_dist_cache:
                    # calculate the average distance between the hexagon and its neighbor
                    distance = calc_avg_dist(Ids_in_hexagon, Ids_in_neighbor, dist_matrix)
                    # scale the distance by the distance between the hexagon and its neighbor
                    if scale_by_distance:
                        distance = distance / (0.9 + k/10)
                    avg_dist_cache[pair] = distance
                    
                averages[pair] = round(avg_dist_cache[pair], 2)

    return averages

# this function calculates the average distance between the each hexagon and its neighbors for each time bin
def calc_dist_time_bin(df, dist_matrix=None, k_neighbors=1, allow_k_distance=False, scale_by_distance=False):
    
    # get column name for the hexagons (it should be the only column with 'hex' in the name)
    hex_col = str(df.columns[df.columns.str.contains('hex')][0])
    
    # Convert the 'AgeGroup' column values to tuples of integers representing the start and end years,
    df['AgeGroupTuple'] = df['AgeGroup'].apply(lambda x: tuple(map(int, x.split('-'))))
    
    # Sort the unique age group tuples to process them in a chronological order.
    time_bins = sorted(df['AgeGroupTuple'].unique())
    averages = {}
    # Iterate over each time bin.
    for time_bin in time_bins:
        # Format the current time bin as a string for labeling purposes.
        bin_label = rename_times(time_bin)
        
        # get subset of the data frame for that time bin
        time_bin_df = df[df['AgeGroupTuple'] == time_bin]

        # get all unique hexagons for that time bin
        hexagons = time_bin_df[hex_col].unique()
        
        # Calculate the average distance for each hexagon to its neighbors within the current time bin.
        average_distances = calc_neighbor_dist(hexagons, dist_matrix, time_bin_df, hex_col, k_neighbors, allow_k_distance, scale_by_distance)

        # Append the calculated average distances to the dictionary, using the time bin label as the key.
        averages.update({bin_label: average_distances})

    # Return the dictionary with the average distances between neighboring hexagons for each time bin.
    return averages

# function that renames the time bins into a more readable format
def rename_time_bins(df):
    # Convert the 'AgeGroup' column values to tuples of integers representing the start and end years,
    df['AgeGroupTuple'] = df['AgeGroup'].apply(lambda x: tuple(map(int, x.split('-'))))
    # Sort the unique age group tuples to process them in a chronological order.
    time_bins = sorted(df['AgeGroupTuple'].unique())
    renamed_bins = []
    for time_bin in time_bins:
        renamed_bins.append(rename_times(time_bin))
    return renamed_bins

# function that renames a time bin into a more readable format
def rename_times(time_bin):
    renamed_years = []
    # get years from time_bin
    for year in time_bin:
        # the time in the dataset is measured from 1950
        if int(year) < 1950:
            year = 1950 - int(year)
            year = str(year) + " AD"
        else:
            year = int(year) - 1950
            year = str(year) + " BC"
        renamed_years.append(year)
    return(" - ".join(renamed_years))  # Append to renamed_bins

In [38]:
# calculate the average distances between neighboring hexagons for each time bin with the given parameters
time_bins_dist = calc_dist_time_bin(df, matrix,
                                        10,
                                        True,
                                        False)

In [5]:
selected_time_bin = time_bins[0]
hexagons = time_bins_hexagons[selected_time_bin]
time_bin = time_bins_dist[selected_time_bin]
time_bin = normalize_distances(time_bin)
threshold = 0.4

In [6]:
from collections import defaultdict
# directory to save the barrier lines and hexagons with their distances
barrier_hex = defaultdict(list)
barrier_lines = {}
# dictionary to save hexagons and their direct neighbor distances to check if their was a migration barrier
hex_dist_to_direct_neighbors = defaultdict(list)
# loop over all pairs of hexagons in the time bin
for pair in time_bin:
    distance = time_bin[pair]
    pair = list(pair)
    # check if the pair are direct neighbors
    if pair[0] in h3.k_ring_distances(pair[1], 1)[1]:
        # get the line between the two hexagons
        boundary1 = h3.h3_to_geo_boundary(pair[0])
        boundary2 = h3.h3_to_geo_boundary(pair[1])
        # get the pair of dots that the two hexagons share
        shared_boundary = frozenset([x for x in boundary1 if x in boundary2])
        # add the line and its distance to the dictionary
        barrier_lines[shared_boundary] = distance
        hex_dist_to_direct_neighbors[pair[0]].append(distance)
        hex_dist_to_direct_neighbors[pair[1]].append(distance)
    # if the hexagons are further appart
    else:
        # try to draw a line between the two hexagons
        try:
            # if found add the distance to the dictionary
            line = h3.h3_line(pair[0], pair[1])[1:-1]
            for hex in line:
                if hex not in hexagons:
                    barrier_hex[hex].append(distance)
        except:
            continue


# Calculate the average distance for each hexagon and round it to 2 decimal places
barrier_hex = {hex: round(sum(distances) / len(distances), 2) for hex, distances in barrier_hex.items()}
    
# Create a list of isolated hexagons
# add isolated hexagons that have direct neighbors
isolated_hex = [hex for hex, distances in hex_dist_to_direct_neighbors.items() if all(x >= threshold for x in distances)]
# add isolated hexagons that have no direct neighbors
isolated_hex += [hex for hex in hexagons if hex not in hex_dist_to_direct_neighbors and all(barrier_hex[n] >= threshold for n in h3.k_ring_distances(hex, 1)[1] if n in barrier_hex)]


print(f"Number of isolated hexagons: {len(isolated_hex)}")
print(f"Number of barrier lines: {len(barrier_lines)}")
print(f"Number of barrier hexagons: {len(barrier_hex)}")

Number of isolated hexagons: 28
Number of barrier lines: 165
Number of barrier hexagons: 1190


In [7]:
# function that calculates the average distance between two groups of samples
def calc_avg_dist(samples_hex1, samples_hex2, dist_matrix):
    return dist_matrix.loc[samples_hex1, samples_hex2].values.flatten().mean()

def find_closest_population(time_bin_df, isolated_hex, dist_matrix, threshold):
    # get column name for the hexagons (it should be the only column with 'hex' in the name)
    hex_col = time_bin_df.filter(like='hex').columns[0]
    # get all unique hexagons from the dataframe
    hexagons = time_bin_df[hex_col].unique()
    # get the samples in each hexagon
    samples_in_hex = time_bin_df.groupby(hex_col)['ID'].apply(list).to_dict()
    # empty dictrionary to hold the distances between the hexagons
    closest_populations = {}
    
    # loop over all isolated hexagons
    for iso in isolated_hex:
        # reset the min_dist and closest_hex
        closest_hex = None
        min_dist = threshold
        # check the distance to every hexagon in that time bin
        for hex in hexagons:
            # skip if the hexagon is the same as the isolated hexagon
            if hex == iso:
                continue
            
            Ids_in_hexagon = samples_in_hex.get(iso, [])
            Ids_in_neighbor = samples_in_hex.get(hex, [])
            # calculate the average distance between the hexagon and its neighbor
            distance = calc_avg_dist(Ids_in_hexagon, Ids_in_neighbor, dist_matrix)
            # check if the distance is lower than the current minimum distance
            if distance < min_dist:
                min_dist = distance
                closest_hex = hex
        # Add closest hexagon if distance is below threshold
        if closest_hex is not None:
            pair = frozenset([iso, closest_hex])
            closest_populations[pair] = round(min_dist, 2)
    
    return closest_populations

In [8]:
    # Convert the 'AgeGroup' column values to tuples of integers representing the start and end years,
    df['AgeGroupTuple'] = df['AgeGroup'].apply(lambda x: tuple(map(int, x.split('-'))))
    
    # Sort the unique age group tuples to process them in a chronological order.
    time_bins = sorted(df['AgeGroupTuple'].unique())
    time_bin_df = df[df['AgeGroupTuple'] == time_bins[0]]
    
    # get the closest population for each isolated hexagon
    closest_populations = find_closest_population(time_bin_df, isolated_hex, matrix, threshold)


In [9]:
# this function takes a time bin and a map and draws all neighboring lines for the hexagons in that time bin
def draw_migration_for_time_bin(time_bin, m, color="green"):
    # Loop through all pairs of hexagons in the time bin
    for pair, distance in time_bin.items():
        hex1, hex2 = pair
        
        # Get the midpoints of both hexagons
        midpoint1 = h3.h3_to_geo(hex1)
        midpoint2 = h3.h3_to_geo(hex2)
        
        # Check if the points are on opposite sides of the antimeridian
        if abs(midpoint1[1] - midpoint2[1]) > 180:
            midpoint1_adj = (midpoint1[0], midpoint1[1] - 360)
            midpoint2_adj = (midpoint2[0], midpoint2[1] + 360)
            lines = [[midpoint1_adj, midpoint2], [midpoint1, midpoint2_adj]]
        else:
            lines = [[midpoint1, midpoint2]]
        
        # Loop over all lines and draw them on the map
        for line in lines:
            polyline = folium.PolyLine(locations=line, color=color, weight=2)
            polyline.add_to(m)
    
    return m

In [10]:
# function that draws hexagons on a map
def draw_hexagons(hexagons, m=None, color='orange', zoom_start=1, value=None):
    # Create a map if it is not provided
    if m is None:
        m = folium.Map(location=(0.0, 0.0), tiles="Esri worldstreetmap", zoom_start=zoom_start)

    # function that splits a hexagon if it crosses the antimeridian
    def split_hexagon_if_needed(hexagon):
        boundary = h3.h3_to_geo_boundary(hexagon, geo_json=False)
        longitudes = [lon for lat, lon in boundary]

        # Check if the hexagon crosses the antimeridian
        if max(longitudes) - min(longitudes) > 180:
            boundary = list(boundary)
            first_hex = list()
            second_hex = list()
            # make get two hexagons from the original one
            for i in range(len(boundary)):
                if boundary[i][1] <= 0:
                    first_hex.append((boundary[i][0], boundary[i][1] + 360))
                    second_hex.append((boundary[i][0], boundary[i][1]))
                if boundary[i][1] > 0:
                    first_hex.append((boundary[i][0], boundary[i][1]))
                    second_hex.append((boundary[i][0], boundary[i][1] - 360))
            # return two tuples of coordinates of the two hexagons
            return [tuple(first_hex), tuple(second_hex)]
        else:
            # return the original hexagon
            return [boundary]

    # Plot hexagons
    for hexagon in hexagons:
        # split the hexagon if it crosses the antimeridian
        parts = split_hexagon_if_needed(hexagon)
        for part in parts:
            polygon = folium.Polygon(
                locations=part,
                weight=1,
                color=None,
                fill_color=color,
                fill_opacity=0.5,
                fill=True
            )
            if value:
                polygon.add_child(folium.Tooltip(value))
            
            polygon.add_to(m)
    return m

import matplotlib.colors as mcolors
import base64

def draw_hexagons_with_values(hex_dict, m=None, zoom_start=1, threshold=0.0):
    hexagons = hex_dict.keys()
    values = hex_dict.values()
    
    # create a color gradient to color the lines based on the normalized distance
    colors = [(1, 0.5, 0), (0, 0, 0.5)]  # Dark blue to orange
    cmap = mcolors.LinearSegmentedColormap.from_list("custom_darkblue_to_orange", colors)

    # write the values to the center of each hexagon
    for hexagon, value in zip(hexagons, values):
        if value < threshold:
            continue
        col = mcolors.to_hex(cmap(value))
        m = draw_hexagons([hexagon], m, color=col, zoom_start=zoom_start, value=value)
    return m

In [11]:
m = draw_hexagons(hexagons, color = "darkgreen")
m = draw_migration_for_time_bin(closest_populations, m)
m = draw_hexagons_with_values(barrier_hex, m, threshold = 0.0)
m = draw_barriers(barrier_lines, m)
m

In [17]:
# create two hexagons using h3
hex1 = h3.geo_to_h3(37.3615593, -122.0553238, 3)
hex2 = h3.geo_to_h3(36.3615593, -122.0553238, 3)
print(h3.distance(hex1, hex2))

hex1_dots = h3.h3_to_geo_boundary(hex1)
hex2_dots = h3.h3_to_geo_boundary(hex2)
# get the pair of dots that the two hexagons share
dot = [x for x in hex1_dots if x in hex2_dots]
dot
# m = draw_hexagons([hex1, hex2], color = 'blue')
# # draw the line between the two hexagons
# folium.PolyLine(dot, color = 'red').add_to(m)
# m




AttributeError: module 'h3' has no attribute 'distance'