In [1]:
import folium
import pandas as pd
import pickle
import os
import h3
from label_samples_time_hexa import label_samples
from vizualize import *
from func import *

In [2]:
matrix = pd.read_pickle("/home/jaro/BINP29/Project_Eran/1_dist_matrix/eucl_dist.pkl")

In [3]:
time_bins = 11
resolution = 3
same_age_range = True
df = label_samples("/home/jaro/BINP29/Project_Eran/", time_bins, resolution, same_age_range)
time_bins = rename_time_bins(df)

In [4]:
time_bins_dist = calc_dist_time_bin(df, matrix)

In [5]:
selected_time_bin = time_bins[5]
time_bin = time_bins_dist[selected_time_bin]
time_bin, hexagons = get_hexagons(time_bin)
threshold = 0.4

In [6]:
def scale_distances(time_bin, exsiting_pred=None, resolution=3):
    """
    Scales genetic distances by the estimated genetic differences modeled by a LOESS function of the geographic distances.

    Parameters:
    - time_bin: A dictionary where keys are pairs of hexagons and values are genetic distances between them.

    Returns:
    - output: A dictionary where keys are pairs of hexagons and values are scaled genetic distances.
    """

    def get_km_distance(hex1, hex2=None, resolution=3):
        """
        Calculates the geographic distance in kilometers between two hexagons.

        Parameters:
        - hex1: H3 index of the first hexagon.
        - hex2: H3 index of the second hexagon.

        Returns:
        - The distance in kilometers between the two hexagons.
        """
        if hex2 is None:
            return 1281/(2.65**resolution)
        coord1 = h3.h3_to_geo(hex1)
        coord2 = h3.h3_to_geo(hex2)
        return haversine(coord1, coord2)

    # Calculate km distances between the hexagons
    km_time_bin = {pair: get_km_distance(*pair) for pair in time_bin}

    # Convert genetic and geographic distances to numpy arrays
    gen_distances = np.array(list(time_bin.values()))
    geo_distances = np.array(list(km_time_bin.values()))
    
    print(gen_distances.shape, geo_distances.shape)
    # if there is no existing prediction, create one
    if exsiting_pred is None:
        # Apply LOESS smoothing to the genetic distances based on geographic distances
        lowess = sm.nonparametric.lowess
        gen_distances_pred = lowess(gen_distances, geo_distances, frac=0.5)
    else:
        gen_distances_pred = exsiting_pred

    # Scale genetic distances by the predicted values from the LOESS model
    output = {}
    for pair in time_bin:
        km_distance = km_time_bin[pair]
        # if km_distance not in gen_distances_pred[:, 0]: find the one closest to it
        if km_distance not in gen_distances_pred[:, 0]:
            km_distance = gen_distances_pred[:, 0][np.argmin(np.abs(gen_distances_pred[:, 0] - km_distance))]
        gen_distance = time_bin[pair]
        gen_distance_pred = gen_distances_pred[gen_distances_pred[:, 0] == km_distance][:, 1][0]
        output[pair] = round(gen_distance / gen_distance_pred,2)

    return output, gen_distances_pred

In [7]:
time_bin = scale_distances(time_bin)

(413,) (413,)


In [31]:
time_bin

{frozenset({'831ecafffffffff', '833f29fffffffff'}): 1.4918836968795213,
 frozenset({'833f29fffffffff', '833f2bfffffffff'}): 0.39662032023205945,
 frozenset({'832db1fffffffff', '833f29fffffffff'}): 1.9677544444669102,
 frozenset({'831ed0fffffffff', '833f29fffffffff'}): 0.5735335921614918,
 frozenset({'831ed1fffffffff', '833f29fffffffff'}): 0.8407409744462725,
 frozenset({'831ecafffffffff', '831eeafffffffff'}): 2.49724319981189,
 frozenset({'831ecafffffffff', '832db1fffffffff'}): 2.3199852741459495,
 frozenset({'831ec0fffffffff', '831ecafffffffff'}): 2.522414677668337,
 frozenset({'831ecafffffffff', '831ed1fffffffff'}): 1.0314220816226642,
 frozenset({'831ecafffffffff', '831eebfffffffff'}): 2.2075653651218565,
 frozenset({'831ec9fffffffff', '831ecafffffffff'}): 1.461615711994722,
 frozenset({'831ec2fffffffff', '831ecafffffffff'}): 1.5354949469076573,
 frozenset({'8396a5fffffffff', '83a815fffffffff'}): 1.08999233781333,
 frozenset({'833f2bfffffffff', '8396a5fffffffff'}): 0.919311694321240

In [8]:
lines = get_distance_lines(time_bin)
m = draw_sample_hexagons(hexagons)
m = draw_barriers(lines, m)
m

In [38]:
threshold = 0.4
isolated_hex, barrier_lines, barrier_hex = get_isolated_hex_and_barriers(time_bin, hexagons, threshold, allowed_distance=10)

print(f"Number of isolated hexagons: {len(isolated_hex)}")
print(f"Number of barrier lines: {len(barrier_lines)}")
print(f"Number of barrier hexagons: {len(barrier_hex)}")

Number of isolated hexagons: 12
Number of barrier lines: 50
Number of barrier hexagons: 670


In [39]:
# get the index of the time bin of interest
time_bin_index = time_bins.index(time_bins[5])
# get the closest population for each isolated hexagon
closest_populations, new_isolated_hex = find_closest_population(df, time_bin_index, isolated_hex, matrix, threshold)
print(f"Number of isolated hexagons with no migration: {len(new_isolated_hex)}")

Number of isolated hexagons with no migration: 8


In [42]:
imputed_hex = impute_missing_hexagons(barrier_hex)

In [48]:
#m = draw_hexagons(new_isolated_hex, color = "red", m = m, opacity=0.7, value="No migration")
m = draw_hexagons_with_values(barrier_hex, m)
m = draw_hexagons_with_values(imputed_hex, m, imputed=True)
m = draw_sample_hexagons(hexagons, m)
m = draw_barriers(barrier_lines, m)
m = draw_migration_for_time_bin(closest_populations, m)
m