In [2]:
from colour import Color
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydeck as pdk


def get_dataset(csv_name):
    dataset = pd.read_csv(csv_name, delimiter=";")
    dataset = dataset.drop_duplicates()

    # pick columns to use
    dataset["Latitude"] = dataset["Coordinates"].str.split(",").str[0].astype(float)
    dataset["Longitude"] = dataset["Coordinates"].str.split(",").str[1].astype(float)
    columns = [
        "Geoname ID",
        "Name",
        "Population",
        "Country name EN",
        "Latitude",
        "Longitude",
    ]
    dataset = dataset[columns]
    return dataset


def get_dist(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the earth in km
    dLat = np.deg2rad(lat2 - lat1)
    dLon = np.deg2rad(lon2 - lon1)
    a = np.sin(dLat / 2) * np.sin(dLat / 2) + np.cos(np.deg2rad(lat1)) * np.cos(
        np.deg2rad(lat2)
    ) * np.sin(dLon / 2) * np.sin(dLon / 2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    d = R * c  # Distance in km
    return d


def get_max_dist(dataset_item, dataset):
    lat1 = dataset_item["Latitude"]
    lon1 = dataset_item["Longitude"]
    city_max_dist = 0
    for _, row in dataset.iterrows():
        lat2 = row["Latitude"]
        lon2 = row["Longitude"]
        dist = get_dist(lat1, lon1, lat2, lon2)
        if dist > city_max_dist:
            city_max_dist = dist
    return city_max_dist


def get_city_item(name_or_id, dataset):
    def get_city_with_max_pop(subset):
        return subset[subset["Population"] == subset["Population"].max()].iloc[0]

    if isinstance(name_or_id, int):
        subset = dataset[dataset["Geoname ID"] == name_or_id]
    else:
        subset = dataset[dataset["Name"].str.contains(name_or_id) == True]
    return get_city_with_max_pop(subset)


def get_viability(city_1, city_2, dataset, dataset_statistics, A=1, B=1, verbose=False):
    min_pop, max_pop, avg_pop, min_dist, max_dist = (
        dataset_statistics["min_pop"],
        dataset_statistics["max_pop"],
        dataset_statistics["avg_pop"],
        dataset_statistics["min_dist"],
        dataset_statistics["max_dist"],
    )
    pop_1 = city_1["Population"]
    pop_2 = city_2["Population"]
    sum_pop = pop_1 + pop_2
    dist = get_dist(
        city_1["Latitude"], city_1["Longitude"], city_2["Latitude"], city_2["Longitude"]
    )
    normalized_pop = (sum_pop - min_pop) / (max_pop + avg_pop - min_pop)
    normalized_dist = (dist - min_dist) / (max_dist - min_dist)
    if verbose:
        print(sum_pop, normalized_pop, dist, normalized_dist)
    return A * normalized_pop - B * normalized_dist


def find_most_viable_city(city_name, dataset):
    city = get_city_item(city_name, dataset)
    max_viability = 0
    max_viable_city = None
    for _, row in dataset.iterrows():
        if row["Geoname ID"] != city["Geoname ID"]:
            viability = get_viability(city, row, dataset)
            if viability > max_viability:
                max_viability = viability
                max_viable_city = row
    return max_viable_city


def find_top_viable_cities(city_name, dataset, dataset_statistics, top_n=5):
    city = get_city_item(city_name, dataset)
    viabilities = []
    for _, row in dataset.iterrows():
        if row["Geoname ID"] != city["Geoname ID"]:
            viability = get_viability(city, row, dataset, dataset_statistics)
            viabilities.append((viability, row))
    viabilities.sort(key=lambda x: x[0], reverse=True)
    return viabilities[:top_n]


def get_viable_cities_paths(city_1_name, dataset, dataset_statistics, top_n=5):
    city_1 = get_city_item(city_1_name, dataset)
    most_viable_cities = find_top_viable_cities(
        city_1_name, dataset, dataset_statistics, top_n
    )
    paths_dict = []
    for city in most_viable_cities:
        city_2 = city[1]
        paths_dict.append(
            {
                "path": [
                    [city_1["Longitude"], city_1["Latitude"]],
                    [city_2["Longitude"], city_2["Latitude"]],
                ],
                "viability": city[0],
                "city_2": city_2["Name"],
            }
        )
    red = Color("red")
    colors = list(red.range_to(Color("green"), len(paths_dict)))
    for i, path in enumerate(paths_dict):
        path["color"] = colors[i].hex_l
    return pd.DataFrame(paths_dict)


# mvc = find_most_viable_city("Atlanta", dataset)

# find_top_viable_cities("Atlanta", dataset, 5)

dataset = get_dataset("geonames.csv")
print(dataset.head())
dataset_statistics = {
    "min_pop": dataset["Population"].min(),
    "max_pop": dataset["Population"].max(),
    "avg_pop": dataset["Population"].mean(),
    "min_dist": 0,
    "max_dist": get_max_dist(get_city_item("Atlanta", dataset), dataset),
}

df1 = get_viable_cities_paths("Atlanta", dataset, dataset_statistics, 5)
print(df1)


   Geoname ID         Name  Population Country name EN  Latitude  Longitude
0     2549076    Ezzhiliga        4211         Morocco  33.30000   -6.53000
1     2550985      Driouch       16096         Morocco  34.97705   -3.37902
2     2552615  Dar Bouazza      165295         Morocco  33.51535   -7.81677
3     2552886   Dar Chaoui        1401         Morocco  35.53770   -5.71742
4     2555157      Bouarfa       31499         Morocco  32.53379   -1.96209
                                           path  viability         city_2  \
0  [[-84.38798, 33.749], [-99.12766, 19.42847]]   0.452561    Mexico City   
1  [[-84.38798, 33.749], [-74.00597, 40.71427]]   0.348961  New York City   
2  [[-84.38798, 33.749], [121.45806, 31.22222]]   0.344624       Shanghai   
3   [[-84.38798, 33.749], [116.39723, 39.9075]]   0.235516        Beijing   
4     [[-84.38798, 33.749], [3.39467, 6.45407]]   0.193363          Lagos   

     color  
0  #ff0000  
1  #df7000  
2  #bfbf00  
3  #50a000  
4  #008000  


In [5]:
print(eval(df1["path"][0]))


TypeError: eval() arg 1 must be a string, bytes or code object