In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from colour import Color

In [59]:
def get_dataset(csv_name):
    """Load the semicolon-delimited city CSV and return the columns used downstream.

    Exact duplicate rows are dropped first. The textual "Coordinates" column
    ("<lat>,<lon>") is split into float "Latitude" and "Longitude" columns,
    and only the identifying, population and position columns are kept.

    Parameters
    ----------
    csv_name : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame with columns 'Geoname ID', 'Name', 'Population',
    'Country name EN', 'Latitude', 'Longitude' (original row index retained).
    """
    cities = pd.read_csv(csv_name, delimiter=";").drop_duplicates()

    # Split the coordinate text once and reuse both halves.
    coord_parts = cities["Coordinates"].str.split(",")

    kept_columns = ['Geoname ID', 'Name', 'Population', 'Country name EN', 'Latitude', 'Longitude']
    return cities.assign(
        Latitude=coord_parts.str[0].astype(float),
        Longitude=coord_parts.str[1].astype(float),
    )[kept_columns]

# Load the city table once into the module-level `dataset`; later cells read it
# for the population statistics, city lookups and viability rankings.
dataset = get_dataset('geonames.csv')
# Preview the first rows to confirm the expected columns were produced.
dataset.head()

Unnamed: 0,Geoname ID,Name,Population,Country name EN,Latitude,Longitude
0,2549076,Ezzhiliga,4211,Morocco,33.3,-6.53
1,2550985,Driouch,16096,Morocco,34.97705,-3.37902
2,2552615,Dar Bouazza,165295,Morocco,33.51535,-7.81677
3,2552886,Dar Chaoui,1401,Morocco,35.5377,-5.71742
4,2555157,Bouarfa,31499,Morocco,32.53379,-1.96209


In [60]:
# Population statistics over the whole table. get_viability reads these
# module-level names to normalise the summed population of a city pair.
min_pop = dataset["Population"].min()
max_pop = dataset["Population"].max()
avg_pop = dataset["Population"].mean()
# Lower bound of the distance scale used by get_viability (the upper bound,
# max_dist, is filled in by get_max_dist).
min_dist = 0

In [61]:
def get_dist(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points (haversine formula).

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in decimal degrees.
    lat2, lon2 : latitude / longitude of the second point, in decimal degrees.

    Only NumPy ufuncs are applied, so scalars and NumPy arrays (broadcast
    against each other) are both accepted.

    Returns
    -------
    Distance in km on a sphere of radius 6371 km.
    """
    R = 6371  # Radius of the earth in km
    dLat = np.deg2rad(lat2 - lat1)
    dLon = np.deg2rad(lon2 - lon1)
    sin_half_dlat = np.sin(dLat / 2)
    sin_half_dlon = np.sin(dLon / 2)
    a = sin_half_dlat * sin_half_dlat + np.cos(np.deg2rad(lat1)) * np.cos(np.deg2rad(lat2)) * (
        sin_half_dlon * sin_half_dlon)
    # Floating-point rounding can push `a` a hair outside [0, 1] for (near-)
    # antipodal points, which would make sqrt(1 - a) NaN. Clamp it back.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    d = R * c  # Distance in km
    return d


In [62]:
# Largest distance (km) seen so far by any get_max_dist call; get_viability
# uses it as the upper bound of its distance scale.
max_dist = 0

def get_max_dist(dataset_item, dataset):
    """Return the largest distance (km) from one city to any row of ``dataset``.

    Parameters
    ----------
    dataset_item : row-like object with "Latitude" and "Longitude" entries.
    dataset : pd.DataFrame with "Latitude" and "Longitude" columns.

    Returns
    -------
    The maximum distance found, or 0 when ``dataset`` is empty or no distance
    is greater than 0. Rows whose distance is NaN are ignored.

    Side effects
    ------------
    Raises the module-level ``max_dist`` to the returned value when that value
    is larger than the current ``max_dist``.
    """
    global max_dist
    # One vectorised call instead of a Python-level loop over iterrows();
    # get_dist only uses NumPy ufuncs, so it accepts whole columns.
    dists = np.asarray(
        get_dist(
            dataset_item["Latitude"],
            dataset_item["Longitude"],
            dataset["Latitude"].to_numpy(dtype=float),
            dataset["Longitude"].to_numpy(dtype=float),
        ),
        dtype=float,
    )
    valid = dists[~np.isnan(dists)]
    # Floor at 0 so an empty/all-NaN table yields 0, like the running-maximum loop did.
    city_max_dist = max(0, valid.max()) if valid.size else 0
    if city_max_dist > max_dist:
        max_dist = city_max_dist
    return city_max_dist

def get_city_item(name_or_id, dataset):
    """Look up a single city row by Geoname ID or by (partial) name.

    Parameters
    ----------
    name_or_id : int, NumPy integer, or str
        An integer (including NumPy integer scalars such as a value read back
        from the "Geoname ID" column) is matched exactly against "Geoname ID".
        Anything else is passed to ``Series.str.contains`` on "Name", so it is
        a substring match and, by pandas' default, treated as a regular
        expression.
    dataset : pd.DataFrame with "Geoname ID", "Name" and "Population" columns.

    Returns
    -------
    pd.Series: among the matching rows, the first one with the highest
    population.

    Raises
    ------
    IndexError
        If no row matches (or none of the matches has a usable population).
    """
    if isinstance(name_or_id, (int, np.integer)):
        subset = dataset[dataset["Geoname ID"] == name_or_id]
    else:
        # na=False: rows with a missing Name simply never match.
        subset = dataset[dataset["Name"].str.contains(name_or_id, na=False)]

    most_populous = subset[subset["Population"] == subset["Population"].max()]
    if most_populous.empty:
        # Same exception type as the bare .iloc[0] would raise, but with a usable message.
        raise IndexError(f"no city found matching {name_or_id!r}")
    return most_populous.iloc[0]

# NOTE: despite its name, `atlanta` holds the most populous row whose Name
# contains "New York".
atlanta = get_city_item("New York", dataset)
# This call also raises the module-level `max_dist`, which get_viability uses
# as its distance scale; the tuple shows the returned maximum next to the
# updated global.
get_max_dist(atlanta, dataset), max_dist

(18939.32070449784, 18939.32070449784)

In [63]:
def get_viability(city_1, city_2, dataset, A = 1, B = 1, verbose = False):
    """Score a city pair: weighted normalised population minus weighted normalised distance.

    Parameters
    ----------
    city_1, city_2 : row-like objects with "Population", "Latitude" and
        "Longitude" entries.
    dataset : accepted for signature compatibility; not read by this function.
    A, B : weights applied to the population term and the distance term.
    verbose : when true, print the summed population, its normalised value,
        the distance and its normalised value.

    The normalisation bounds come from the module-level ``min_pop``,
    ``max_pop``, ``avg_pop``, ``min_dist`` and ``max_dist`` (read-only here).

    Returns
    -------
    ``A * normalised_population - B * normalised_distance``.
    """
    combined_pop = city_1["Population"] + city_2["Population"]
    separation = get_dist(
        city_1["Latitude"], city_1["Longitude"],
        city_2["Latitude"], city_2["Longitude"],
    )

    # Scale both quantities using the table-wide bounds.
    pop_score = (combined_pop - min_pop) / (max_pop + avg_pop - min_pop)
    dist_score = (separation - min_dist) / (max_dist - min_dist)

    if verbose:
        print(combined_pop, pop_score, separation, dist_score)

    return A * pop_score - B * dist_score

# city_1 = get_city_item(5110302, dataset)  # Brooklyn
# city_2 = get_city_item(5133273, dataset)  # Queens
# print(get_viability(city_1, city_2, dataset))
# print()

# city_1 = get_city_item("Atlanta", dataset)
# city_2 = get_city_item("Charleston", dataset)
# print(get_viability(city_1, city_2, dataset))

In [64]:
def find_most_viable_city(city_name, dataset):
    """Return the row of the city with the highest viability relative to ``city_name``.

    Parameters
    ----------
    city_name : name or Geoname ID, resolved with ``get_city_item``.
    dataset : pd.DataFrame of candidate cities; the resolved city itself
        (same "Geoname ID") is skipped.

    Returns
    -------
    The best-scoring row, or None when there is no other city to compare
    against. Ties keep the first row encountered.
    """
    city = get_city_item(city_name, dataset)
    # Start below every real score: viability can be negative (the distance
    # term is subtracted), and starting at 0 would discard all such candidates.
    max_viability = float("-inf")
    max_viable_city = None
    for _, row in dataset.iterrows():
        if row["Geoname ID"] == city["Geoname ID"]:
            continue
        viability = get_viability(city, row, dataset)
        if viability > max_viability:
            max_viability = viability
            max_viable_city = row
    return max_viable_city

def find_top_viable_cities(city_name, dataset, top_n = 5):
    """Rank every other city by viability relative to ``city_name``.

    Parameters
    ----------
    city_name : name or Geoname ID, resolved with ``get_city_item``.
    dataset : pd.DataFrame of candidate cities; the resolved city itself
        (same "Geoname ID") is excluded.
    top_n : number of leading entries to return.

    Returns
    -------
    list of ``(viability, row)`` tuples, highest viability first, sliced to
    the first ``top_n`` entries.
    """
    origin = get_city_item(city_name, dataset)
    origin_id = origin["Geoname ID"]

    scored = [
        (get_viability(origin, candidate, dataset), candidate)
        for _, candidate in dataset.iterrows()
        if candidate["Geoname ID"] != origin_id
    ]

    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return ranked[:top_n]

# mvc = find_most_viable_city("Atlanta", dataset)

# find_top_viable_cities("Atlanta", dataset, 5)

In [65]:
def get_viable_cities_paths(city_1_name, dataset, top_n = 5):
    """Build a table of coloured paths from a city to its most viable partners.

    Parameters
    ----------
    city_1_name : name or Geoname ID of the origin, resolved with ``get_city_item``.
    dataset : pd.DataFrame of cities passed through to the ranking helpers.
    top_n : how many partner cities to include.

    Returns
    -------
    pd.DataFrame with one row per partner, in descending viability order:
      - "path": ``[[lon_1, lat_1], [lon_2, lat_2]]`` (longitude first),
      - "viability": the pair's score,
      - "city_2": the partner's name,
      - "color": hex colour from a red-to-green gradient, so the first (most
        viable) row is red and the last is green.
    When there are no partner cities the frame is empty but still has these
    four columns.
    """
    city_1 = get_city_item(city_1_name, dataset)
    most_viable_cities = find_top_viable_cities(city_1_name, dataset, top_n)

    paths_dict = [
        {
            "path": [
                [city_1["Longitude"], city_1["Latitude"]],
                [city_2["Longitude"], city_2["Latitude"]],
            ],
            "viability": viability,
            "city_2": city_2["Name"],
        }
        for viability, city_2 in most_viable_cities
    ]

    if not paths_dict:
        # Nothing to colour. Skip the gradient entirely so range_to is never
        # asked for zero steps.
        # NOTE(review): the colour package appears to raise ValueError for a
        # step count of 0 — confirm against its documentation.
        return pd.DataFrame(columns=["path", "viability", "city_2", "color"])

    gradient = list(Color("red").range_to(Color("green"), len(paths_dict)))
    for path, shade in zip(paths_dict, gradient):
        path['color'] = shade.hex_l
    return pd.DataFrame(paths_dict)

# Build the five most viable paths out of the city matching "Atlanta", save
# them to atlanta.csv (without the index column), and preview the result.
df1 = get_viable_cities_paths("Atlanta", dataset, 5)
df1.to_csv("atlanta.csv", index=False)
df1.head()

Unnamed: 0,path,viability,city_2,color
0,"[[-84.38798, 33.749], [-99.12766, 19.42847]]",0.457056,Mexico City,#ff0000
1,"[[-84.38798, 33.749], [121.45806, 31.22222]]",0.37023,Shanghai,#df7000
2,"[[-84.38798, 33.749], [-74.00597, 40.71427]]",0.35146,New York City,#bfbf00
3,"[[-84.38798, 33.749], [116.39723, 39.9075]]",0.259564,Beijing,#50a000
4,"[[-84.38798, 33.749], [3.39467, 6.45407]]",0.212944,Lagos,#008000
