In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
import folium
from folium.plugins import FastMarkerCluster
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
# Coordinate reference system identifier passed as `crs` when the city
# GeoDataFrame is built in the next cell.
datum = "EPSG:4326"



In [84]:
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# CSV exists at exactly this location.
data_path = r"C:\Users\jua12849\Documents\GitHub\GeospatialDataAnalysis\canadacities.csv"

canadian_cities = pd.read_csv(data_path)

# GeoDataFrame holding every city plus a point geometry built from (lng, lat).
geometry = [Point(xy) for xy in zip(canadian_cities["lng"], canadian_cities["lat"])]
gdf = gpd.GeoDataFrame(canadian_cities, crs=datum, geometry=geometry)

# One sub-frame per province, keyed "city_<province_id>". The loop variable is
# a province id despite being called `city`.
d = {}
for city in gdf.province_id.unique():
    d["city_{}".format(city)] = gdf.loc[gdf["province_id"] == city]

# The dictionary keys double as the province names used by every later step.
provinces = list(d.keys())

# Working list per key. Slot 0 holds the (lat, lng) array; the loops below
# append slot 1 (fitted DBSCAN), 2 (labels), 3 (label count), 4 (cluster Series).
d_lat_lon_numpy = {}
for province in provinces:
    d_lat_lon_numpy[province] = [d[province][["lat", "lng"]].to_numpy()]

d_lat_lon_numpy["Canada"] = [gdf[["lat", "lng"]].to_numpy()]


kms_per_radian = 6371.0088
#epsilon = 50 / kms_per_radian
# The coordinates are converted to radians before fitting, so eps is an angle:
# 0.01 * kms_per_radian is roughly 63.7 km.
epsilon = 0.01
min_samples = 2

# First pass: run DBSCAN on each province separately and on the whole country.
for province in list(d_lat_lon_numpy.keys()):
    entry = d_lat_lon_numpy[province]
    coords = entry[0]

    model = DBSCAN(eps=epsilon, min_samples=min_samples,
                   algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    entry.append({"dbs_{}".format(province): model})

    labels = model.labels_
    entry.append({"{}_cluster_label".format(province): labels})

    # Counts every distinct label, so the noise label (-1) is included when present.
    num_clusters = len(set(labels))
    entry.append({"{}_num_clusters".format(province): num_clusters})

    entry.append(
        {"{}_clusters".format(province):
         pd.Series(coords[labels == n] for n in range(num_clusters))})

    # Because noise inflates the count by one, the last slice can be empty;
    # drop it so only real clusters remain.
    final_clusters = entry[4].get("{}_clusters".format(province))
    if len(final_clusters.iloc[-1]) == 0:
        final_clusters.drop(final_clusters.tail(1).index, inplace=True)

# Second pass: any key left without a cluster is re-run with min_samples=1,
# which turns every point into (at least) its own cluster.
for province in list(d_lat_lon_numpy.keys()):
    final_clusters = d_lat_lon_numpy[province][4].get("{}_clusters".format(province))

    if len(final_clusters) == 0:
        # Reuse the coordinates already stored in slot 0. Looking them up in `d`
        # again fails for "Canada", which is not a key of `d`.
        coords = d_lat_lon_numpy[province][0]
        entry = [coords]
        d_lat_lon_numpy[province] = entry

        model = DBSCAN(eps=epsilon, min_samples=1,
                       algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
        entry.append({"dbs_{}".format(province): model})

        labels = model.labels_
        entry.append({"{}_cluster_label".format(province): labels})

        num_clusters = len(set(labels))
        entry.append({"{}_num_clusters".format(province): num_clusters})

        entry.append(
            {"{}_clusters".format(province):
             pd.Series(coords[labels == n] for n in range(num_clusters))})
        
    
def get_centermost_point(cluster):
    """Return the point of ``cluster`` that lies closest to the cluster centroid.

    Parameters
    ----------
    cluster : iterable of coordinate pairs
        The points of one cluster. In this notebook each row is (lat, lng).

    Returns
    -------
    tuple
        The selected member of ``cluster``, converted to a tuple.
    """
    centre = MultiPoint(cluster).centroid
    centroid = (centre.x, centre.y)

    def distance_to_centroid(point):
        # Great-circle distance in metres between one member and the centroid.
        return great_circle(point, centroid).m

    return tuple(min(cluster, key=distance_to_centroid))

# For every key that has at least one cluster, append three more slots:
# 5 = centermost point of each cluster, 6 = those points as a (lon, lat) array,
# 7 = the same points as a GeoDataFrame.
for province in list(d_lat_lon_numpy.keys()):

    final_clusters = d_lat_lon_numpy.get(province)[4].get("{}_clusters".format(province))

    if len(final_clusters) == 0:
        continue

    centermost_key = "{}_centermost_points".format(province)
    d_lat_lon_numpy[province].append(
        {centermost_key: final_clusters.map(get_centermost_point)})

    # Split the (lat, lon) tuples into two parallel sequences.
    lats, lons = zip(*d_lat_lon_numpy.get(province)[5][centermost_key])

    # Column order is lon first, then lat.
    rep_points = pd.DataFrame({'lon': lons, 'lat': lats})

    d_lat_lon_numpy[province].append(
        {"{}_centermost_points_numpy".format(province): rep_points.to_numpy()})

    cluster_geometry = gpd.points_from_xy(rep_points.lon, rep_points.lat)
    d_lat_lon_numpy[province].append(
        {"{}_gdf_cluster_samples".format(province):
         gpd.GeoDataFrame(rep_points, geometry=cluster_geometry, crs="EPSG:4326")})


# Mean city location for Ontario.
mean_lat_on = np.mean(d["city_ON"]["lat"])
mean_lng_on = np.mean(d["city_ON"]["lng"])

# Mean city location for the whole country.
gdf_mean_lat = np.mean(gdf.lat)
gdf_mean_lng = np.mean(gdf.lng)

# Per-province lookups used by the mapping cell: representative cluster points
# (slot 6) and the raw city coordinates (slot 0).
clusters = {}
cities = {}

for province in provinces:
    province_entry = d_lat_lon_numpy.get("{}".format(province))
    clusters["{}".format(province)] = province_entry[6].get(
        "{}_centermost_points_numpy".format(province))
    cities["{}".format(province)] = d_lat_lon_numpy.get("{}".format(province))[0]


    


In [85]:
# Show which province keys are available in `cities`.
cities.keys()

dict_keys(['city_ON', 'city_QC', 'city_BC', 'city_AB', 'city_MB', 'city_NS', 'city_SK', 'city_NL', 'city_NB', 'city_PE', 'city_YT', 'city_NT', 'city_NU'])

In [86]:
# Provinces drawn on the map, in the order their rows are concatenated.
study_provinces = [
    "city_ON", "city_QC", "city_NB", "city_NS",
    "city_AB", "city_MB", "city_SK", "city_BC",
]

# Raw city coordinates and representative cluster points for those provinces.
study = np.concatenate([cities[key] for key in study_provinces])
study_clusters = np.concatenate([clusters[key] for key in study_provinces])

# Centre the map on the mean location of all Canadian cities.
my_map = folium.Map(location=[gdf_mean_lat, gdf_mean_lng], zoom_start=5)

# Cluster rows are stored as (lon, lat), so swap them for folium's [lat, lon].
for point in study_clusters:
    loc = [point[1], point[0]]
    red_icon = folium.Icon(color="red")
    folium.Marker(location=loc, icon=red_icon).add_to(my_map)

# City rows are already (lat, lng).
for point in study:
    loc = [point[0], point[1]]
    folium.Circle(radius=4000, location=loc, color="BLUE").add_to(my_map)

my_map

In [26]:
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
import folium
from folium.plugins import FastMarkerCluster

In [226]:
def import_data(data_path=r"C:\Users\jua12849\Documents\GitHub\GeospatialDataAnalysis\canadacities.csv"):
    """Load the Canadian cities CSV and return it as a GeoDataFrame.

    Parameters
    ----------
    data_path : str or path-like, optional
        Location of the CSV file. The default is the original hard-coded,
        machine-specific path, so existing ``import_data()`` calls keep
        working; pass another path to run the notebook elsewhere.

    Returns
    -------
    geopandas.GeoDataFrame
        Every row of the CSV plus a point geometry column built from the
        ``lng`` and ``lat`` columns, with CRS "EPSG:4326".
    """
    canadian_cities = pd.read_csv(data_path)
    datum = "EPSG:4326"

    # Points are (x, y) = (lng, lat).
    geometry = [Point(xy) for xy in zip(canadian_cities["lng"], canadian_cities["lat"])]
    gdf = gpd.GeoDataFrame(canadian_cities, crs=datum, geometry=geometry)

    return gdf


def create_gdf_dictionary(gdf):
    """Split ``gdf`` into one sub-frame per distinct ``province_id``.

    Parameters
    ----------
    gdf : DataFrame-like
        Must have a ``province_id`` column.

    Returns
    -------
    dict
        Maps "city_<province_id>" to the rows of ``gdf`` with that id, in
        order of first appearance of each id.
    """
    return {
        "city_{}".format(province_id): gdf.loc[gdf["province_id"] == province_id]
        for province_id in gdf.province_id.unique()
    }

def obtain_provinces(d):
    """Return the keys of the per-province dictionary ``d`` as a list."""
    return [*d.keys()]


def create_numpy_dictionary(d, gdf, provinces):
    """Build the working dictionary of coordinate arrays.

    Parameters
    ----------
    d : dict
        Per-province frames, each with ``lat`` and ``lng`` columns.
    gdf : DataFrame-like
        Frame of all cities with ``lat`` and ``lng`` columns.
    provinces : iterable
        Keys of ``d`` to include.

    Returns
    -------
    dict
        One entry per province plus a "Canada" entry for the whole frame. Each
        value is a one-element list holding a (lat, lng) NumPy array; later
        steps append their results to that list.
    """
    coordinate_columns = ["lat", "lng"]

    coords_by_key = {
        "{}".format(name): [d.get(name)[coordinate_columns].to_numpy()]
        for name in provinces
    }
    coords_by_key["Canada"] = [gdf[coordinate_columns].to_numpy()]

    return coords_by_key

def replace_dictionary(d_lat_lon_numpy, province, d):
    """Reset one entry of the working dictionary to just its coordinate array.

    The entry for ``province`` is overwritten with a one-element list holding
    the (lat, lng) array taken from ``d[province]``. The dictionary is
    modified in place and also returned.
    """
    fresh_coords = d.get(province)[["lat", "lng"]].to_numpy()
    d_lat_lon_numpy["{}".format(province)] = [fresh_coords]

    return d_lat_lon_numpy


def get_centermost_point(cluster):
    """Pick the member of ``cluster`` nearest to the cluster's centroid.

    ``cluster`` is an iterable of coordinate pairs (rows of (lat, lng) in this
    notebook). The centroid comes from a shapely ``MultiPoint`` over the
    cluster, and nearness is the great-circle distance in metres. The chosen
    point is returned as a tuple.
    """
    centre = MultiPoint(cluster).centroid
    centroid = (centre.x, centre.y)

    def metres_from_centroid(point):
        return great_circle(point, centroid).m

    nearest = min(cluster, key=metres_from_centroid)
    return tuple(nearest)

def apply_DBSCAN(d_lat_lon_numpy,province,epsilon,min_samples,algorithm = 'ball_tree',metric='haversine'):
    """Fit DBSCAN on one entry's coordinates and append the fitted model.

    The coordinates in slot 0 of the ``province`` entry are converted from
    degrees to radians before fitting. The fitted estimator is appended to the
    entry as ``{"dbs_<province>": model}``.

    Returns None; the entry list is modified in place.
    """
    entry = d_lat_lon_numpy["{}".format(province)]

    coords_in_radians = np.radians(d_lat_lon_numpy.get(province)[0])
    clusterer = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm=algorithm, metric=metric)
    fitted = clusterer.fit(coords_in_radians)

    entry.append({"dbs_{}".format(province): fitted})

def retrieve_labels(d_lat_lon_numpy,province):
    """Append the fitted model's ``labels_`` to the ``province`` entry.

    Reads the model stored in slot 1 under ``"dbs_<province>"`` and appends
    ``{"<province>_cluster_label": labels}``.

    Returns None; the entry list is modified in place.
    """
    entry = d_lat_lon_numpy["{}".format(province)]
    fitted = d_lat_lon_numpy.get(province)[1]["dbs_{}".format(province)]
    entry.append({"{}_cluster_label".format(province): fitted.labels_})

def obtain_cluster_labels(d_lat_lon_numpy,province):
    """Append the number of distinct labels to the ``province`` entry.

    Reads the labels stored in slot 2 under ``"<province>_cluster_label"`` and
    appends ``{"<province>_num_clusters": count}``. Every distinct label is
    counted, so a noise label of -1 is included in the count when present.

    Returns None; the entry list is modified in place.
    """
    entry = d_lat_lon_numpy["{}".format(province)]
    labels = d_lat_lon_numpy.get(province)[2]["{}_cluster_label".format(province)]
    distinct_labels = set(labels)
    entry.append({"{}_num_clusters".format(province): len(distinct_labels)})

def obtain_clusters_numpy(d_lat_lon_numpy,province):
    """Group an entry's coordinates by label and append them as a Series.

    For each ``n`` in ``range(num_clusters)`` (slot 3), the rows of the
    coordinate array (slot 0) whose label (slot 2) equals ``n`` are collected.
    The groups are appended as ``{"<province>_clusters": pd.Series(groups)}``.
    A label value with no matching rows yields an empty array.

    Returns None; the entry list is modified in place.
    """
    entry = d_lat_lon_numpy["{}".format(province)]
    label_count = entry[3]["{}_num_clusters".format(province)]

    source = d_lat_lon_numpy.get(province)
    coords = source[0]
    labels = source[2]["{}_cluster_label".format(province)]

    groups = [coords[labels == n] for n in range(label_count)]
    entry.append({"{}_clusters".format(province): pd.Series(groups)})


def _cluster_province(d_lat_lon_numpy, province, epsilon, min_samples):
    """Run the four DBSCAN bookkeeping steps for one key and return its cluster Series."""
    apply_DBSCAN(d_lat_lon_numpy=d_lat_lon_numpy, province=province, epsilon=epsilon,
                 min_samples=min_samples, algorithm='ball_tree', metric='haversine')
    retrieve_labels(d_lat_lon_numpy=d_lat_lon_numpy, province=province)
    obtain_cluster_labels(d_lat_lon_numpy=d_lat_lon_numpy, province=province)
    obtain_clusters_numpy(d_lat_lon_numpy=d_lat_lon_numpy, province=province)
    return d_lat_lon_numpy[province][4]["{}_clusters".format(province)]


def perform_dbscan(d_lat_lon_numpy,epsilon,min_samples,d):
    """Cluster every entry of ``d_lat_lon_numpy`` and attach representative points.

    Parameters
    ----------
    d_lat_lon_numpy : dict
        Working dictionary whose values are one-element lists holding a
        (lat, lng) array, as built by ``create_numpy_dictionary``.
    epsilon : float
        DBSCAN ``eps``. Coordinates are converted to radians before fitting.
    min_samples : int
        DBSCAN ``min_samples`` for the first pass.
    d : dict
        Per-province frames. Kept for backward compatibility; the rerun now
        reuses the coordinates already stored in ``d_lat_lon_numpy``.

    Returns
    -------
    dict
        The same dictionary, modified in place. Each entry list ends up as:
        0 coordinates, 1 fitted model, 2 labels, 3 label count,
        4 cluster Series, 5 centermost point per cluster,
        6 those points as a (lon, lat) array, 7 the points as a GeoDataFrame.
    """
    # Pass 1: cluster each province and the whole country with the caller's settings.
    for province in list(d_lat_lon_numpy.keys()):
        final_clusters = _cluster_province(d_lat_lon_numpy, province, epsilon, min_samples)

        # The label count includes the noise label, so the last slice can be
        # empty; drop it so only real clusters remain.
        if len(final_clusters.iloc[-1]) == 0:
            final_clusters.drop(final_clusters.tail(1).index, inplace=True)

    # Pass 2: entries left without any cluster are re-run with min_samples=1 (no noise).
    for province in list(d_lat_lon_numpy.keys()):
        entry = d_lat_lon_numpy[province]
        if len(entry[4]["{}_clusters".format(province)]) == 0:
            # Reset the entry from its own slot 0. Rebuilding it from `d` raised
            # a TypeError for "Canada", which is not a key of `d`.
            d_lat_lon_numpy[province] = [entry[0]]
            _cluster_province(d_lat_lon_numpy, province, epsilon, 1)

    # Pass 3: reduce each cluster to its centermost point.
    for province in list(d_lat_lon_numpy.keys()):
        entry = d_lat_lon_numpy[province]
        final_clusters = entry[4]["{}_clusters".format(province)]
        if len(final_clusters) == 0:
            continue

        centermost_points = final_clusters.map(get_centermost_point)
        entry.append({"{}_centermost_points".format(province): centermost_points})

        # Split the (lat, lon) tuples into two parallel sequences.
        lats, lons = zip(*centermost_points)
        # Column order is lon first, then lat.
        rep_points = pd.DataFrame({'lon': lons, 'lat': lats})

        entry.append({"{}_centermost_points_numpy".format(province): rep_points.to_numpy()})

        entry.append(
            {"{}_gdf_cluster_samples".format(province):
             gpd.GeoDataFrame(rep_points,
                              geometry=gpd.points_from_xy(rep_points.lon, rep_points.lat),
                              crs="EPSG:4326")})

    return d_lat_lon_numpy

def calculate_mean_ontario_loc(d):
    """Return ``(mean_lat, mean_lng)`` over the rows of ``d["city_ON"]``."""
    ontario = d["city_ON"]
    mean_lat_on = np.mean(ontario["lat"])
    mean_lng_on = np.mean(ontario["lng"])
    return (mean_lat_on, mean_lng_on)

def calculate_mean_canada_loc(gdf):
    """Return ``(mean_lat, mean_lng)`` over all rows of ``gdf``.

    ``gdf`` must expose ``lat`` and ``lng`` as attributes (columns).
    """
    return (np.mean(gdf.lat), np.mean(gdf.lng))

def cities_dict(d_lat_lon_numpy,provinces):
    """Map each province key to slot 0 (its coordinate array) of its working entry."""
    return {
        "{}".format(name): d_lat_lon_numpy.get("{}".format(name))[0]
        for name in provinces
    }

def clusters_dict(d_lat_lon_numpy,provinces):
    """Map each province key to its array of centermost cluster points.

    The array is read from slot 6 of the province's working entry, under the
    key ``"<province>_centermost_points_numpy"``. An entry with fewer than
    seven slots raises IndexError.
    """
    return {
        "{}".format(name): d_lat_lon_numpy.get("{}".format(name))[6].get(
            "{}_centermost_points_numpy".format(name))
        for name in provinces
    }

def study_area_numpy(cities, provinces=("city_ON", "city_QC", "city_NB", "city_NS")):
    """Stack the city coordinate arrays of the selected provinces.

    Parameters
    ----------
    cities : dict
        Maps province keys to 2-D coordinate arrays.
    provinces : iterable of str, optional
        Keys to include, in concatenation order. Defaults to the original
        study area (Ontario, Quebec, New Brunswick, Nova Scotia).

    Returns
    -------
    numpy.ndarray
        The selected arrays concatenated along the first axis.

    Raises
    ------
    KeyError
        If a requested key is missing from ``cities``.
    ValueError
        If ``provinces`` is empty (``np.concatenate`` needs at least one array).
    """
    study_area = np.concatenate([cities[key] for key in provinces])

    return study_area

def cluster_area_numpy(clusters, provinces=("city_ON", "city_QC", "city_NB", "city_NS")):
    """Stack the representative cluster points of the selected provinces.

    Parameters
    ----------
    clusters : dict
        Maps province keys to 2-D arrays of cluster points.
    provinces : iterable of str, optional
        Keys to include, in concatenation order. Defaults to the original
        study area (Ontario, Quebec, New Brunswick, Nova Scotia).

    Returns
    -------
    numpy.ndarray
        The selected arrays concatenated along the first axis.

    Raises
    ------
    KeyError
        If a requested key is missing from ``clusters``.
    ValueError
        If ``provinces`` is empty (``np.concatenate`` needs at least one array).
    """
    study_clusters = np.concatenate([clusters[key] for key in provinces])

    return study_clusters


def create_map(gdf_mean_lat,gdf_mean_lng,study_area,study_clusters,zoom):
    """Draw cities and cluster representatives on a folium map.

    Parameters
    ----------
    gdf_mean_lat, gdf_mean_lng : float
        Map centre.
    study_area : iterable of rows
        City coordinates; index 0 is used as latitude and index 1 as longitude.
    study_clusters : iterable of rows
        Cluster points; index 1 is used as latitude and index 0 as longitude.
    zoom : int
        Initial zoom level.

    Returns
    -------
    folium.Map
        The map with one red marker per cluster point and one circle
        (radius=4000) per city.
    """
    centre = [gdf_mean_lat, gdf_mean_lng]
    my_map = folium.Map(location=centre, zoom_start=zoom)

    # Cluster rows are read as (lon, lat) and swapped into [lat, lon].
    for cluster_point in study_clusters:
        marker_location = [cluster_point[1], cluster_point[0]]
        marker = folium.Marker(location=marker_location, icon=folium.Icon(color="red"))
        marker.add_to(my_map)

    # City rows are read as (lat, lon) already.
    for city_point in study_area:
        circle_location = [city_point[0], city_point[1]]
        circle = folium.Circle(radius=4000, location=circle_location, color="BLUE")
        circle.add_to(my_map)

    return my_map


In [243]:
# DBSCAN settings for this run. Coordinates are converted to radians before
# fitting, so epsilon is an angle rather than a distance in kilometres.
epsilon = 0.01
min_samples = 2

# Load the cities, split them by province and build the working dictionary.
gdf = import_data()
d = create_gdf_dictionary(gdf)
provinces = obtain_provinces(d)
d_lat_lon_numpy = create_numpy_dictionary(d,gdf,provinces)
# Cluster each province (and the whole country) and attach centermost points.
d_lat_lon_numpy = perform_dbscan(d_lat_lon_numpy,epsilon = epsilon,min_samples=min_samples,d=d)
# NOTE(review): the Ontario means are computed but not used later in this cell.
mean_lat_on,mean_lng_on = calculate_mean_ontario_loc(d)
gdf_mean_lat, gdf_mean_lng = calculate_mean_canada_loc(gdf)
# Per-province lookups, then the stacked arrays for the study area.
clusters = clusters_dict(d_lat_lon_numpy,provinces)
cities = cities_dict(d_lat_lon_numpy,provinces)
study_area = study_area_numpy(cities)
study_clusters = cluster_area_numpy(clusters)
# NOTE(review): `map` shadows Python's built-in map() for the rest of the session.
map = create_map(gdf_mean_lat,gdf_mean_lng,study_area,study_clusters,zoom=5)


In [239]:
# Display the folium map assigned to `map` by the pipeline cell.
# NOTE(review): on a fresh kernel, before that cell runs, `map` is the built-in.
map

In [248]:
def run_map(epsilon,min_samples):
    """Run the whole pipeline and return the resulting folium map.

    Loads the city data, clusters each province and the whole country with
    DBSCAN, and draws the default study area with ``create_map`` at zoom 5,
    centred on the mean location of all cities.

    Parameters
    ----------
    epsilon : float
        DBSCAN ``eps``, forwarded to ``perform_dbscan``.
    min_samples : int
        DBSCAN ``min_samples``, forwarded to ``perform_dbscan``.

    Returns
    -------
    folium.Map
    """
    gdf = import_data()
    d = create_gdf_dictionary(gdf)
    provinces = obtain_provinces(d)

    d_lat_lon_numpy = create_numpy_dictionary(d, gdf, provinces)
    d_lat_lon_numpy = perform_dbscan(d_lat_lon_numpy, epsilon=epsilon, min_samples=min_samples, d=d)

    # Only the country-wide mean is needed; the Ontario mean was never used.
    gdf_mean_lat, gdf_mean_lng = calculate_mean_canada_loc(gdf)

    clusters = clusters_dict(d_lat_lon_numpy, provinces)
    cities = cities_dict(d_lat_lon_numpy, provinces)
    study_area = study_area_numpy(cities)
    study_clusters = cluster_area_numpy(clusters)

    # Return directly so no local shadows the built-in map().
    return create_map(gdf_mean_lat, gdf_mean_lng, study_area, study_clusters, zoom=5)


In [254]:
# Rebuild and display the map with epsilon=0.019 and min_samples=1.
run_map(epsilon = 0.019, min_samples = 1)