In [2]:
import pandas as pd

### Load and Clean data

In [3]:
xls = pd.ExcelFile('donnees_entrepots.xls')
entrepot = pd.read_excel(xls, "Données_détaillées_par_aires")
entrepot.columns = entrepot.iloc[1]
entrepot = entrepot.drop([0,1], axis=0).reset_index(drop=True)

# we keep only the first city where the area is located
entrepot["commune_clean"] = entrepot["Communes concernées par l'aire logistique"].apply(lambda x : x.split(",")[0])

In [5]:
# Reduce dataset to large logistic areas
entrepot_majeur = entrepot[entrepot["Surface totale"] > 100000]

In [1]:
# Localize the cities
from geopy import Nominatim

def add_location(ini_dataset) :
    locator = Nominatim(user_agent="myGeocoder")
    ini_dataset["location"] = ini_dataset["commune_clean"].apply(lambda x: locator.geocode(x)[1])
    ini_dataset["latitude"] = ini_dataset["location"].str[0]
    ini_dataset["longitude"] = ini_dataset["location"].str[1]
    return ini_dataset

In [7]:
entrepot = add_location(entrepot)

In [106]:
#locator.geocode("Boulogne sur mer 62200")

Location(Dannes, Boulogne-sur-Mer, Pas-de-Calais, Hauts-de-France, France métropolitaine, 62200, France, (50.59368681860731, 1.583020224543379, 0.0))

### K Means

In [17]:
from sklearn.cluster import KMeans
import geopy.distance
def run_kmeans(
    data,
    threshold_km=20,
    threshold_nb_areas_by_centroid=5,
) :

    data = data[["Identifiant aire logistique dense (e1)", "Région d'implantation", "location", "longitude", "latitude"]]
    region_list = list(data["Région d'implantation"].unique())

    final_dataset = pd.DataFrame()
    centroid_by_region = {}

    #Computing centroids region by region
    for region in region_list :
        dataset = data[data["Région d'implantation"] == region]
        
        # initialising limiting factors for KMeans
        areas_not_served = len(dataset)         
        nb_areas_by_centroid = len(dataset)
        
        # set the initial number of clusters at sufficient amount to satisfy max number of areas by station
        i = max(int(areas_not_served / threshold_nb_areas_by_centroid)+1, 1) 
        
        # if the region has only 1 area, run KMeans with 1 centroid
        threshold_not_served = int(0.2*len(dataset)) + 1 if len(dataset)>1 else 0
        
        while (areas_not_served > threshold_not_served) | (nb_areas_by_centroid > threshold_nb_areas_by_centroid) :
            kmeans = KMeans(
                init="random",
                n_clusters=i,
                n_init=10,
                max_iter=300,
                random_state=42
            )

            kmeans.fit(dataset[["latitude", "longitude"]])

            # Store the centroid information for each area
            dataset["centroid"] = kmeans.labels_
            dataset["centroid_coord"] = dataset["centroid"].apply(lambda x : kmeans.cluster_centers_[x])
            dataset["distance_to_centroid"] = dataset.apply(lambda x : geopy.distance.geodesic(x["location"], kmeans.cluster_centers_[x["centroid"]]).km, axis=1)
            
            # Increase nb of centroids by 1
            i += 1

            # Recompute limiting factors
            areas_not_served = sum(dataset["distance_to_centroid"]>threshold_km)
            nb_areas_by_centroid = max(dataset.groupby("centroid")["Identifiant aire logistique dense (e1)"].count().reset_index()["Identifiant aire logistique dense (e1)"])
        
        # Store the centroid information aside
        centroid_by_region[region] = kmeans.cluster_centers_
        
        # Build the final dataset with all information
        final_dataset = pd.concat([final_dataset, dataset], axis=0)

    return final_dataset, centroid_by_region

In [18]:
final_dataset, centroid_by_region = run_kmeans(entrepot)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [22]:
final_dataset.to_csv("final_all_logisic_areas.csv")

### Display output of algo

In [19]:
import numpy as np
centroids = np.concatenate(list(centroid_by_region.values()))

In [20]:
print("Number of centroids : ", len(centroids))

Number of centroids :  183


In [21]:
import plotly.express as px

fig = px.scatter_mapbox(final_dataset, 
                        lat="latitude", 
                        lon="longitude", 
                        zoom=8, 
                        height=800,
                        width=800)

fig.add_scattermapbox(lat = centroids[:,0]
                      ,lon = centroids[:,1]
                      ,marker_size = 6
                      ,marker_color = 'red'
#                       ,marker_symbol = 'star'
                      ,showlegend = True
                     )

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [23]:
# Number of areas by centroid
nb_areas_by_centroid = final_dataset.groupby(["Région d'implantation", "centroid"])["Identifiant aire logistique dense (e1)"].count().reset_index()

nb_areas_by_centroid["station_size"] = nb_areas_by_centroid["Identifiant aire logistique dense (e1)"].apply(lambda x : (x > 2) + (x > 4))

In [24]:
nb_areas_by_centroid.rename(columns = {"Identifiant aire logistique dense (e1)" : "area_count"}, inplace=True)

In [25]:
nb_areas_by_centroid.to_csv("station_size_all_areas.csv")

In [26]:
# Centroid coord by region
centroids_df = pd.DataFrame(
    [(k, i) for k, v in centroid_by_region.items() for i in v], 
    columns=['region', 'coord']
)

In [27]:
centroids_df.to_csv("centroids_coord_all_areas.csv")