# Imports

In [3]:
import folium
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno



Load the dataset

In [4]:
# Relative path to the Starbucks US CSV file
dataset_starbucks = "../dataset/starbucks-us.csv"

# Read the comma-separated file into a DataFrame
df = pd.read_csv(dataset_starbucks, sep=",")

# Color palette

Generate a color palette to be used with folium markers.

In [5]:
import random

def rand_255() -> int:
    """Draw one random color-channel value.

    Returns:
        An integer between 0 and 255, both bounds included.
    """
    low, high = 0, 255
    return random.randint(low, high)

# Build the palette in one pass: 200 random '#RRGGBB' hex strings,
# each made of three consecutive rand_255() draws (red, green, blue).
colors = [
    '#%02X%02X%02X' % tuple(rand_255() for _channel in range(3))
    for _slot in range(200)
]

In [6]:
def set_color(index: int) -> str:
    """Look up the palette entry stored at position `index` in `colors`.

    Args:
        index: Position in the module-level `colors` list.

    Returns:
        The color string found at that position.
    """
    chosen = colors[index]
    return chosen

# Clustering

In [7]:
# Peek at the first three rows of the loaded data
df.head(n=3)

Unnamed: 0,Brand,Store Name,Ownership Type,Street Address,City,State/Province,Phone Number,Longitude,Latitude
0,Starbucks,Safeway-Anchorage #1809,Licensed,5600 Debarr Rd Ste 9,Anchorage,AK,907-339-0900,-149.78,61.21
1,Starbucks,Safeway-Anchorage #2628,Licensed,1725 Abbott Rd,Anchorage,AK,907-339-2800,-149.84,61.14
2,Starbucks,Safeway - Anchorage #1813,Licensed,1501 Huffman Rd,Anchorage,AK,907-339-1300,-149.85,61.11


## Metrics dataframe

We'll create a dataframe with only the relevant metrics for the clustering: **Longitude** and **Latitude**.

In [8]:
# Keep only the two coordinate columns used as clustering features
df_metrics = df[[
    'Longitude',
    'Latitude',
]]

## KMeans clustering

In [8]:
from sklearn.cluster import KMeans

# Configure (but do not yet fit) a 60-cluster KMeans estimator;
# random_state is fixed so repeated runs use the same seed.
kmeans = KMeans(
    n_clusters=60,
    init='k-means++',
    random_state=42,
)

Cluster the data, save the created labels and append them to a new dataframe

In [9]:
# Fit the estimator on the coordinate columns
kmeans.fit(df_metrics)

# Wrap the per-row cluster ids in their own single-column frame
df_labels = pd.DataFrame({'cluster': kmeans.labels_})

# Attach the palette color that corresponds to each cluster id
df_labels['color'] = df_labels['cluster'].apply(set_color)

# Combine coordinates with their cluster id and color (joined on the row index)
df_kmeans_clustered = df_metrics.join(df_labels)

Clusters visualization:

In [12]:
# Base map for the KMeans result; the [lat, lon] center and zoom level
# frame the United States.
m = folium.Map(location=[37.6, -95.665], zoom_start=4)

In [13]:
# Draw one small circle per row on the map `m`, colored by its cluster.
# A plain loop over the three needed columns replaces DataFrame.apply(axis=1):
# apply was used only for its side effect, built a Series for every row plus a
# throwaway result Series, and assigning that result to `_` shadowed IPython's
# last-output variable.
for lat, lon, hex_color in zip(
    df_kmeans_clustered["Latitude"],
    df_kmeans_clustered["Longitude"],
    df_kmeans_clustered["color"],
):
    folium.CircleMarker(
        location=[lat, lon],  # folium expects [latitude, longitude]
        radius=1,
        weight=3,
        color=hex_color,
        fill_color=hex_color,
    ).add_to(m)

In [14]:
# Last expression of the cell: displays the map with the KMeans markers
m

KMeans seems to create nice clusters when the points are close together. However, it also groups together some points that are very far from each other.

## Spectral clustering

In [17]:
from sklearn.cluster import SpectralClustering

# Configure (but do not yet fit) a 60-cluster SpectralClustering estimator;
# random_state is fixed so repeated runs use the same seed.
spectral = SpectralClustering(
    n_clusters=60,
    random_state=42,
)

In [None]:
# Fit the spectral clustering estimator on the Longitude/Latitude frame.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# may never have run to completion — confirm the fit is tractable on this data.
spectral.fit(df_metrics)

In [None]:
# Wrap the per-row spectral cluster ids in their own single-column frame
# (this rebinds `df_labels`, replacing the KMeans labels built earlier)
df_labels = pd.DataFrame({'cluster': spectral.labels_})

# Attach the palette color that corresponds to each cluster id
df_labels['color'] = df_labels['cluster'].apply(set_color)

# Combine coordinates with their cluster id and color (joined on the row index).
# NOTE(review): the variable name says "m_shift" but the labels come from
# SpectralClustering; the name is kept because a later cell reads it.
df_m_shift_clustered = df_metrics.join(df_labels)

In [None]:
# Fresh base map for the spectral result (rebinds `m`); the [lat, lon]
# center and zoom level frame the United States.
m = folium.Map(location=[37.6, -95.665], zoom_start=4)

In [None]:
# Draw one small circle per row on the map `m`, colored by its cluster.
# A plain loop over the three needed columns replaces DataFrame.apply(axis=1):
# apply was used only for its side effect, built a Series for every row plus a
# throwaway result Series, and assigning that result to `_` shadowed IPython's
# last-output variable.
for lat, lon, hex_color in zip(
    df_m_shift_clustered["Latitude"],
    df_m_shift_clustered["Longitude"],
    df_m_shift_clustered["color"],
):
    folium.CircleMarker(
        location=[lat, lon],  # folium expects [latitude, longitude]
        radius=1,
        weight=3,
        color=hex_color,
        fill_color=hex_color,
    ).add_to(m)

In [None]:
# Last expression of the cell: displays the map with the spectral-clustering markers
m