<a href="https://colab.research.google.com/github/IanGZC/Diplomado-/blob/main/target_locations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import DBSCAN

In [None]:
# Path of the locations CSV (lives under /content in this runtime).
DATA_PATH = '/content/target-locations (1).csv'

# Load the table and print its schema and non-null counts.
d = pd.read_csv(DATA_PATH)
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1854 entries, 0 to 1853
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       1854 non-null   object 
 1   latitude   1854 non-null   float64
 2   longitude  1854 non-null   float64
 3   address    1854 non-null   object 
 4   phone      1854 non-null   object 
 5   website    1854 non-null   object 
dtypes: float64(2), object(4)
memory usage: 87.0+ KB


In [None]:
# Drop the descriptive text columns so only the remaining columns feed the clustering.
text_columns = ['name', 'address', 'phone', 'website']
df = d.drop(labels=text_columns, axis='columns')
df

Unnamed: 0,latitude,longitude
0,33.224895,-86.803977
1,32.607518,-85.482037
2,33.334428,-86.990326
3,30.603217,-87.896507
4,34.559064,-86.971030
...,...,...
1849,43.034537,-88.177573
1850,42.989740,-88.259200
1851,42.847131,-106.264667
1852,41.161666,-104.799678


In [None]:
# Plot every location on an OpenStreetMap base layer to inspect the spatial distribution.
map_options = dict(
    lat='latitude',
    lon='longitude',
    mapbox_style='open-street-map',
    zoom=1,
)
fig = px.scatter_mapbox(df, **map_options)
fig.update_layout(autosize=True)
fig.show()

In [None]:
# Elbow method: fit K-Means for K = 1..10 and record each model's inertia (WCSS).
X = df.copy()
k_values = list(range(1, 11))
wcss = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X).inertia_
    for k in k_values
]

# Line chart of WCSS against K.
elbow_trace = go.Scatter(x=k_values, y=wcss, mode='lines+markers', name='WCSS')
fig = go.Figure(data=[elbow_trace])
fig.update_layout(
    title='Elbow Method for Optimal K',
    xaxis_title='Number of Clusters (K)',
    yaxis_title='WCSS',
    xaxis=dict(tickmode='linear', tickvals=k_values),
    yaxis=dict(rangemode='tozero'),
    template='plotly_white',
)
fig.show()

In [None]:
# Silhouette analysis: score the K-Means labelling for K = 2..10
# (the loop starts at 2, unlike the elbow loop).
candidate_ks = list(range(2, 11))
silhouette_scores = [
    silhouette_score(
        X,
        KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X),
    )
    for k in candidate_ks
]

# Line chart of the silhouette score against K.
score_trace = go.Scatter(
    x=candidate_ks,
    y=silhouette_scores,
    mode='lines+markers',
    name='Silhouette Score',
)
fig = go.Figure(data=[score_trace])
fig.update_layout(
    title='Silhouette Score for Optimal K',
    xaxis_title='Number of Clusters (K)',
    yaxis_title='Silhouette Score',
    xaxis=dict(tickmode='linear', tickvals=candidate_ks),
    yaxis=dict(rangemode='tozero'),
    template='plotly_white',
)
fig.show()

In [None]:
# Fit K-Means with three clusters and attach the resulting label to each location.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X)
tar_c3 = df.assign(cluster=kmeans.labels_)

# Map the locations coloured by their K-Means cluster.
fig = px.scatter_mapbox(
    tar_c3,
    lat='latitude',
    lon='longitude',
    color='cluster',
    mapbox_style='open-street-map',
    zoom=1,
)
fig.update_layout(autosize=True)
fig.show()

In [None]:
# First DBSCAN pass over all locations. X holds raw latitude/longitude values,
# so eps is expressed in those same units.
dbscan = DBSCAN(eps=0.2, min_samples=5)
clusters = dbscan.fit_predict(X)

# Store the labels as strings so the map colours them as discrete categories.
tar_dbs = df.assign(cluster=clusters).astype({'cluster': str})

fig = px.scatter_mapbox(
    tar_dbs,
    lon='longitude',
    lat='latitude',
    color='cluster',
    template='plotly_white',
    mapbox_style='open-street-map',
    zoom=2,
    title='DBSCAN Clustering',
)
fig.show()

In [None]:
# Distinct values per column; the 'cluster' count covers every DBSCAN label, the '-1' label included.
tar_dbs.nunique()

latitude     1838
longitude    1838
cluster        65
dtype: int64

In [None]:
# Rows the first DBSCAN pass labelled '-1'; they are re-clustered in a later cell.
noise_mask = tar_dbs['cluster'].eq('-1')
tar2 = tar_dbs[noise_mask].copy()
tar2

Unnamed: 0,latitude,longitude,cluster
1,32.607518,-85.482037,-1
3,30.603217,-87.896507,-1
4,34.559064,-86.971030,-1
5,31.266492,-85.446808,-1
6,34.842130,-87.634764,-1
...,...,...,...
1843,44.490590,-88.070666,-1
1844,44.465351,-87.985999,-1
1851,42.847131,-106.264667,-1
1852,41.161666,-104.799678,-1


In [None]:
# Keep only the rows that received a real cluster label (everything except '-1').
dbs = tar_dbs[tar_dbs['cluster'].ne('-1')]

fig = px.scatter_mapbox(
    dbs,
    lon='longitude',
    lat='latitude',
    color='cluster',
    template='plotly_white',
    mapbox_style='open-street-map',
    zoom=2,
    title='DBSCAN Clustering',
)
fig.show()

In [None]:
# Centroid of each cluster: the mean latitude and longitude of its members.
centroids = dbs.groupby('cluster', as_index=False).mean()
centroids

Unnamed: 0,cluster,latitude,longitude
0,0,33.432143,-86.796650
1,1,33.482313,-111.990942
2,10,41.278187,-73.058871
3,11,41.700384,-72.773878
4,12,28.537293,-81.426823
...,...,...,...
59,62,43.018276,-88.049326
60,63,43.085936,-89.405192
61,7,32.872991,-117.106577
62,8,36.817521,-119.770044


In [None]:
# Scatter of the first-pass clusters (one trace per cluster) with the centroids drawn on top.
fig = go.Figure()

# Cluster labels are strings; sort on their integer value so the legend reads
# 0, 1, 2, ... instead of the lexicographic 0, 1, 10, 11, ...
for k in sorted(dbs.cluster.unique(), key=int):
    members = dbs[dbs.cluster == k]  # subset once, reuse for both axes
    fig.add_trace(go.Scatter(
        x=members.longitude,
        y=members.latitude,
        mode='markers',
        name=f'Cluster {k}',
    ))

# Centroids are added last so they render above the cluster points.
fig.add_trace(go.Scatter(
    x=centroids.longitude,
    y=centroids.latitude,
    mode='markers',
    name='Centroids',
    marker_color='Black',
    marker_size=2,
))
fig.update_layout(
    template='plotly_white'
)
fig.show()

In [None]:
# Second DBSCAN pass: re-cluster only the rows left in tar2, using a larger eps.
Z = tar2.drop(columns=['cluster'])
dbscann = DBSCAN(eps=1, min_samples=5)
clusterss = dbscann.fit_predict(Z)

# Store the labels as strings so the map colours them as discrete categories.
tar_dbss = Z.assign(cluster=clusterss).astype({'cluster': str})

fig = px.scatter_mapbox(
    tar_dbss,
    lon='longitude',
    lat='latitude',
    color='cluster',
    template='plotly_white',
    mapbox_style='open-street-map',
    zoom=1.5,
    title='DBSCAN Clustering',
)
fig.show()

In [None]:
# Distinct values per column for the second pass; the 'cluster' count covers every label, '-1' included.
tar_dbss.nunique()

latitude     451
longitude    451
cluster       31
dtype: int64

In [None]:
# Centroid of each second-pass cluster: the mean latitude and longitude of its members.
# Rows labelled '-1' are points DBSCAN left unclustered, so they are excluded;
# their mean is not the centre of any cluster.
clustered_rows = tar_dbss[tar_dbss['cluster'] != '-1']
centroidss = clustered_rows.groupby('cluster', as_index=False).mean()
centroidss

Unnamed: 0,cluster,latitude,longitude
0,-1,38.029708,-99.522563
1,0,34.187591,-83.719433
2,1,30.452517,-87.261513
3,10,28.249369,-81.58594
4,11,40.856471,-90.116846
5,12,41.576556,-84.733796
6,13,39.196857,-94.744436
7,14,30.192545,-90.600997
8,15,44.699992,-93.237899
9,16,31.889606,-106.455534


In [None]:
# Rows the second DBSCAN pass still labelled '-1'; a later cell clusters them once more.
tar3=pd.DataFrame(tar_dbss.loc[tar_dbss['cluster']=='-1'])

In [None]:
# Scatter of the second-pass clusters (one trace per cluster) with their centroids on top.
# Filter into new names instead of rebinding tar_dbss: the cell that builds tar3 reads
# the '-1' rows of tar_dbss, and would return an empty frame if re-run after an in-place drop.
clustered_pts = tar_dbss[tar_dbss['cluster'] != '-1']
# The '-1' group is unclustered noise, so its mean must not be drawn as a centroid.
cluster_centroids = centroidss[centroidss['cluster'] != '-1']

fig = go.Figure()

# Labels are strings; sort on their integer value so the legend reads 0, 1, 2, ...
for k in sorted(clustered_pts.cluster.unique(), key=int):
    members = clustered_pts[clustered_pts.cluster == k]  # subset once, reuse for both axes
    fig.add_trace(go.Scatter(
        x=members.longitude,
        y=members.latitude,
        mode='markers',
        name=f'Cluster {k}',
    ))

# Centroids are added last so they render above the cluster points.
fig.add_trace(go.Scatter(
    x=cluster_centroids.longitude,
    y=cluster_centroids.latitude,
    mode='markers',
    name='Centroids',
    marker_color='Black',
    marker_size=4,
))
fig.update_layout(
    template='plotly_white'
)
fig.show()

In [None]:
# Third DBSCAN pass over the rows the second pass left labelled '-1'.
J = tar3.drop(columns=['cluster'])
dbscannn = DBSCAN(eps=0.5, min_samples=5)
clustersss = dbscannn.fit_predict(J)

# Store the labels as strings so the map colours them as discrete categories.
ta = J.assign(cluster=clustersss).astype({'cluster': str})

fig = px.scatter_mapbox(
    ta,
    lon='longitude',
    lat='latitude',
    color='cluster',
    template='plotly_white',
    mapbox_style='open-street-map',
    zoom=1.5,
    title='DBSCAN Clustering',
)
fig.show()

Conclusiones:

DBSCAN resultó ser un método de gran utilidad dada la forma de la distribución de los puntos en el mapa. En cambio, se observa que K-Means no es un método útil para esta aplicación, tanto por la manera en que distribuye los clústeres como por el requerimiento computacional que implicaría usar 50 clústeres (por el número de estados en EUA) o más. Sin embargo, se observó que DBSCAN omitió muchos puntos, por lo que se consideró conveniente aplicarlo nuevamente a los puntos que el método marcó como -1. Esto se realizó dos veces y, a la tercera, pese a variar el épsilon, el número de clústeres se mantuvo en 1. Además, para el primer agrupamiento resultó más conveniente usar un épsilon menor que para el segundo, donde se usó uno mucho mayor.

Por último, se consideró que el mejor punto de distribución estaría en el centroide de cada clúster, es decir, en el promedio de las longitudes y latitudes de todos los puntos que le corresponden. Sin embargo, esto resulta dudoso, ya que la distancia real entre puntos puede ser mucho mayor de lo que aparenta en el mapa. Finalmente, se debe considerar que Alaska y las islas que están en el mar parecen no tener mucha demanda, por lo que probablemente sería conveniente agregar en ellas puntos de distribución pequeños.