In [31]:
import pandas as pd
from geopy import distance

# ANALYSIS

In [None]:
"""
Travelling Salesman Problem has (n-1)! / 2 solutions. For 20 races, that means 6,08e+16
solutions.

First approach: clustering.
"""

## The current calendar

In [36]:
circuits = pd.read_csv('../data/races_cleaned.csv', index_col = 0)

In [37]:
circuits.head()

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude
0,2020-03-08,Grand Prix of Qatar,Losail International Circuit,QATAR,25.491,51.452068
1,2020-03-22,OR Thailand Grand Prix,Chang International Circuit,THAILAND,2.760191,101.736859
2,2020-04-05,Red Bull Grand Prix of The Americas,Circuit Of The Americas,UNITED STATES,30.138715,-97.63641
3,2020-04-19,Gran Premio Motul de la República Argentina,Termas de Río Hondo,ARGENTINA,-27.495926,-64.864078
4,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317


In [34]:
# Now I can calculate the current distance:
current_distance = 0

for index in circuits.index[1:]:
    coord0 = (circuits.loc[index-1, 'Latitude'], circuits.loc[index-1, 'Longitude'])
    coord1 = (circuits.loc[index, 'Latitude'], circuits.loc[index, 'Longitude'])
    current_distance += round(distance.distance(coord0, coord1).km)

In [35]:
print('The current distance for the whole calendar is:', current_distance, 'km')

The current distance for the whole calendar is: 86324 km


#### Current distance is more than 86k km. The objective is minimize this distance

## First approach. Clustering Circuits

I will use Unsupervised Learning algorithms to cluster the circuits, and then optimizing the distance between clusters.

### Clustering with KMeans
Looking ath the map, seems obvious to cluster the circuits by something similar to continents, so I will look for 3 clusters

In [117]:
# first of all I need to create a list of lists with the coordinates:
coordinates = []
for index in circuits.index:
    coord_list = [circuits.loc[index,'Latitude'], circuits.loc[index,'Longitude']]
    coordinates.append(coord_list)

In [118]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
circuits_clusters = kmeans.fit(coordinates)
circuits['KM_Clusters'] = circuits_clusters.labels_

In [119]:
circuits.groupby(['KM_Clusters', 'Country']).size()

KM_Clusters  Country       
0            ARGENTINA         1
             UNITED STATES     1
1            AUSTRIA           1
             CZECH REPUBLIC    1
             FINLAND           1
             FRANCE            1
             GERMANY           1
             GREAT BRITAIN     1
             ITALY             2
             NETHERLANDS       1
             QATAR             1
             SPAIN             4
2            AUSTRALIA         1
             JAPAN             1
             MALAYSIA          1
             THAILAND          1
dtype: int64

In [120]:
cluster_centers = circuits_clusters.cluster_centers_

In [121]:
cluster0 = (cluster_centers[0][0], cluster_centers[0][1])
cluster1 = (cluster_centers[1][0], cluster_centers[1][1])
cluster2 = (cluster_centers[2][0], cluster_centers[2][1])

In [122]:
dist_0_1 = round(distance.distance(cluster0, cluster1).km)
dist_1_2 = round(distance.distance(cluster1, cluster2).km)
dist_2_0 = round(distance.distance(cluster2, cluster0).km)

In [123]:
print('Distance from cluster 0 to cluster 1 is:', dist_0_1, 'km')
print('Distance from cluster 1 to cluster 2 is:', dist_0_1, 'km')
print('Distance from cluster 2 to cluster 0 is:', dist_0_1, 'km')

Distance from cluster 0 to cluster 1 is: 10042 km
Distance from cluster 1 to cluster 2 is: 10042 km
Distance from cluster 2 to cluster 0 is: 10042 km


#### The clustering obtained makes sense regarding the countries in each cluster

### Clustering with DBSCAN
Let's check if DBSCAN gives other clustering

In [87]:
# As the latitude goes from -90º to 90º and the longitude from -180º to 180º, first of
# all I will scale the coordinates to give them the same weight.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
coord_scale = scaler.fit_transform(coordinates)

In [98]:
coord_scale

array([[-0.24846584,  0.47539751],
       [-1.11898273,  1.33538315],
       [-0.07047329, -2.07435859],
       [-2.27769475, -1.51387629],
       [ 0.18059019, -0.50983846],
       [ 0.613838  , -0.4011866 ],
       [ 0.46034573, -0.21005766],
       [ 0.36728784, -0.3659332 ],
       [ 0.7182163 , -0.19119827],
       [ 0.80349503, -0.29290789],
       [ 1.10686416,  0.04829759],
       [ 0.65990494, -0.12320698],
       [ 0.58382058, -0.15211336],
       [ 0.76949383, -0.42189812],
       [ 0.45892218, -0.18762495],
       [ 0.3485146 , -0.40808144],
       [ 0.17443205,  1.9937042 ],
       [-2.69898859,  2.07936847],
       [-1.11898273,  1.33538315],
       [ 0.2878625 , -0.41525226]])

In [105]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5)
circuits_dbscan = dbscan.fit(coord_scale)
circuits['DBSCAN_Clusters'] = circuits_dbscan.labels_

In [106]:
circuits.groupby(['DBSCAN_Clusters', 'Country']).size()

DBSCAN_Clusters  Country       
-1               ARGENTINA         1
                 AUSTRALIA         1
                 JAPAN             1
                 MALAYSIA          1
                 QATAR             1
                 THAILAND          1
                 UNITED STATES     1
 0               AUSTRIA           1
                 CZECH REPUBLIC    1
                 FINLAND           1
                 FRANCE            1
                 GERMANY           1
                 GREAT BRITAIN     1
                 ITALY             2
                 NETHERLANDS       1
                 SPAIN             4
dtype: int64