In [1]:
import pandas as pd
from geopy import distance
import numpy as np

# ANALYSIS

In [2]:
# importing the cluster's dataframes
# Both CSV files are read with their first column used as the row index.
# `outliers` and `big_cluster` are the two tables every later cell works on.
outliers = pd.read_csv('../data/outliers.csv', index_col = 0)
big_cluster = pd.read_csv('../data/big_cluster.csv', index_col = 0)

In [3]:
# preview the first rows of the outliers table
outliers.head()

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters
0,2020-03-08,Grand Prix of Qatar,Losail International Circuit,QATAR,25.491,51.452068,-1
1,2020-03-22,OR Thailand Grand Prix,Buriram International Circuit,THAILAND,14.963057,103.085589,-1
2,2020-04-05,Red Bull Grand Prix of The Americas,Circuit Of The Americas,UNITED STATES,30.138715,-97.63641,-1
3,2020-04-19,Gran Premio Motul de la República Argentina,Termas de Río Hondo,ARGENTINA,-27.495925,-64.864078,-1
4,2020-10-18,Motul Grand Prix of Japan,Twin Ring Motegi,JAPAN,36.533647,140.229985,-1


In [4]:
# preview the first rows of the big cluster table (it carries a 'Subcluster' column)
big_cluster.head()

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters,Subcluster
0,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317,0,0
1,2020-05-17,SHARK Helmets Grand Prix de France,Le Mans,FRANCE,48.00735,0.196738,0,2
2,2020-05-31,Gran Premio d'Italia Oakley,Autodromo Internazionale del Mugello,ITALY,43.999382,11.372365,0,1
3,2020-06-07,Gran Premi Monster Energy de Catalunya,Circuit de Barcelona-Catalunya,SPAIN,41.569469,2.258063,0,0
4,2020-06-21,HJC Helmets Motorrad Grand Prix Deutschland,Sachsenring,GERMANY,50.73286,12.475105,0,1


In [5]:
# number of rows (circuits) assigned to each subcluster label
big_cluster['Subcluster'].value_counts()

1    5
0    4
2    3
Name: Subcluster, dtype: int64

#### I will start optimizing internal routes for subclusters.

## Optimizing routes

mlrose reports the length of a route in unspecified 'units', and our coordinates are expressed in degrees of latitude and longitude, so it is better to run mlrose on a matrix of distances previously calculated in km with geopy.

In [6]:
# sorted array of the distinct subcluster labels found in big_cluster
subclusters = np.sort(big_cluster['Subcluster'].unique())

In [7]:
# display the labels; `subclusters` was already sorted when it was created,
# so this second sort does not change the order
np.sort(subclusters)

array([0, 1, 2])

In [8]:
# One dataframe per subcluster, re-indexed from 0. Later cells look circuits up
# in these frames by row label (e.g. sub1.loc[i, 'Circuit']), so the fresh
# 0..n-1 index is what ties a route position back to a circuit.
sub0, sub1, sub2 = (
    big_cluster.loc[big_cluster['Subcluster'] == subclusters[pos]].reset_index(drop=True)
    for pos in range(3)
)

In [9]:
from functions import create_dist_list

In [10]:
# Distance list of every subcluster, built with 'create_dist_list' and keyed by
# the subcluster number (0, 1, 2).
subcluster_dist_dict = {
    label: create_dist_list(frame)
    for label, frame in enumerate((sub0, sub1, sub2))
}

In [11]:
# inspect the result: per subcluster, a list of (i, j, distance) tuples
subcluster_dist_dict

{0: [(0, 1, 906.0416003864943),
  (0, 2, 709.20506184732),
  (0, 3, 576.0433647759187),
  (1, 2, 213.3824218268996),
  (1, 3, 335.7240942715326),
  (2, 3, 179.43584263435042)],
 1: [(0, 1, 753.2093789744465),
  (0, 2, 697.4292716963196),
  (0, 3, 445.16349820143387),
  (0, 4, 105.31854933139327),
  (1, 2, 331.6271130393199),
  (1, 3, 424.56157676462203),
  (1, 4, 752.9087233093135),
  (2, 3, 254.1048870356972),
  (2, 4, 650.6950903273602),
  (3, 4, 396.9993782118908)],
 2: [(0, 1, 710.3446224068731),
  (0, 2, 460.3195563371287),
  (1, 2, 521.1997578348936)]}

In [12]:
# Calculating best routes:
from functions import best_route

# Every subcluster is solved on its own; each entry of routes_dict is the pair
# (visiting order, route length) returned by best_route for that subcluster.
routes_dict = {
    label: tuple(best_route(subcluster_dist_dict[label]))
    for label in subclusters
}



In [13]:
# inspect the result: subcluster label -> (visiting order, route length)
routes_dict

{0: (array([3, 1, 2, 0]), 1834),
 1: (array([1, 0, 4, 3, 2]), 1841),
 2: (array([2, 1, 0]), 1692)}

### At this point, I have the best routes for the circuits inside the 3 subclusters. Now I will calculate the best route for the outliers

In [14]:
# distance list for the outlier circuits, built with the same helper used for
# the subclusters
outliers_dist_list = create_dist_list(outliers)

In [15]:
# inspect the result: a list of (i, j, distance) tuples
outliers_dist_list

[(0, 1, 5486.3451291389265),
 (0, 2, 13027.280851968753),
 (0, 3, 13748.46427144216),
 (0, 4, 8270.183074011635),
 (0, 5, 12038.62218753436),
 (0, 6, 5934.373927069669),
 (0, 7, 4365.819538710465),
 (1, 2, 14545.997308130585),
 (1, 3, 18150.991002447245),
 (1, 4, 4384.353204179663),
 (1, 5, 7369.134897106193),
 (1, 6, 1357.7900737827379),
 (1, 7, 7839.468845562157),
 (2, 3, 7269.066592961213),
 (2, 4, 10479.282482273198),
 (2, 5, 14274.208157616871),
 (2, 6, 15827.81888198896),
 (2, 7, 8726.736191782999),
 (3, 4, 17459.966421628003),
 (3, 5, 12058.974243919298),
 (3, 6, 16923.7296460576),
 (3, 7, 12706.666915492708),
 (4, 5, 8323.338197187517),
 (4, 6, 5435.523367568314),
 (4, 7, 7663.212564868035),
 (5, 6, 6386.1405068187905),
 (5, 7, 15179.42626628677),
 (6, 7, 8943.668223441835)]

In [16]:
# best_route returns a pair; later cells read element [0] as the visiting order
# and element [1] as the route length
outliers_best_route = best_route(outliers_dist_list)

### At this point, I have the best route inside each cluster (treating the outliers as one more cluster), but the lengths obtained are round-trip distances, i.e. for closed routes.

### Now it's time to turn them into open (one-way) routes, calculate the distances from one cluster to the next, and sum everything up.

### To do this, since one circuit in the 'outliers' was initially assigned to the first calculated cluster, I will treat this circuit as the last stop of the first route. From this circuit, I will calculate the distance to the nearest circuit among the 3 subclusters.

### I will follow this method until I have all the circuits connected.

## STAGE 1

In [17]:
# Row label, inside `outliers`, of the first circuit whose DBSCAN cluster is 0;
# this is the circuit wanted as the last stop of stage 1.
in_first_cluster = outliers['DBSCAN_Clusters'] == 0
s1_last = outliers.index[in_first_cluster][0]

In [18]:
# importing needed functions
from functions import reordering_last
from functions import reordering_first
from functions import to_subtract

In [19]:
# applying functions:
# reordering_last and to_subtract come from the local `functions` module, which
# is not shown in this notebook.
# NOTE(review): presumably s1 is the outlier route reordered so that s1_last is
# visited last, and s1_sub is the length of the leg to remove in order to open
# the closed route -- confirm against functions.py.
s1 = reordering_last(outliers_best_route[0], s1_last)
s1_sub = to_subtract(s1, outliers_dist_list)

In [20]:
# calculating the distance considering s1 an open route:
# outliers_best_route[1] is the route length returned by best_route; s1_sub is
# taken off it
s1_dist = outliers_best_route[1] - s1_sub

In [38]:
# The whole-season calendar starts with the stage 1 circuits in visiting order;
# every entry of s1 is a row label of `outliers`.
new_calendar = [outliers.loc[stop, 'Circuit'] for stop in s1]

# Running total of the calendar length, starting with the rounded stage 1 length.
new_length = round(s1_dist)

In [21]:
# Distance from the last point of stage 1 to every circuit of the big cluster;
# the nearest one determines the cluster for the second stage.

# The starting point is the same for every comparison, so it is read only once.
start_coord = (outliers.loc[s1_last, 'Latitude'], outliers.loc[s1_last, 'Longitude'])

# distance in km -> circuit name (zero distances are left out)
checking_dist = {}
for lat, lon, circuit in zip(big_cluster['Latitude'], big_cluster['Longitude'], big_cluster['Circuit']):
    km = distance.distance(start_coord, (lat, lon)).km
    if km > 0:
        checking_dist[km] = circuit

In [22]:
# The stage 1 -> stage 2 transfer is the smallest distance between the last
# point of the 'outliers' cluster and a circuit of the next cluster.
# (Iterating a dict yields its keys, which here are the distances.)
s1_s2_dist = min(checking_dist)

In [40]:
# updating new_length with the rounded stage 1 -> stage 2 transfer distance
new_length += round(s1_s2_dist)

## STAGE 2

In [23]:
# checking the name of the circuit for the next cluster
# (checking_dist maps each distance to a circuit name, so the minimum distance
# gives the nearest circuit)
s2_first_name = checking_dist[s1_s2_dist]

In [24]:
# Subcluster label of that circuit, which identifies the cluster for stage 2
is_s2_entry = big_cluster['Circuit'] == s2_first_name
cluster_s2 = big_cluster.loc[is_s2_entry, 'Subcluster'].values[0]

In [50]:
# should be 1: the following cells use sub1 directly for this stage
print(cluster_s2)

1


In [26]:
# getting the row label, inside sub1, of the first circuit of the second stage
# NOTE: sub1 is hard-coded here, so this is only valid while cluster_s2 == 1
s2_first = sub1.loc[sub1['Circuit'] == s2_first_name].index[0]

In [30]:
# applying functions to reorder and get distance to subtract:
# routes_dict[cluster_s2][0] is the visiting order found for that subcluster.
# NOTE(review): presumably s2 is that order rearranged to start at s2_first,
# and s2_sub is the length of the leg to remove to open the route -- confirm
# against functions.py.
s2 = reordering_first(routes_dict[cluster_s2][0], s2_first)
s2_sub = to_subtract(s2, subcluster_dist_dict[cluster_s2])

In [34]:
# calculating the distance considering s2 an open route:
# routes_dict[cluster_s2][1] is the route length stored for that subcluster
s2_dist = routes_dict[cluster_s2][1] - s2_sub

In [41]:
# Add the stage 2 circuits in visiting order; every entry of s2 is a row label
# of sub1.
new_calendar.extend(sub1.loc[stop, 'Circuit'] for stop in s2)

# Add the rounded stage 2 length to the running total
new_length = new_length + round(s2_dist)

In [44]:
# Distance from the last point of stage 2 to every circuit outside subcluster 1;
# the nearest one determines the cluster for the third stage.

# Last stop of stage 2 (s2 holds row labels of sub1); read only once because it
# is the same for every comparison.
stage_end = s2[-1]
start_coord = (sub1.loc[stage_end, 'Latitude'], sub1.loc[stage_end, 'Longitude'])

# Candidate circuits: everything in the big cluster except subcluster 1
candidates = big_cluster.loc[big_cluster['Subcluster'] != 1]

# distance in km -> circuit name (zero distances are left out)
checking_dist = {}
for lat, lon, circuit in zip(candidates['Latitude'], candidates['Longitude'], candidates['Circuit']):
    km = distance.distance(start_coord, (lat, lon)).km
    if km > 0:
        checking_dist[km] = circuit

In [46]:
# The stage 2 -> stage 3 transfer is the smallest distance between the last
# point of cluster 'sub1' and a circuit of the next cluster.
# (Iterating a dict yields its keys, which here are the distances.)
s2_s3_dist = min(checking_dist)

In [48]:
# updating new_length with the rounded stage 2 -> stage 3 transfer distance
new_length += round(s2_s3_dist)

## STAGE 3

In [51]:
# Name of the circuit nearest to the end of stage 2: the entry point of stage 3
s3_first_name = checking_dist[s2_s3_dist]

# Subcluster label of that circuit, which identifies the cluster for stage 3
is_s3_entry = big_cluster['Circuit'] == s3_first_name
cluster_s3 = big_cluster.loc[is_s3_entry, 'Subcluster'].values[0]

print(cluster_s3)

2


In [52]:
# getting the row label, inside sub2, of the first circuit of the third stage
# NOTE: sub2 is hard-coded here, so this is only valid while cluster_s3 == 2
s3_first = sub2.loc[sub2['Circuit'] == s3_first_name].index[0]

# applying functions to reorder and get distance to subtract:
# NOTE(review): presumably s3 is the stored visiting order rearranged to start
# at s3_first -- confirm against functions.py.
s3 = reordering_first(routes_dict[cluster_s3][0], s3_first)
s3_sub = to_subtract(s3, subcluster_dist_dict[cluster_s3])

# calculating the distance considering s3 an open route:
s3_dist = routes_dict[cluster_s3][1] - s3_sub

In [54]:
# Add the stage 3 circuits in visiting order; every entry of s3 is a row label
# of sub2.
new_calendar.extend(sub2.loc[stop, 'Circuit'] for stop in s3)

# Add the rounded stage 3 length to the running total
new_length = new_length + round(s3_dist)

In [56]:
# calculating the shortest distance from last point of stage 3 to the last cluster

# Stage 3 runs over sub2 and its stops are listed in s3 (row labels of sub2),
# so the starting point of this transfer must be looked up with s3[-1].
# The starting point is the same for every comparison, so it is read only once.
start_lat = sub2.loc[s3[-1],'Latitude']
start_lon = sub2.loc[s3[-1],'Longitude']
start_coord = (start_lat, start_lon)

# distance in km -> circuit name (zero distances are left out)
checking_dist = {}

# only subcluster 0 is left to visit at this point
for idx in big_cluster.loc[big_cluster['Subcluster'] == 0].index:
    i_lat = big_cluster.loc[idx,'Latitude']
    i_lon = big_cluster.loc[idx,'Longitude']
    i_coord = (i_lat, i_lon)

    dist = distance.distance(start_coord, i_coord).km
    if dist > 0:
        checking_dist[dist] = big_cluster.loc[idx,'Circuit']

In [58]:
# The stage 3 -> stage 4 transfer is the smallest distance between the last
# point of cluster 'sub2' and a circuit of the last cluster.
# (Iterating a dict yields its keys, which here are the distances.)
s3_s4_dist = min(checking_dist)

# updating new_length
new_length = new_length + round(s3_s4_dist)

## STAGE 4

In [60]:
# Name of the circuit nearest to the end of stage 3: the entry point of the
# last stage
s4_first_name = checking_dist[s3_s4_dist]

# Subcluster label of that circuit, which identifies the cluster for stage 4
is_s4_entry = big_cluster['Circuit'] == s4_first_name
cluster_s4 = big_cluster.loc[is_s4_entry, 'Subcluster'].values[0]

print(cluster_s4) # should be 0!

0


In [61]:
# getting the row label, inside sub0, of the first circuit of the last stage
# NOTE: sub0 is hard-coded here, so this is only valid while cluster_s4 == 0
s4_first = sub0.loc[sub0['Circuit'] == s4_first_name].index[0]

# applying functions to reorder and get distance to subtract:
# NOTE(review): presumably s4 is the stored visiting order rearranged to start
# at s4_first -- confirm against functions.py.
s4 = reordering_first(routes_dict[cluster_s4][0], s4_first)
s4_sub = to_subtract(s4, subcluster_dist_dict[cluster_s4])

# calculating the distance considering s4 an open route:
s4_dist = routes_dict[cluster_s4][1] - s4_sub

In [62]:
# Add the stage 4 circuits in visiting order; every entry of s4 is a row label
# of sub0.
new_calendar.extend(sub0.loc[stop, 'Circuit'] for stop in s4)

# Add the rounded stage 4 length to the running total
new_length = new_length + round(s4_dist)

## LAST CHECKS

In [68]:
# sanity check: the length of new_calendar should be = 20
# (the cell displays True when the check holds)
len(new_calendar) == 20

True

In [67]:
# sanity check: the values of new_calendar should be all unique
# (a set drops duplicates, so equal lengths mean no circuit appears twice)
len(new_calendar) == len(set(new_calendar))

True

In [69]:
# sanity check: the new distance should be lower than the first calculated: 84594 km
# (84594 is a figure obtained outside this notebook and typed in by hand)
new_length < 84594

True

# FINAL RESULTS

## NEW CALENDAR

In [63]:
# print the calendar, one circuit per line, in the order the stages were appended
print('The optimized calendar should be:')
for item in new_calendar:
    print(item)

The optimized calendar should be:
Circuit Of The Americas
Termas de Río Hondo
Phillip Island
Twin Ring Motegi
Buriram International Circuit
Sepang International Circuit
Losail International Circuit
KymiRing
Sachsenring
Autodromo Internazionale del Mugello
Misano World Circuit Marco Simoncelli
Red Bull Ring - Spielberg
Automotodrom Brno
TT Circuit Assen
Le Mans
Silverstone Circuit
Circuit de Barcelona-Catalunya
MotorLand Aragón
Circuito de Jerez
Circuit Ricardo Tormo


## NEW DISTANCE

In [70]:
# new_length is the sum of the rounded stage lengths and the rounded transfers
# between stages
print('The total distance for the new calendar is:', new_length)

The total distance for the new calendar is: 51300
