In [2]:
import pandas as pd
from geopy import distance

# ANALYSIS

## The current calendar

In [3]:
# importing the DataFrame with the circuits, dates and coordinates
circuits = pd.read_csv('../data/races_cleaned.csv', index_col = 0)

In [4]:
circuits.head()

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude
0,2020-03-08,Grand Prix of Qatar,Losail International Circuit,QATAR,25.491,51.452068
1,2020-03-22,OR Thailand Grand Prix,Buriram International Circuit,THAILAND,14.963057,103.085589
2,2020-04-05,Red Bull Grand Prix of The Americas,Circuit Of The Americas,UNITED STATES,30.138715,-97.63641
3,2020-04-19,Gran Premio Motul de la República Argentina,Termas de Río Hondo,ARGENTINA,-27.495926,-64.864078
4,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317


In [6]:
# Calculating the current distance. From one circuit to the previous one, and adding up
# each distance. This is a non-round distance:
current_distance = 0

for index in circuits.index[1:]:
    coord0 = (circuits.loc[index-1, 'Latitude'], circuits.loc[index-1, 'Longitude'])
    coord1 = (circuits.loc[index, 'Latitude'], circuits.loc[index, 'Longitude'])
    current_distance += round(distance.distance(coord0, coord1).km)

In [7]:
print('The current distance for the whole calendar is:', current_distance, 'km')

The current distance for the whole calendar is: 84594 km


#### Current distance is more than 84k km. The objective is reduce this distance

## Clustering Circuits

I will use Unsupervised Learning algorithms to cluster the circuits, and then optimizing the distance between clusters.

### Clustering with DBSCAN
The first clustering will be using DBSCAN, as I don't want to force any number of clusters.

In [8]:
# first of all I need to create a list of lists with the coordinates:

coordinates = []
for index in circuits.index:
    coord_list = [circuits.loc[index,'Latitude'], circuits.loc[index,'Longitude']]
    coordinates.append(coord_list)

In [9]:
# As the latitude goes from -90º to 90º and the longitude from -180º to 180º, first of
# all I will scale the coordinates to give them the same weight.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
coord_scale = scaler.fit_transform(coordinates)

In [10]:
coord_scale

array([[-0.27771554,  0.47350951],
       [-0.68962793,  1.35519357],
       [-0.09587076, -2.07229662],
       [-2.35086236, -1.5126826 ],
       [ 0.16062641, -0.51020018],
       [ 0.60325085, -0.40171664],
       [ 0.44643659, -0.21088379],
       [ 0.35136469, -0.36651784],
       [ 0.70988817, -0.19205361],
       [ 0.79701256, -0.29360566],
       [ 1.10694741,  0.04707124],
       [ 0.6503148 , -0.12416765],
       [ 0.57258376, -0.15302925],
       [ 0.76227548, -0.42239607],
       [ 0.44498223, -0.18848582],
       [ 0.33218514, -0.40860079],
       [ 0.15433498,  1.98946412],
       [-2.78127411,  2.07499567],
       [-1.16707273,  1.33216291],
       [ 0.27022037, -0.4157605 ]])

In [11]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5)
circuits_dbscan = dbscan.fit(coord_scale)
circuits['DBSCAN_Clusters'] = circuits_dbscan.labels_

In [12]:
circuits.groupby(['DBSCAN_Clusters']).size()

DBSCAN_Clusters
-1     7
 0    13
dtype: int64

#### There is a cluster with 13 items and 7 outliers

In [12]:
circuits.groupby(['DBSCAN_Clusters', 'Country']).size()

DBSCAN_Clusters  Country       
-1               ARGENTINA         1
                 AUSTRALIA         1
                 JAPAN             1
                 MALAYSIA          1
                 QATAR             1
                 THAILAND          1
                 UNITED STATES     1
 0               AUSTRIA           1
                 CZECH REPUBLIC    1
                 FINLAND           1
                 FRANCE            1
                 GERMANY           1
                 GREAT BRITAIN     1
                 ITALY             2
                 NETHERLANDS       1
                 SPAIN             4
dtype: int64

#### The distribuiton makes sense, as the outliers are far from each other while the items in the cluster are pretty close.

#### For further analysis, I will split them into two new dataframes according to the first clustering.

In [15]:
outliers = circuits.query('DBSCAN_Clusters == -1').reset_index(drop = True)

In [16]:
outliers

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters
0,2020-03-08,Grand Prix of Qatar,Losail International Circuit,QATAR,25.491,51.452068,-1
1,2020-03-22,OR Thailand Grand Prix,Buriram International Circuit,THAILAND,14.963057,103.085589,-1
2,2020-04-05,Red Bull Grand Prix of The Americas,Circuit Of The Americas,UNITED STATES,30.138715,-97.63641,-1
3,2020-04-19,Gran Premio Motul de la República Argentina,Termas de Río Hondo,ARGENTINA,-27.495926,-64.864078,-1
4,2020-10-18,Motul Grand Prix of Japan,Twin Ring Motegi,JAPAN,36.533647,140.229985,-1
5,2020-10-25,Australian Motorcycle Grand Prix,Phillip Island,AUSTRALIA,-38.496688,145.238917,-1
6,2020-11-01,Shell Malaysia Motorcycle Grand Prix,Sepang International Circuit,MALAYSIA,2.760191,101.736859,-1


In [17]:
big_cluster = circuits.query('DBSCAN_Clusters == 0').reset_index(drop = True)

In [18]:
big_cluster

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters
0,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317,0
1,2020-05-17,SHARK Helmets Grand Prix de France,Le Mans,FRANCE,48.00735,0.196738,0
2,2020-05-31,Gran Premio d'Italia Oakley,Autodromo Internazionale del Mugello,ITALY,43.999382,11.372365,0
3,2020-06-07,Gran Premi Monster Energy de Catalunya,Circuit de Barcelona-Catalunya,SPAIN,41.569469,2.258063,0
4,2020-06-21,HJC Helmets Motorrad Grand Prix Deutschland,Sachsenring,GERMANY,50.73286,12.475105,0
5,2020-06-28,Motul TT Assen,TT Circuit Assen,NETHERLANDS,52.959646,6.527974,0
6,2020-07-12,Finland Grand Prix,KymiRing,FINLAND,60.881176,26.478826,0
7,2020-08-09,Monster Energy Grand Prix České republiky,Automotodrom Brno,CZECH REPUBLIC,49.210243,16.450668,0
8,2020-08-16,myWorld Motorrad Grand Prix von Österreich,Red Bull Ring - Spielberg,AUSTRIA,47.223539,14.760464,0
9,2020-08-30,British Grand Prix,Silverstone Circuit,GREAT BRITAIN,52.071812,-1.014299,0


### Clustering with KMeans
In order to small calculations of distances, I will split the cluster with 13 items into several clusters of max 5 items each. This will simplify the optimizing calculation for the routes between the circuits into the cluster.

In [19]:
# creating the coordinates for circuits in big_cluster

cluster_coord = []
for index in big_cluster.index:
    coord_list = [big_cluster.loc[index,'Latitude'], big_cluster.loc[index,'Longitude']]
    cluster_coord.append(coord_list)

In [20]:
cluster_coord

[[36.69444715, -6.15631689958845],
 [48.00734979999999, 0.1967379],
 [43.99938220000001, 11.3723647068196],
 [41.56946855, 2.25806310666666],
 [50.7328604, 12.4751047884026],
 [52.95964605, 6.5279741969904395],
 [60.88117595, 26.478826152075502],
 [49.2102429, 16.4506683],
 [47.223539200000005, 14.7604645],
 [52.071811600000004, -1.01429912415686],
 [43.9622107, 12.6840429850135],
 [41.07926445, -0.206414553372675],
 [39.4955257, -0.6257045]]

In [21]:
# after different attemps, to get a max of 5 items per cluster, I need 4 clusters
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4)
small_clusters = kmeans.fit(cluster_coord)
small_clusters.labels_

array([1, 3, 0, 1, 0, 3, 2, 0, 0, 3, 0, 1, 1], dtype=int32)

In [22]:
# creating a new column to write the code of subcluster
big_cluster['Subcluster'] = small_clusters.labels_

In [23]:
big_cluster

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters,Subcluster
0,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317,0,1
1,2020-05-17,SHARK Helmets Grand Prix de France,Le Mans,FRANCE,48.00735,0.196738,0,3
2,2020-05-31,Gran Premio d'Italia Oakley,Autodromo Internazionale del Mugello,ITALY,43.999382,11.372365,0,0
3,2020-06-07,Gran Premi Monster Energy de Catalunya,Circuit de Barcelona-Catalunya,SPAIN,41.569469,2.258063,0,1
4,2020-06-21,HJC Helmets Motorrad Grand Prix Deutschland,Sachsenring,GERMANY,50.73286,12.475105,0,0
5,2020-06-28,Motul TT Assen,TT Circuit Assen,NETHERLANDS,52.959646,6.527974,0,3
6,2020-07-12,Finland Grand Prix,KymiRing,FINLAND,60.881176,26.478826,0,2
7,2020-08-09,Monster Energy Grand Prix České republiky,Automotodrom Brno,CZECH REPUBLIC,49.210243,16.450668,0,0
8,2020-08-16,myWorld Motorrad Grand Prix von Österreich,Red Bull Ring - Spielberg,AUSTRIA,47.223539,14.760464,0,0
9,2020-08-30,British Grand Prix,Silverstone Circuit,GREAT BRITAIN,52.071812,-1.014299,0,3


#### I will start optimizing internal routes for subclusters 2 (5 circuits), 1 (4 circuits) and 3 (3 circuits).

## Optimizing routes

In [21]:
import mlrose



As the result of applying mlrose is in 'units' and the units of our coordinates are degrees of Latitude and Longitude, it is better to apply mlrose with a matrix of distances previously calculated in km with geopy.

### Subcluster 2

In [22]:
# creating a list with distances between circuits:
subcluster2 = circuits.query('Subcluster == 2').reset_index(drop=True)

dist_list2 = []

for i in subcluster2.index:
    for j in subcluster2.index:
        if i < j: 
            coord_i = (subcluster2.loc[i,'Latitude'],subcluster2.loc[i,'Longitude'])
            coord_j = (subcluster2.loc[j,'Latitude'],subcluster2.loc[j,'Longitude'])

            dist = distance.distance(coord_i, coord_j).km

            dist_list2.append((i, j, dist))

In [23]:
dist_list2

[]

In [24]:
# defining a function to apply all the steps for mlrose

import math

def route_calc(dist_list):
    # Initialize fitness function object using coords_sub0
    fitness_coords = mlrose.TravellingSales(distances = dist_list)
    
    # Calculating the number of circuits depending on the number of distances in list
    # The num of distances is equal to n * (n-1) / 2, being n the number of circuits.
    # we want to find n having the num of distances, this is a quadratic function:
    # x**2 - x - 2y = 0, being x = n and y = num of distances.
    # Solving the equation with math library:
    
    a = 1
    b = -1
    c = -2*len(dist_list)
    # calculate the discriminant
    d = (b**2) - (4*a*c)
    # find two solutions
    sol1 = (-b-math.sqrt(d))/(2*a)
    sol2 = (-b+math.sqrt(d))/(2*a)
    
    # assigning the positive solution to length, needed for the algorithm
    length = max(sol1, sol2)
    
    # Define optimization problem object
    problem_fit = mlrose.TSPOpt(length = length, fitness_fn = fitness_coords, 
                                maximize = False)
    
    # Solve using genetic algorithm - attempt 1
    best_state1, best_fitness1 = mlrose.genetic_alg(problem_fit, random_state = 2)
    
    # Solve using genetic algorithm - attempt 2
    best_state2, best_fitness2 = mlrose.genetic_alg(problem_fit, mutation_prob = 0.2, 
                                                  max_attempts = 100, random_state = 2)
    
    if best_fitness1 < best_fitness2:
        return best_state1, round(best_fitness1)
    else:
        return best_state2, round(best_fitness2)

In [25]:
best_order2, round_dist2 = route_calc(dist_list2)

ValueError: not enough values to unpack (expected 3, got 0)

In [None]:
best_order2

In [None]:
# creating the route with circuit names:
best_route2 = []
for i in best_order2:
    name = subcluster2.loc[i, 'Circuit']
    best_route2.append(name)

In [None]:
best_route2

#### This is a round route. When calculating the distance for the whole calendar, I will remove one step. By now, I will append the distance to the list.

In [None]:
best_route2.append(round(round_dist2))

In [None]:
best_route2

### Subcluster 1

In [None]:
# creating a list with distances between circuits:
subcluster1 = circuits.query('Subcluster == 1').reset_index(drop=True)

dist_list1 = []

for i in subcluster1.index:
    for j in subcluster1.index:
        if i < j: 
            coord_i = (subcluster1.loc[i,'Latitude'],subcluster1.loc[i,'Longitude'])
            coord_j = (subcluster1.loc[j,'Latitude'],subcluster1.loc[j,'Longitude'])

            dist = distance.distance(coord_i, coord_j).km

            dist_list1.append((i, j, dist))

In [None]:
dist_list1

In [None]:
# applying the function
best_order1, round_dist1 = route_calc(dist_list1)

In [None]:
best_order1

In [None]:
# creating the route with circuit names:
best_route1 = []
for i in best_order1:
    name = subcluster1.loc[i, 'Circuit']
    best_route1.append(name)

In [None]:
best_route1

In [None]:
# appending the distance
best_route1.append(round(round_dist1))

In [None]:
best_route1

### Subcluster 3

In [None]:
# creating a list with distances between circuits:
subcluster3 = circuits.query('Subcluster == 3').reset_index(drop=True)

dist_list3 = []

for i in subcluster3.index:
    for j in subcluster3.index:
        if i < j: 
            coord_i = (subcluster3.loc[i,'Latitude'],subcluster3.loc[i,'Longitude'])
            coord_j = (subcluster3.loc[j,'Latitude'],subcluster3.loc[j,'Longitude'])

            dist = distance.distance(coord_i, coord_j).km

            dist_list3.append((i, j, dist))

In [None]:
dist_list3

In [None]:
# applying the function:
best_order3, round_dist3 = route_calc(dist_list3)

In [None]:
best_order3

In [None]:
# creating the route with circuit names:
best_route3 = []
for i in best_order3:
    name = subcluster3.loc[i, 'Circuit']
    best_route3.append(name)

In [None]:
best_route3

In [None]:
# appending the round distance
best_route3.append(round(round_dist3))

In [None]:
best_route3

### At this point, I have the best routes for the circuits inside the 3 subclusters. Still pending 1 circuit alone in the 4th subclusters and 7 circuits with no subcluster

### I will consider this 8 circuits as a new subcluster and calculate the best route between them

In [None]:
# creating the new dataframe
other_circuits = circuits.loc[(circuits['Subcluster'] == 0) | (circuits['Subcluster'] == 'None')].reset_index(drop=True)

In [None]:
other_circuits

In [None]:
dist_list_other = []

for i in other_circuits.index:
    for j in other_circuits.index:
        if i < j: 
            coord_i = (other_circuits.loc[i,'Latitude'],other_circuits.loc[i,'Longitude'])
            coord_j = (other_circuits.loc[j,'Latitude'],other_circuits.loc[j,'Longitude'])

            dist = distance.distance(coord_i, coord_j).km

            dist_list_other.append((i, j, dist))

In [None]:
dist_list_other

In [None]:
# applying the function:
best_order_other, round_dist_other = route_calc(dist_list_other)

In [None]:
best_order_other

In [None]:
# creating the route with circuit names:
best_route_other = []
for i in best_order_other:
    name = other_circuits.loc[i, 'Circuit']
    best_route_other.append(name)

In [None]:
best_route_other

In [None]:
# appending the round distance
best_route_other.append(round(round_dist_other))

In [None]:
best_route_other

### At this point, I have all the distances between points inside each cluster, considering 'the outliers' like a cluster, but these are round distances.

### Now it's time to make them linear and calculate distances from one cluster to another and sum them all up.

### To do this, as I have one circuit in the 'outliers' which initally was considered in the first calculated cluster, I will consider this circuit as the last stage of the first route. And from this circuit, I will calculate the distance to the nearest circuit of the 3 subclusters.

### I will follow this method until I have all the circuits connected.

In [None]:
# defining functions to reorder the route according to last or first circuit
import numpy as np   

def reordering_last(array, index):
    position = int(np.where(array == index)[0]) + 1
    return np.roll(array, len(array)-position)

def reordering_first(array, index):
    position = int(np.where(array == index)[0])
    return np.roll(array, -position)

In [None]:
# defining a function to subtract the distance for the last stage of the round route.

def to_subtract(array, dist_list):
    for el in dist_list:
        if el[0] == array[0] and el[1] == array[-1]:
            return el[2]
        elif el[0] == array[-1] and el[1] == array[0]:
            return el[2]

In [None]:
# looking for the index of the wanted last circuit
last_circuit = other_circuits.query('Subcluster == 0').index[0]

In [None]:
# applying functions:
first_stage = reordering_last(best_order_other, last_circuit)
subtract0 = to_subtract(first_stage, dist_list_other)

In [None]:
# updating the route with circuit names:
first_stage_names = []
for i in first_stage:
    name = other_circuits.loc[i, 'Circuit']
    first_stage_names.append(name)

In [None]:
# appending the distance subtracting the last step
first_stage_names.append(round(round_dist_other - subtract0))

In [None]:
first_stage_names

### First stage is defined. Now I will look for the closest circuit to the last circuit of first stage. This will be the starting point for the second stage.

In [None]:
checking_dist = {}

for idx in circuits.query('Subcluster != "None"').index:
    start_lat = circuits.loc[circuits.loc[circuits['Circuit']== first_stage_names[-2]].index[0],'Latitude']
    start_lon = circuits.loc[circuits.loc[circuits['Circuit']== first_stage_names[-2]].index[0],'Longitude']
    start_coord = (start_lat, start_lon)
    
    i_lat = circuits.loc[idx,'Latitude']
    i_lon = circuits.loc[idx,'Longitude']
    i_coord = (i_lat, i_lon)
    
    dist = distance.distance(start_coord, i_coord).km
    if dist > 0:
        checking_dist[dist] = circuits.loc[idx,'Circuit']

In [None]:
# checking the closest circuit to the last step for the first stage:
next_circuit = checking_dist[min(checking_dist.keys())]

In [None]:
# looking for the cluster including the circuit
next_cluster = circuits.loc[circuits.loc[circuits['Circuit'] == next_circuit].index[0]\
                            ,'Subcluster']

Let's check 'next cluster' and reorder it if necessary, to get the route non-rounded

In [None]:
print('Next circuit should be',next_circuit,'from subcluster', next_cluster)

In [None]:
# checking the best order for subcluster 2
best_route2

In [None]:
# as 'Sachsenring' is not in the first place, I need to reorder. I will apply the function
# reorder_first to get that circuit in first place.
# I also will apply the subtract function to get the distance to subtract.

next_circuit_idx = subcluster2.loc[subcluster2['Circuit'] == next_circuit].index[0]
second_stage = reordering_first(best_order2, next_circuit_idx)
subtract2 = to_subtract(second_stage, dist_list2)

In [None]:
# updating the route with circuit names:
second_stage_names = []
for i in second_stage:
    name = subcluster2.loc[i, 'Circuit']
    second_stage_names.append(name)

In [None]:
second_stage_names

In [None]:
# appending the distance subtracting the last step
second_stage_names.append(round(round_dist2 - subtract2))

In [None]:
second_stage_names

### Second stage is now defined. Again, turn to look for the closest circuit to the last circuit of second stage. This will be the starting point for the third stage.

In [None]:
checking_dist2 = {}

for idx in circuits.loc[(circuits['Subcluster'] == 1) | (circuits['Subcluster'] == 3)].index:
    start_lat = circuits.loc[circuits.loc[circuits['Circuit']== second_stage_names[-2]].index[0],'Latitude']
    start_lon = circuits.loc[circuits.loc[circuits['Circuit']== second_stage_names[-2]].index[0],'Longitude']
    start_coord = (start_lat, start_lon)
    
    i_lat = circuits.loc[idx,'Latitude']
    i_lon = circuits.loc[idx,'Longitude']
    i_coord = (i_lat, i_lon)
    
    dist = distance.distance(start_coord, i_coord).km
    if dist > 0:
        checking_dist2[dist] = circuits.loc[idx,'Circuit']

In [None]:
# checking the closest circuit to the last step for the second stage:
next_circuit2 = checking_dist2[min(checking_dist2.keys())]

In [None]:
# looking for the cluster including the circuit
next_cluster2 = circuits.loc[circuits.loc[circuits['Circuit'] == next_circuit2].index[0]\
                            ,'Subcluster']

In [None]:
print('Next circuit should be',next_circuit2,'from subcluster', next_cluster2)

In [None]:
# checking the best order for subcluster 1
best_route1

In [None]:
# again it is necessary to reorder.
next_circuit2_idx = subcluster1.loc[subcluster1['Circuit'] == next_circuit2].index[0]
third_stage = reordering_first(best_order1, next_circuit2_idx)
subtract1 = to_subtract(third_stage, dist_list1)

In [None]:
# updating the route with circuit names:
third_stage_names = []
for i in third_stage:
    name = subcluster1.loc[i, 'Circuit']
    third_stage_names.append(name)

In [None]:
third_stage_names

In [None]:
# appending the distance subtracting the last step
third_stage_names.append(round(round_dist1 - subtract1))

In [None]:
third_stage_names

### Third stage defined. One more step to get the whole distances.

In [None]:
checking_dist3 = {}

for idx in circuits.loc[circuits['Subcluster'] == 3].index:
    start_lat = circuits.loc[circuits.loc[circuits['Circuit']== third_stage_names[-2]].index[0],'Latitude']
    start_lon = circuits.loc[circuits.loc[circuits['Circuit']== third_stage_names[-2]].index[0],'Longitude']
    start_coord = (start_lat, start_lon)
    
    i_lat = circuits.loc[idx,'Latitude']
    i_lon = circuits.loc[idx,'Longitude']
    i_coord = (i_lat, i_lon)
    
    dist = distance.distance(start_coord, i_coord).km
    if dist > 0:
        checking_dist3[dist] = circuits.loc[idx,'Circuit']

In [None]:
checking_dist3

In [None]:
# checking the closest circuit to the last step for the third stage:
next_circuit3 = checking_dist3[min(checking_dist3.keys())]

In [None]:
# looking for the cluster including the circuit. Should be cluster 3, just to confirm
next_cluster3 = circuits.loc[circuits.loc[circuits['Circuit'] == next_circuit3].index[0]\
                            ,'Subcluster']

In [None]:
next_cluster3

In [None]:
print('Next circuit should be',next_circuit3,'from subcluster', next_cluster3)

In [None]:
# checking the best order for subcluster 3
best_route3

In [None]:
# once more, it is necessary to reorder.
next_circuit3_idx = subcluster3.loc[subcluster3['Circuit'] == next_circuit3].index[0]
fourth_stage = reordering_first(best_order3, next_circuit3_idx)
subtract3 = to_subtract(fourth_stage, dist_list3)

In [None]:
# updating the route with circuit names:
fourth_stage_names = []
for i in fourth_stage:
    name = subcluster3.loc[i, 'Circuit']
    fourth_stage_names.append(name)

In [None]:
fourth_stage_names

In [None]:
# appending the distance subtracting the last step
fourth_stage_names.append(round(round_dist3 - subtract3))

In [None]:
fourth_stage_names