In [1]:
import pandas as pd
from geopy import distance

# ANALYSIS

## The current calendar

In [2]:
# Load the cleaned race calendar (circuits, dates and coordinates).
# index_col = 0 uses the first CSV column as the row index instead of reading
# it as a data column.
circuits = pd.read_csv('../data/races_cleaned.csv', index_col = 0)

In [3]:
# Preview the first rows to check the columns that were loaded.
circuits.head()

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude
0,2020-03-08,Grand Prix of Qatar,Losail International Circuit,QATAR,25.491,51.452068
1,2020-03-22,OR Thailand Grand Prix,Buriram International Circuit,THAILAND,14.963057,103.085589
2,2020-04-05,Red Bull Grand Prix of The Americas,Circuit Of The Americas,UNITED STATES,30.138715,-97.63641
3,2020-04-19,Gran Premio Motul de la República Argentina,Termas de Río Hondo,ARGENTINA,-27.495926,-64.864078
4,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317


In [4]:
# Total distance of the current calendar: the sum of the distances between each
# circuit and the one before it, following the row order of `circuits`.
# The route is open: there is no return leg from the last circuit to the first.
#
# Consecutive rows are paired by position (zip of the list with itself shifted
# by one), so the result does not rely on the index being a gap-free 0..n-1
# range, which `index - 1` label arithmetic would require.
# Each leg is rounded to whole kilometres before being added to the total.
calendar_coords = list(zip(circuits['Latitude'], circuits['Longitude']))

current_distance = 0
for coord0, coord1 in zip(calendar_coords, calendar_coords[1:]):
    current_distance += round(distance.distance(coord0, coord1).km)

In [5]:
# Report the accumulated leg-by-leg distance of the current calendar, in km.
print('The current distance for the whole calendar is:', current_distance, 'km')

The current distance for the whole calendar is: 84594 km


#### The current distance is more than 84,000 km. The objective is to reduce this distance.

## Clustering Circuits

I will use unsupervised learning algorithms to cluster the circuits, and then optimize the distance between clusters.

### Clustering with DBSCAN
The first clustering uses DBSCAN, because I don't want to force a fixed number of clusters.

In [6]:
# Build the list of [latitude, longitude] pairs, one inner list per circuit and
# in the same row order as `circuits`. Selecting both columns at once replaces
# the row-by-row .loc lookups with a single vectorised operation.
coordinates = circuits[['Latitude', 'Longitude']].values.tolist()

In [7]:
# Latitude spans -90º to 90º while longitude spans -180º to 180º, so the two
# columns are standardised before clustering to give them the same weight.

from sklearn.preprocessing import StandardScaler

# Learn the per-column statistics first, then apply them to the same data.
scaler = StandardScaler().fit(coordinates)
coord_scale = scaler.transform(coordinates)

In [8]:
# Show the scaled coordinates: one [latitude, longitude] row per circuit.
coord_scale

array([[-0.27771554,  0.47350951],
       [-0.68962793,  1.35519357],
       [-0.09587076, -2.07229662],
       [-2.35086236, -1.5126826 ],
       [ 0.16062641, -0.51020018],
       [ 0.60325085, -0.40171664],
       [ 0.44643659, -0.21088379],
       [ 0.35136469, -0.36651784],
       [ 0.70988817, -0.19205361],
       [ 0.79701256, -0.29360566],
       [ 1.10694741,  0.04707124],
       [ 0.6503148 , -0.12416765],
       [ 0.57258376, -0.15302925],
       [ 0.76227548, -0.42239607],
       [ 0.44498223, -0.18848582],
       [ 0.33218514, -0.40860079],
       [ 0.15433498,  1.98946412],
       [-2.78127411,  2.07499567],
       [-1.16707273,  1.33216291],
       [ 0.27022037, -0.4157605 ]])

In [9]:
from sklearn.cluster import DBSCAN

# Cluster the scaled coordinates; eps is the neighbourhood radius, expressed in
# the scaled units. The label of each circuit is stored next to its data.
dbscan = DBSCAN(eps=0.5)
circuits['DBSCAN_Clusters'] = dbscan.fit_predict(coord_scale)
# The fitted estimator is the same object that was fitted above.
circuits_dbscan = dbscan

In [10]:
# Number of circuits that received each DBSCAN label.
circuits.groupby('DBSCAN_Clusters').size()

DBSCAN_Clusters
-1     7
 0    13
dtype: int64

#### There is a cluster with 13 items and 7 outliers

In [11]:
# Number of circuits per country within each DBSCAN label.
circuits_by_label_and_country = circuits.groupby(['DBSCAN_Clusters', 'Country'])
circuits_by_label_and_country.size()

DBSCAN_Clusters  Country       
-1               ARGENTINA         1
                 AUSTRALIA         1
                 JAPAN             1
                 MALAYSIA          1
                 QATAR             1
                 THAILAND          1
                 UNITED STATES     1
 0               AUSTRIA           1
                 CZECH REPUBLIC    1
                 FINLAND           1
                 FRANCE            1
                 GERMANY           1
                 GREAT BRITAIN     1
                 ITALY             2
                 NETHERLANDS       1
                 SPAIN             4
dtype: int64

#### The distribution makes sense, as the outliers are far from each other while the items in the cluster are pretty close.

#### For further analysis, I will split them into two new dataframes according to the first clustering.

In [12]:
# Circuits that DBSCAN labelled -1, renumbered from 0.
outliers = circuits.loc[circuits['DBSCAN_Clusters'] == -1].reset_index(drop=True)

In [13]:
# Show the circuits that DBSCAN labelled -1.
outliers

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters
0,2020-03-08,Grand Prix of Qatar,Losail International Circuit,QATAR,25.491,51.452068,-1
1,2020-03-22,OR Thailand Grand Prix,Buriram International Circuit,THAILAND,14.963057,103.085589,-1
2,2020-04-05,Red Bull Grand Prix of The Americas,Circuit Of The Americas,UNITED STATES,30.138715,-97.63641,-1
3,2020-04-19,Gran Premio Motul de la República Argentina,Termas de Río Hondo,ARGENTINA,-27.495926,-64.864078,-1
4,2020-10-18,Motul Grand Prix of Japan,Twin Ring Motegi,JAPAN,36.533647,140.229985,-1
5,2020-10-25,Australian Motorcycle Grand Prix,Phillip Island,AUSTRALIA,-38.496688,145.238917,-1
6,2020-11-01,Shell Malaysia Motorcycle Grand Prix,Sepang International Circuit,MALAYSIA,2.760191,101.736859,-1


In [14]:
# Circuits that DBSCAN placed in cluster 0, renumbered from 0.
big_cluster = circuits.loc[circuits['DBSCAN_Clusters'] == 0].reset_index(drop=True)

In [15]:
# Show the circuits that DBSCAN placed in cluster 0.
big_cluster

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters
0,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317,0
1,2020-05-17,SHARK Helmets Grand Prix de France,Le Mans,FRANCE,48.00735,0.196738,0
2,2020-05-31,Gran Premio d'Italia Oakley,Autodromo Internazionale del Mugello,ITALY,43.999382,11.372365,0
3,2020-06-07,Gran Premi Monster Energy de Catalunya,Circuit de Barcelona-Catalunya,SPAIN,41.569469,2.258063,0
4,2020-06-21,HJC Helmets Motorrad Grand Prix Deutschland,Sachsenring,GERMANY,50.73286,12.475105,0
5,2020-06-28,Motul TT Assen,TT Circuit Assen,NETHERLANDS,52.959646,6.527974,0
6,2020-07-12,Finland Grand Prix,KymiRing,FINLAND,60.881176,26.478826,0
7,2020-08-09,Monster Energy Grand Prix České republiky,Automotodrom Brno,CZECH REPUBLIC,49.210243,16.450668,0
8,2020-08-16,myWorld Motorrad Grand Prix von Österreich,Red Bull Ring - Spielberg,AUSTRIA,47.223539,14.760464,0
9,2020-08-30,British Grand Prix,Silverstone Circuit,GREAT BRITAIN,52.071812,-1.014299,0


### Clustering with KMeans
To keep the distance calculations small, I will split the cluster with 13 items into several subclusters of at most 5 items each. This will simplify optimizing the routes between the circuits within each subcluster.

In [16]:
# Build the list of [latitude, longitude] pairs for the circuits in big_cluster,
# one inner list per circuit and in the same row order. Selecting both columns
# at once replaces the row-by-row .loc lookups with a single vectorised operation.
cluster_coord = big_cluster[['Latitude', 'Longitude']].values.tolist()

In [17]:
# Show the raw (unscaled) [latitude, longitude] pairs passed to KMeans.
cluster_coord

[[36.69444715, -6.15631689958845],
 [48.00734979999999, 0.1967379],
 [43.99938220000001, 11.3723647068196],
 [41.56946855, 2.25806310666666],
 [50.7328604, 12.4751047884026],
 [52.95964605, 6.5279741969904395],
 [60.88117595, 26.478826152075502],
 [49.2102429, 16.4506683],
 [47.223539200000005, 14.7604645],
 [52.071811600000004, -1.01429912415686],
 [43.9622107, 12.6840429850135],
 [41.07926445, -0.206414553372675],
 [39.4955257, -0.6257045]]

In [18]:
# After several attempts, 4 clusters are needed to get a maximum of 5 items per
# cluster.
from sklearn.cluster import KMeans

# KMeans starts from randomly chosen centroids, so without a fixed seed the
# result (and in particular which number each cluster receives) can change from
# one run to the next. random_state pins the run so it can be repeated.
# The label numbers are arbitrary identifiers: they carry no ordering or meaning.
kmeans = KMeans(n_clusters=4, random_state=42)
small_clusters = kmeans.fit(cluster_coord)
small_clusters.labels_

array([0, 2, 1, 0, 1, 2, 3, 1, 1, 2, 1, 0, 0], dtype=int32)

In [19]:
# Attach the KMeans label of each circuit as a new 'Subcluster' column.
big_cluster = big_cluster.assign(Subcluster=small_clusters.labels_)

In [20]:
# Show big_cluster with the new Subcluster column.
big_cluster

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters,Subcluster
0,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317,0,0
1,2020-05-17,SHARK Helmets Grand Prix de France,Le Mans,FRANCE,48.00735,0.196738,0,2
2,2020-05-31,Gran Premio d'Italia Oakley,Autodromo Internazionale del Mugello,ITALY,43.999382,11.372365,0,1
3,2020-06-07,Gran Premi Monster Energy de Catalunya,Circuit de Barcelona-Catalunya,SPAIN,41.569469,2.258063,0,0
4,2020-06-21,HJC Helmets Motorrad Grand Prix Deutschland,Sachsenring,GERMANY,50.73286,12.475105,0,1
5,2020-06-28,Motul TT Assen,TT Circuit Assen,NETHERLANDS,52.959646,6.527974,0,2
6,2020-07-12,Finland Grand Prix,KymiRing,FINLAND,60.881176,26.478826,0,3
7,2020-08-09,Monster Energy Grand Prix České republiky,Automotodrom Brno,CZECH REPUBLIC,49.210243,16.450668,0,1
8,2020-08-16,myWorld Motorrad Grand Prix von Österreich,Red Bull Ring - Spielberg,AUSTRIA,47.223539,14.760464,0,1
9,2020-08-30,British Grand Prix,Silverstone Circuit,GREAT BRITAIN,52.071812,-1.014299,0,2


In [21]:
# Number of circuits in each KMeans subcluster, largest first.
big_cluster['Subcluster'].value_counts()

1    5
0    4
2    3
3    1
Name: Subcluster, dtype: int64

#### As seen in the subclusters, there is a group with only 1 circuit. That means that, in fact, we can consider this circuit an outlier as well. I will drop this row from big_cluster and append it to the outliers dataframe.

In [22]:
# Add the circuits that sit alone in their subcluster to the outliers frame.
# They are selected by subcluster size rather than by a fixed label number,
# because the number KMeans gives to each cluster is arbitrary.
subcluster_sizes = big_cluster['Subcluster'].value_counts()
singleton_labels = subcluster_sizes[subcluster_sizes == 1].index
lone_circuits = big_cluster[big_cluster['Subcluster'].isin(singleton_labels)]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Only the columns that outliers already has are kept (Subcluster is left out),
# and ignore_index renumbers the rows from 0.
# NOTE: running this cell twice appends the same circuits twice.
outliers = pd.concat([outliers, lone_circuits[outliers.columns]], ignore_index=True)

In [23]:
# Show the outliers frame after the rows from big_cluster were appended.
outliers

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters
0,2020-03-08,Grand Prix of Qatar,Losail International Circuit,QATAR,25.491,51.452068,-1
1,2020-03-22,OR Thailand Grand Prix,Buriram International Circuit,THAILAND,14.963057,103.085589,-1
2,2020-04-05,Red Bull Grand Prix of The Americas,Circuit Of The Americas,UNITED STATES,30.138715,-97.63641,-1
3,2020-04-19,Gran Premio Motul de la República Argentina,Termas de Río Hondo,ARGENTINA,-27.495926,-64.864078,-1
4,2020-10-18,Motul Grand Prix of Japan,Twin Ring Motegi,JAPAN,36.533647,140.229985,-1
5,2020-10-25,Australian Motorcycle Grand Prix,Phillip Island,AUSTRALIA,-38.496688,145.238917,-1
6,2020-11-01,Shell Malaysia Motorcycle Grand Prix,Sepang International Circuit,MALAYSIA,2.760191,101.736859,-1
7,2020-07-12,Finland Grand Prix,KymiRing,FINLAND,60.881176,26.478826,0


In [24]:
# Remove from big_cluster the circuits that sit alone in their subcluster; they
# are treated as outliers. Rows are selected by subcluster size rather than by
# a fixed label number, because the number KMeans gives to each cluster is
# arbitrary. A new frame is assigned instead of dropping in place, so running
# this cell again gives the same result.
subcluster_sizes = big_cluster['Subcluster'].value_counts()
singleton_labels = subcluster_sizes[subcluster_sizes == 1].index
big_cluster = big_cluster[~big_cluster['Subcluster'].isin(singleton_labels)]

In [25]:
# reset_index returns a new frame, so the result has to be assigned back.
# Otherwise big_cluster keeps the gap left by the removed row and is written to
# the CSV with that gapped index.
big_cluster = big_cluster.reset_index(drop = True)
big_cluster

Unnamed: 0,Date,Race,Circuit,Country,Latitude,Longitude,DBSCAN_Clusters,Subcluster
0,2020-05-03,Gran Premio Red Bull de España,Circuito de Jerez,SPAIN,36.694447,-6.156317,0,0
1,2020-05-17,SHARK Helmets Grand Prix de France,Le Mans,FRANCE,48.00735,0.196738,0,2
2,2020-05-31,Gran Premio d'Italia Oakley,Autodromo Internazionale del Mugello,ITALY,43.999382,11.372365,0,1
3,2020-06-07,Gran Premi Monster Energy de Catalunya,Circuit de Barcelona-Catalunya,SPAIN,41.569469,2.258063,0,0
4,2020-06-21,HJC Helmets Motorrad Grand Prix Deutschland,Sachsenring,GERMANY,50.73286,12.475105,0,1
5,2020-06-28,Motul TT Assen,TT Circuit Assen,NETHERLANDS,52.959646,6.527974,0,2
6,2020-08-09,Monster Energy Grand Prix České republiky,Automotodrom Brno,CZECH REPUBLIC,49.210243,16.450668,0,1
7,2020-08-16,myWorld Motorrad Grand Prix von Österreich,Red Bull Ring - Spielberg,AUSTRIA,47.223539,14.760464,0,1
8,2020-08-30,British Grand Prix,Silverstone Circuit,GREAT BRITAIN,52.071812,-1.014299,0,2
9,2020-09-13,Gran Premio Octo di San Marino e della Rivier...,Misano World Circuit Marco Simoncelli,ITALY,43.962211,12.684043,0,1


### To avoid re-clustering each time I run the code, I will save the DataFrames as CSV files and load them from a new notebook.

In [26]:
# Save the outliers frame; the row index is written as the first CSV column.
outliers.to_csv('../data/outliers.csv')

In [27]:
# Save the clustered circuits, including their Subcluster labels; the row index
# is written as the first CSV column.
big_cluster.to_csv('../data/big_cluster.csv')