In [None]:
# Importamos las librerías que vamos a utilizar
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import sklearn_pandas
from matplotlib import gridspec
from datetime import datetime

In [None]:
# Show every column and every row when a DataFrame is displayed (no truncation)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the CSV files used throughout the notebook, one DataFrame per file
(races, circuits, results,
 driver_standings, constructor_standings, qualifying) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './csvs/races.csv',
        './csvs/circuits.csv',
        './csvs/results.csv',
        './csvs/driver_standings.csv',
        './csvs/constructor_standings.csv',
        './csvs/qualifying.csv',
    )
)

In [None]:
# Drop from each dataset the columns that are not used afterwards.
# `year` stays in races; the circuit altitude (`alt`) is removed together with the
# other descriptive circuit columns.
races = races.drop(columns=[
    'round', 'name', 'url',
    'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
    'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
])

circuits = circuits.drop(columns=['name', 'location', 'country', 'lat', 'lng', 'url', 'alt'])

results = results.drop(columns=[
    'number', 'position', 'positionText', 'points', 'laps', 'time', 'milliseconds',
    'fastestLap', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'rank',
])

driver_standings = driver_standings.drop(columns=['positionText'])

constructor_standings = constructor_standings.drop(columns=['positionText'])

qualifying = qualifying.drop(columns=['number'])

In [None]:
# Set as index of each dataframe the column used for the joins.
# NOTE(review): this cell is disabled. As written the calls would have no effect even
# if re-enabled, because set_index returns a new DataFrame and the result is never
# assigned back.
#races.set_index('raceId')
#circuits.set_index('circuitId')
#results.set_index('raceId')
#driver_standings.set_index('raceId')
#constructor_standings.set_index('raceId')
#qualifying.set_index('raceId')

In [None]:
# Prefix the generic column names with their table of origin (q_ / ds_ / cs_)
# so they remain identifiable once the tables are merged
driver_renames = {'points': 'ds_points', 'position': 'ds_position', 'wins': 'ds_wins'}
constructor_renames = {'points': 'cs_points', 'position': 'cs_position', 'wins': 'cs_wins'}

qualifying = qualifying.rename(columns={'position': 'q_position'})
driver_standings = driver_standings.rename(columns=driver_renames)
constructor_standings = constructor_standings.rename(columns=constructor_renames)

In [None]:
# Inner-join results with qualifying, then attach the race and circuit columns
full = (
    results
    .merge(qualifying, how='inner', on=['raceId', 'driverId', 'constructorId'])
    .merge(races, how='inner', on=['raceId'])
    .merge(circuits, how='inner', on=['circuitId'])
)

In [None]:
# Attach the race date and season (year) to every standings row, joined on raceId
race_dates = races[["raceId", "date", "year"]]
driver_standings = pd.merge(driver_standings, race_dates, on="raceId")
constructor_standings = pd.merge(constructor_standings, race_dates, on="raceId")

# Parse 'date' from str to datetime before sorting, so a single chronological sort is enough
driver_standings['date'] = pd.to_datetime(driver_standings['date'])
constructor_standings['date'] = pd.to_datetime(constructor_standings['date'])

# Order each driver's / constructor's rows by date, ascending
driver_standings = driver_standings.sort_values(['driverId', 'date'], ascending=True)
constructor_standings = constructor_standings.sort_values(['constructorId', 'date'], ascending=True)

# Standings before each race: shift one row down within each (entity, season) group.
# The first row of every group has no predecessor and therefore gets NaN.
driver_by_season = driver_standings.groupby(['driverId', 'year'])
for prev_column, column in [('ds_prev_points', 'ds_points'),
                            ('ds_prev_position', 'ds_position'),
                            ('ds_prev_wins', 'ds_wins')]:
    driver_standings[prev_column] = driver_by_season[column].shift(1)

constructor_by_season = constructor_standings.groupby(['constructorId', 'year'])
for prev_column, column in [('cs_prev_points', 'cs_points'),
                            ('cs_prev_position', 'cs_position'),
                            ('cs_prev_wins', 'cs_wins')]:
    constructor_standings[prev_column] = constructor_by_season[column].shift(1)

In [None]:
# Preview the first rows of driver_standings
driver_standings.head()

In [None]:
# Preview the first rows of the merged dataframe
full.head()

In [None]:
# Add the previous-race standings of each driver and each constructor (inner joins)
driver_prev = driver_standings[
    ['raceId', 'driverId', 'ds_prev_points', 'ds_prev_position', 'ds_prev_wins']
]
constructor_prev = constructor_standings[
    ['raceId', 'constructorId', 'cs_prev_points', 'cs_prev_position', 'cs_prev_wins']
]
full = (
    full
    .merge(driver_prev, how='inner', on=['raceId', 'driverId'])
    .merge(constructor_prev, how='inner', on=['raceId', 'constructorId'])
)

In [None]:
# Drop the 'prev_' infix: in `full`, ds_* / cs_* now hold the shifted (previous-race) values
prev_to_plain = {
    'ds_prev_points': 'ds_points',
    'ds_prev_position': 'ds_position',
    'ds_prev_wins': 'ds_wins',
    'cs_prev_points': 'cs_points',
    'cs_prev_position': 'cs_position',
    'cs_prev_wins': 'cs_wins',
}
full = full.rename(columns=prev_to_plain)

In [None]:
# Rows of driverId 1 for the years 2021, 2022 and 2023, ordered by date
in_recent_years = full['year'].isin([2021, 2022, 2023])
full[(full['driverId'] == 1) & in_recent_years].sort_values('date', ascending=True)

In [None]:
# (rows, columns) of the merged dataframe
full.shape

In [None]:
# Data type of every column in the merged dataframe
full.dtypes

In [None]:
# True if at least one positionOrder value is missing
full['positionOrder'].isnull().values.any()

In [None]:
# Build the prediction target 'is_podium': 1 when positionOrder is 3 or lower, 0 otherwise.
# positionOrder is removed afterwards.
finished_top_three = full['positionOrder'] <= 3
full['is_podium'] = finished_top_three.astype(int)
full = full.drop(columns=['positionOrder'])
full.head()

In [None]:
# (rows, columns) after adding is_podium and dropping positionOrder
full.shape

In [None]:
# Column data types after adding is_podium
full.dtypes

In [None]:
# Pie chart showing how balanced the target variable is.
# The slice labels are derived from the class values (0 / 1) instead of being passed as a
# positional list, so they stay correct whatever order value_counts() returns.
podium_counts = full['is_podium'].value_counts().rename(
    index={0: 'No está en el podio', 1: 'Está en el podio'}
)
podium_counts.plot.pie(autopct='%.2f', figsize=(6, 6), ylabel='Cant.')

In [None]:
# Preview the first rows of the dataframe
full.head()

In [None]:
# Histogram of the starting grid position
grid_histogram = px.histogram(data_frame=full, x='grid', title='Distribución posición de salida')
grid_histogram

In [None]:
# Starting grid position, one bar group per podium outcome.
# plotly express expects `labels` to be a dict {column name: display name}; a list cannot
# name the colour groups. The 0/1 target is mapped to text in a derived column instead,
# so the legend reads 'Podio' / 'No podio'.
px.histogram(
    full.assign(resultado=full['is_podium'].map({1: 'Podio', 0: 'No podio'})),
    x='grid',
    color='resultado',
    title='Distribución',
    barmode='group',
    labels={'resultado': 'Resultado'},
)

In [None]:
# Histogram of the final qualifying position
q_position_histogram = px.histogram(
    data_frame=full,
    x='q_position',
    title='Distribución posición final clasificación',
)
q_position_histogram

In [None]:
# Final qualifying position, one bar group per podium outcome.
# plotly express expects `labels` to be a dict {column name: display name}; a list cannot
# name the colour groups. The 0/1 target is mapped to text in a derived column instead,
# so the legend reads 'Podio' / 'No podio'.
px.histogram(
    full.assign(resultado=full['is_podium'].map({1: 'Podio', 0: 'No podio'})),
    x='q_position',
    color='resultado',
    title='Distribución',
    barmode='group',
    labels={'resultado': 'Resultado'},
)

In [None]:
def datetime_to_seconds(laptime: str) -> float:
    r"""Convert a lap time such as '1:26.572' into milliseconds.

    Despite its name (kept so existing calls keep working), the value returned is
    expressed in milliseconds, not seconds.

    Args:
        laptime: Lap time formatted as 'M:SS.fff'. Missing values may be given as the
            '\N' marker, the text 'nan', an empty string, None or a float NaN.

    Returns:
        The lap time in milliseconds as a float, or np.nan when the value is missing.

    Raises:
        ValueError: if a non-missing value does not match the 'M:SS.fff' format.
    """
    # Real missing values (None, NaN, pd.NA) are accepted without a prior astype(str)
    if not isinstance(laptime, str):
        if pd.isna(laptime):
            return np.nan
        laptime = str(laptime)

    laptime = laptime.strip()
    if laptime in ('', '\\N', 'nan'):
        return np.nan

    parsed = datetime.strptime(laptime, "%M:%S.%f")
    # Scale the whole seconds as integers and add the fraction afterwards; this avoids the
    # rounding noise of multiplying an already fractional number of seconds by 1000.
    whole_seconds = parsed.minute * 60 + parsed.second
    return float(whole_seconds * 1000 + parsed.microsecond / 1000)

In [None]:
# Cast q1 to str (a NaN becomes the text 'nan', which datetime_to_seconds treats as
# missing) and derive the Q1 lap time in milliseconds
full['q1'] = full['q1'].astype(str)
full['q1_ms'] = full['q1'].map(datetime_to_seconds)
full.head()

In [None]:
# Distribution of Q1 lap times in milliseconds (the placeholder title was replaced by a
# descriptive one)
px.histogram(full, x='q1_ms', title='Distribución de tiempos de Q1 (ms)')
# px.histogram(full[(full.circuitRef == 'albert_park') & (full.year == 2022)], x='q1_ms', title='title')

In [None]:
# Q1 lap time in milliseconds, one bar group per podium outcome.
# plotly express expects `labels` to be a dict {column name: display name}; a list cannot
# name the colour groups. The 0/1 target is mapped to text in a derived column instead,
# so the legend reads 'Podio' / 'No podio'.
px.histogram(
    full.assign(resultado=full['is_podium'].map({1: 'Podio', 0: 'No podio'})),
    x='q1_ms',
    color='resultado',
    title='Distribución',
    barmode='group',
    labels={'resultado': 'Resultado'},
)

In [None]:
# Cast q2 to str (a NaN becomes the text 'nan', which datetime_to_seconds treats as
# missing) and derive the Q2 lap time in milliseconds
full['q2'] = full['q2'].astype(str)
full['q2_ms'] = full['q2'].map(datetime_to_seconds)
full.head()

In [None]:
# Distribution of Q2 lap times in milliseconds (the placeholder title was replaced by a
# descriptive one)
px.histogram(full, x='q2_ms', title='Distribución de tiempos de Q2 (ms)')

In [None]:
# Q2 lap time in milliseconds, one bar group per podium outcome.
# plotly express expects `labels` to be a dict {column name: display name}; a list cannot
# name the colour groups. The 0/1 target is mapped to text in a derived column instead,
# so the legend reads 'Podio' / 'No podio'.
px.histogram(
    full.assign(resultado=full['is_podium'].map({1: 'Podio', 0: 'No podio'})),
    x='q2_ms',
    color='resultado',
    title='Distribución',
    barmode='group',
    labels={'resultado': 'Resultado'},
)

In [None]:
# Cast q3 to str (a NaN becomes the text 'nan', which datetime_to_seconds treats as
# missing) and derive the Q3 lap time in milliseconds
full['q3'] = full['q3'].astype(str)
full['q3_ms'] = full['q3'].map(datetime_to_seconds)
full.head()

In [None]:
# Distribution of Q3 lap times in milliseconds (the placeholder title was replaced by a
# descriptive one)
px.histogram(full, x='q3_ms', title='Distribución de tiempos de Q3 (ms)')

In [None]:
# Q3 lap time in milliseconds, one bar group per podium outcome.
# plotly express expects `labels` to be a dict {column name: display name}; a list cannot
# name the colour groups. The 0/1 target is mapped to text in a derived column instead,
# so the legend reads 'Podio' / 'No podio'.
px.histogram(
    full.assign(resultado=full['is_podium'].map({1: 'Podio', 0: 'No podio'})),
    x='q3_ms',
    color='resultado',
    title='Distribución',
    barmode='group',
    labels={'resultado': 'Resultado'},
)

In [None]:
# Number of missing values in each of the millisecond lap-time columns
full[['q1_ms', 'q2_ms', 'q3_ms']].isnull().sum()

In [None]:
# Remove the textual q1, q2 and q3 columns; the *_ms columns in milliseconds replace them
full = full.drop(columns=['q1', 'q2', 'q3'])

In [None]:
# The dataset marks unrecorded (null) values with the literal text '\N'. Convert them to
# NaN so they are counted as nulls in the analysis. np.nan is used rather than None
# because Series.replace(x, None) is version-dependent: some pandas versions treat
# value=None as "no value given" and forward-fill instead of inserting a null.
full['time'] = full['time'].replace('\\N', np.nan)

In [None]:
# Null-value analysis: number of missing values per column
full.isnull().sum()

In [None]:
# Outlier analysis: box plot of the Q1 lap times in milliseconds
q1_box = px.box(data_frame=full, y='q1_ms')
q1_box

In [None]:
# Box plot of the Q2 lap times in milliseconds
q2_box = px.box(data_frame=full, y='q2_ms')
q2_box

In [None]:
# Box plot of the Q3 lap times in milliseconds
q3_box = px.box(data_frame=full, y='q3_ms')
q3_box

In [None]:
# Rows whose Q1 lap time exceeds 1,000,000 ms: the extreme value seen in the box plot
is_q1_outlier = full['q1_ms'] > 1_000_000
full.loc[is_q1_outlier]

In [None]:
# Heatmap input: a new dataframe with the feature columns and the target, without the IDs
heatmap_columns = [
    'grid', 'q_position',
    'ds_points', 'ds_position', 'ds_wins',
    'cs_points', 'cs_position', 'cs_wins',
    'q1_ms', 'q2_ms', 'q3_ms',
    'is_podium',
]
final = full[heatmap_columns]
final.head()

In [None]:
# Correlation matrix of `final` drawn as a heatmap, with each coefficient printed in its cell
correlation_matrix = final.corr()
px.imshow(correlation_matrix, text_auto=True, width=800, height=800)