In [None]:
# Importamos las librerías que vamos a utilizar
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import sklearn_pandas
from matplotlib import gridspec
from datetime import datetime

In [None]:
# Remove the display caps so DataFrames render every column and every row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the CSV files used in this notebook (read in the order listed)
races, circuits, results, driver_standings, constructor_standings, qualifying = (
    pd.read_csv(csv_path)
    for csv_path in (
        './csvs/races.csv',
        './csvs/circuits.csv',
        './csvs/results.csv',
        './csvs/driver_standings.csv',
        './csvs/constructor_standings.csv',
        './csvs/qualifying.csv',
    )
)

In [None]:
# Remove from each dataset the columns that are not going to be used
races = races.drop(columns=[
    'year', 'round', 'name', 'url',
    'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
    'quali_date', 'quali_time', 'sprint_date', 'sprint_time',
])

circuits = circuits.drop(columns=['name', 'location', 'country', 'lat', 'lng', 'url'])

results = results.drop(columns=[
    'number', 'position', 'positionText', 'points', 'laps', 'time', 'milliseconds',
    'fastestLap', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'rank',
])

driver_standings = driver_standings.drop(columns=['positionText'])

constructor_standings = constructor_standings.drop(columns=['positionText'])

qualifying = qualifying.drop(columns=['number'])

In [None]:
# The merge further down joins on ordinary columns (`on=[...]`), so no index has to be set.
# The `set_index` calls that used to live here had no effect: `set_index` returns a new
# frame and the result was never assigned. The last call also rendered the entire
# `qualifying` table, because `display.max_rows` is unlimited in this notebook.
# Instead, fail early with a clear message if a frame lacks a column the merge joins on.
required_join_keys = {
    'races': (races, ['raceId']),
    'circuits': (circuits, ['circuitId']),
    'results': (results, ['raceId', 'driverId', 'constructorId']),
    'driver_standings': (driver_standings, ['raceId', 'driverId']),
    'constructor_standings': (constructor_standings, ['raceId', 'constructorId']),
    'qualifying': (qualifying, ['raceId', 'driverId', 'constructorId']),
}
for frame_name, (frame, key_columns) in required_join_keys.items():
    missing_keys = [key for key in key_columns if key not in frame.columns]
    assert not missing_keys, f'{frame_name} is missing join key column(s): {missing_keys}'

In [None]:
# Rename columns so they are easier to tell apart
qualifying = qualifying.rename(columns={'position': 'q_position'})
driver_standings = driver_standings.rename(
    columns={'points': 'ds_points', 'position': 'ds_position', 'wins': 'ds_wins'}
)
constructor_standings = constructor_standings.rename(
    columns={'points': 'cs_points', 'position': 'cs_position', 'wins': 'cs_wins'}
)
circuits = circuits.rename(columns={'alt': 'altitud'})

In [None]:
# Inner-join every dataset into one frame, starting from `results`
full = (
    results
    .merge(qualifying, how='inner', on=['raceId', 'driverId', 'constructorId'])
    .merge(driver_standings, how='inner', on=['raceId', 'driverId'])
    .merge(constructor_standings, how='inner', on=['raceId', 'constructorId'])
    .merge(races, how='inner', on=['raceId'])
    .merge(circuits, how='inner', on=['circuitId'])
)

In [None]:
# Preview the first rows of the merged frame
full.head()

In [None]:
# Dimensions of `full` as (rows, columns)
full.shape

In [None]:
# Data type of each column in `full`
full.dtypes

In [None]:
# True if at least one `positionOrder` value is missing
full['positionOrder'].isna().to_numpy().any()

In [None]:
# Build the prediction target `is_podium` from `positionOrder`:
# 1 when the value is 3 or lower, 0 otherwise. The source column is then dropped.
within_top_three = full['positionOrder'].le(3)
full['is_podium'] = np.where(within_top_three, 1, 0)
full = full.drop(columns=['positionOrder'])
full.head()

In [None]:
# Dimensions of `full` as (rows, columns)
full.shape

In [None]:
# Data type of each column in `full`
full.dtypes

In [None]:
# Pie chart showing how balanced the output variable is.
# The 0/1 codes are mapped to their labels *before* counting, so every slice is named
# after its own value instead of relying on the order value_counts() returns them in.
podium_labels = {0: 'No está en el podio', 1: 'Está en el podio'}
full.is_podium.map(podium_labels).value_counts().plot.pie(autopct='%.2f', figsize=(6, 6), ylabel='Cant.')

In [None]:
# Preview the first rows of `full`
full.head()

In [None]:
# Histogram of the `grid` column
grid_histogram = px.histogram(data_frame=full, x='grid', title='Distribución posición de salida')
grid_histogram

In [None]:
# Grouped histogram of `grid`, one bar series per target class.
# `labels` takes a dict of column name -> display name, so the list that was passed here
# could not name the legend entries. Map the 0/1 target to readable categories and
# colour by that column instead.
podium_class = full['is_podium'].map({1: 'Podio', 0: 'No podio'})
px.histogram(full.assign(podio=podium_class), x='grid', color='podio', title='Distribución',
             barmode='group', labels={'podio': 'Resultado'})

In [None]:
# Histogram of the `q_position` column
q_position_histogram = px.histogram(data_frame=full, x='q_position', title='Distribución posición final clasificación')
q_position_histogram

In [None]:
# Grouped histogram of `q_position`, one bar series per target class.
# `labels` takes a dict of column name -> display name, so the list that was passed here
# could not name the legend entries. Map the 0/1 target to readable categories and
# colour by that column instead.
podium_class = full['is_podium'].map({1: 'Podio', 0: 'No podio'})
px.histogram(full.assign(podio=podium_class), x='q_position', color='podio', title='Distribución',
             barmode='group', labels={'podio': 'Resultado'})

In [None]:
def datetime_to_seconds(laptime: str) -> int:
    """Convert a lap time written as ``M:SS.fff`` into whole milliseconds.

    Despite its name (kept so existing callers keep working), the function
    returns milliseconds, not seconds.

    Args:
        laptime: Lap time text such as ``'1:26.572'``. Missing values are
            accepted in any of these forms: the backslash-N marker, the text
            ``'nan'``, an empty string, ``None`` or a float NaN.

    Returns:
        The lap time in milliseconds (sub-millisecond digits are truncated),
        or 0 when the value is missing.

    Raises:
        ValueError: If the text is not a missing-value marker and does not
            match the ``%M:%S.%f`` format.
    """
    # None or a float NaN (NaN is the only value that differs from itself) means "no time".
    if laptime is None or (isinstance(laptime, float) and laptime != laptime):
        return 0
    text = str(laptime).strip()
    if text in ('', '\\N', 'nan'):
        return 0
    parsed = datetime.strptime(text, "%M:%S.%f")
    # Integer arithmetic only: building float seconds and truncating with int() drops a
    # millisecond on some inputs (1.001 s * 1000 evaluates to 1000.999..., i.e. 1000).
    return (parsed.minute * 60 + parsed.second) * 1000 + parsed.microsecond // 1000

In [None]:
# Cast `q1` to text (TODO: check whether this cast is actually needed)
q1_text = full['q1'].astype(str)
full['q1'] = q1_text
# Convert every value with datetime_to_seconds and store the result as `q1_ms`
convert_q1 = np.vectorize(datetime_to_seconds)
full['q1_ms'] = convert_q1(q1_text)
full.head()

In [None]:
# Histogram of `q1_ms`. The placeholder title 'title' is replaced with a descriptive one,
# matching the other plots. Values that datetime_to_seconds treated as missing are stored as 0.
px.histogram(full, x='q1_ms', title='Distribución tiempo Q1 (ms)')

In [None]:
# Grouped histogram of `q1_ms`, one bar series per target class.
# `labels` takes a dict of column name -> display name, so the list that was passed here
# could not name the legend entries. Map the 0/1 target to readable categories and
# colour by that column instead.
podium_class = full['is_podium'].map({1: 'Podio', 0: 'No podio'})
px.histogram(full.assign(podio=podium_class), x='q1_ms', color='podio', title='Distribución',
             barmode='group', labels={'podio': 'Resultado'})

In [None]:
# Cast `q2` to text (TODO: check whether this cast is actually needed)
q2_text = full['q2'].astype(str)
full['q2'] = q2_text
# Convert every value with datetime_to_seconds and store the result as `q2_ms`
convert_q2 = np.vectorize(datetime_to_seconds)
full['q2_ms'] = convert_q2(q2_text)
full.head()

In [None]:
# Histogram of `q2_ms`. The placeholder title 'title' is replaced with a descriptive one,
# matching the other plots. Values that datetime_to_seconds treated as missing are stored as 0.
px.histogram(full, x='q2_ms', title='Distribución tiempo Q2 (ms)')

In [None]:
# Grouped histogram of `q2_ms`, one bar series per target class.
# `labels` takes a dict of column name -> display name, so the list that was passed here
# could not name the legend entries. Map the 0/1 target to readable categories and
# colour by that column instead.
podium_class = full['is_podium'].map({1: 'Podio', 0: 'No podio'})
px.histogram(full.assign(podio=podium_class), x='q2_ms', color='podio', title='Distribución',
             barmode='group', labels={'podio': 'Resultado'})

In [None]:
# Cast `q3` to text (TODO: check whether this cast is actually needed)
q3_text = full['q3'].astype(str)
full['q3'] = q3_text
# Convert every value with datetime_to_seconds and store the result as `q3_ms`
convert_q3 = np.vectorize(datetime_to_seconds)
full['q3_ms'] = convert_q3(q3_text)
full.head()

In [None]:
# Histogram of `q3_ms`. The placeholder title 'title' is replaced with a descriptive one,
# matching the other plots. Values that datetime_to_seconds treated as missing are stored as 0.
px.histogram(full, x='q3_ms', title='Distribución tiempo Q3 (ms)')

In [None]:
# Grouped histogram of `q3_ms`, one bar series per target class.
# `labels` takes a dict of column name -> display name, so the list that was passed here
# could not name the legend entries. Map the 0/1 target to readable categories and
# colour by that column instead.
podium_class = full['is_podium'].map({1: 'Podio', 0: 'No podio'})
px.histogram(full.assign(podio=podium_class), x='q3_ms', color='podio', title='Distribución',
             barmode='group', labels={'podio': 'Resultado'})