In [1]:
#Predicción de la clasificación final de pilotos de F1

#El objetivo de este proyecto es predecir la posición final de los pilotos en el campeonato de F1 basado en el rendimiento de carreras pasadas , voy a realizar un análisis de datos históricos de pilotos y sus equipos para crear un modelo que prediga el ranking final de la temporada. 

Python 3.10.12


In [52]:
#Imports

In [53]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


In [54]:
#Ergest APi Request

In [55]:
import requests

def get_race_results(season):
    url = f"http://ergast.com/api/f1/{season}/results.json?limit=1000"
    response = requests.get(url)
    
    # Verificamos que la solicitud sea exitosa
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print(f"Error {response.status_code}")
        return None

# Probar 2022
data_2022 = get_race_results(2022)


In [56]:
import json

# estructura
print(json.dumps(data_2022, indent=4))


{
    "MRData": {
        "xmlns": "http://ergast.com/mrd/1.5",
        "series": "f1",
        "url": "http://ergast.com/api/f1/2022/results.json",
        "limit": "100",
        "offset": "0",
        "total": "440",
        "RaceTable": {
            "season": "2022",
            "Races": [
                {
                    "season": "2022",
                    "round": "1",
                    "url": "http://en.wikipedia.org/wiki/2022_Bahrain_Grand_Prix",
                    "raceName": "Bahrain Grand Prix",
                    "Circuit": {
                        "circuitId": "bahrain",
                        "url": "http://en.wikipedia.org/wiki/Bahrain_International_Circuit",
                        "circuitName": "Bahrain International Circuit",
                        "Location": {
                            "lat": "26.0325",
                            "long": "50.5106",
                            "locality": "Sakhir",
                            "country": "Bahrain"
 

In [57]:
#Extraer y Organizar Datos


In [58]:
#  DataFrame
def process_race_results(data):
    races = data['MRData']['RaceTable']['Races']
    results = []
    
    for race in races:
        race_name = race['raceName']
        date = race['date']
        
        for result in race['Results']:
            driver = result['Driver']['familyName']
            constructor = result['Constructor']['name']
            position = int(result['position'])
            points = float(result['points'])
            
            results.append({
                'race': race_name,
                'date': date,
                'driver': driver,
                'constructor': constructor,
                'position': position,
                'points': points
            })
    
    # Convertimos los resultados en un DataFrame
    return pd.DataFrame(results)

# Procesamos los datos de la temporada 2022
df_2022 = process_race_results(data_2022)
df_2022.head()


Unnamed: 0,race,date,driver,constructor,position,points
0,Bahrain Grand Prix,2022-03-20,Leclerc,Ferrari,1,26.0
1,Bahrain Grand Prix,2022-03-20,Sainz,Ferrari,2,18.0
2,Bahrain Grand Prix,2022-03-20,Hamilton,Mercedes,3,15.0
3,Bahrain Grand Prix,2022-03-20,Russell,Mercedes,4,12.0
4,Bahrain Grand Prix,2022-03-20,Magnussen,Haas F1 Team,5,10.0


In [59]:
#Agregar y calcular caracteristicas ütiles(Puntos acumulados, posicion promedio DNF)

In [60]:
# puntos piloto
df_points = df_2022.groupby('driver')['points'].sum().reset_index()
df_points.columns = ['driver', 'total_points']

# posición promedio 
df_position_avg = df_2022.groupby('driver')['position'].mean().reset_index()
df_position_avg.columns = ['driver', 'average_position']

# DNF
df_dnf = df_2022[df_2022['position'] > 20].groupby('driver').size().reset_index(name='dnf_count')

# unir al Dataframe
df_features = df_points.merge(df_position_avg, on='driver').merge(df_dnf, on='driver', how='left')
df_features['dnf_count'] = df_features['dnf_count'].fillna(0)  # Reemplazamos NaN por 0 en los abandonos
df_features.head()


Unnamed: 0,driver,total_points,average_position,dnf_count
0,Albon,3.0,11.4,0.0
1,Alonso,2.0,14.4,0.0
2,Bottas,28.0,8.2,0.0
3,Gasly,6.0,13.4,0.0
4,Hamilton,36.0,7.2,0.0


In [61]:
#Crear conjunto de entrenamiento y prueba

In [62]:
from sklearn.model_selection import train_test_split

# Definimos nuestras variables
X = df_features[['total_points', 'average_position', 'dnf_count']]
y = df_features['total_points']  

# Dividimos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [63]:
#Entrenar Modelo Básico


In [64]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Inicializamos los modelos
linear_model = LinearRegression()
random_forest = RandomForestRegressor(random_state=42)

# Entrenamos los modelos
linear_model.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Hacemos predicciones y calculamos el error
y_pred_linear = linear_model.predict(X_test)
y_pred_rf = random_forest.predict(X_test)

# Calculamos el MAE para evaluar el rendimiento
mae_linear = mean_absolute_error(y_test, y_pred_linear)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"MAE (Regresión Lineal): {mae_linear}")
print(f"MAE (Random Forest): {mae_rf}")


MAE (Regresión Lineal): 2.5182085316663563e-14
MAE (Random Forest): 2.2660000000000005


In [65]:
# Verificación de la separacion de datos


In [66]:
# Inspeccionamos el conjunto de entrenamiento
print("Conjunto de Entrenamiento:")
print(X_train.head())

# Inspeccionamos el conjunto de prueba
print("\nConjunto de Prueba:")
print(X_test.head())

entrenamiento_prueba_comun = set(X_train.index).intersection(set(X_test.index))
print(f"\nNúmero de elementos en común entre entrenamiento y prueba: {len(entrenamiento_prueba_comun)}")


Conjunto de Entrenamiento:
    total_points  average_position  dnf_count
5            0.0              14.5        0.0
11          60.0               6.0        0.0
3            6.0              13.4        0.0
18          77.0               8.0        0.0
16           2.0              11.4        0.0

Conjunto de Prueba:
    total_points  average_position  dnf_count
0            3.0              11.4        0.0
17          10.0              12.2        0.0
15           0.0              15.2        0.0
1            2.0              14.4        0.0
8           14.0              10.6        0.0

Número de elementos en común entre entrenamiento y prueba: 0


In [67]:
#Revisar la coorrelacion de caracteristicas

In [68]:
# Matriz de correlación 
df_corr = df_features[['total_points', 'average_position', 'dnf_count']]
correlation_matrix = df_corr.corr()
print("Matriz de correlación:")
print(correlation_matrix)


Matriz de correlación:
                  total_points  average_position  dnf_count
total_points          1.000000         -0.891058        NaN
average_position     -0.891058          1.000000        NaN
dnf_count                  NaN               NaN        NaN


In [69]:
#random forest y ridge regression

In [70]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Inicializamos el modelo de Ridge con un valor alfa que controla la regularización
ridge_model = Ridge(alpha=1.0)
random_forest = RandomForestRegressor(random_state=42)

# Entrenamos ambos modelos
ridge_model.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Hacemos predicciones y calculamos el MAE para ambos modelos
y_pred_ridge = ridge_model.predict(X_test)
y_pred_rf = random_forest.predict(X_test)

mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"MAE (Ridge Regression): {mae_ridge}")
print(f"MAE (Random Forest): {mae_rf}")


MAE (Ridge Regression): 0.0028532476315296608
MAE (Random Forest): 2.2660000000000005


In [71]:
#validación cruzada de Random Forest

In [72]:
from sklearn.model_selection import cross_val_score

# Realizamos validación cruzada para el modelo de Random Forest
scores_rf = cross_val_score(random_forest, X, y, cv=5, scoring='neg_mean_absolute_error')
print(f"MAE promedio (Random Forest, CV): {-scores_rf.mean()}")


MAE promedio (Random Forest, CV): 6.1331999999999995


In [73]:
#Ontencion da datos de Más temporadas

In [74]:
# Función para obtener datos de varias temporadas
def get_multiple_season_data(start_season, end_season):
    all_results = []
    for season in range(start_season, end_season + 1):
        data = get_race_results(season)
        if data:
            df_season = process_race_results(data)
            df_season['season'] = season  # Agregar columna de temporada
            all_results.append(df_season)
    return pd.concat(all_results, ignore_index=True)

# Obtener datos desde 2015 a 2022
df_all_seasons = get_multiple_season_data(2015, 2022)
df_all_seasons.head()


Unnamed: 0,race,date,driver,constructor,position,points,season
0,Australian Grand Prix,2015-03-15,Hamilton,Mercedes,1,25.0,2015
1,Australian Grand Prix,2015-03-15,Rosberg,Mercedes,2,18.0,2015
2,Australian Grand Prix,2015-03-15,Vettel,Ferrari,3,15.0,2015
3,Australian Grand Prix,2015-03-15,Massa,Williams,4,12.0,2015
4,Australian Grand Prix,2015-03-15,Nasr,Sauber,5,10.0,2015


In [75]:
#Calcular  average position y  dnf count para cada piloto en cada temporada

In [76]:
# Calcular la posición promedio y el conteo de DNF para cada piloto en cada temporada
df_all_seasons['dnf_count'] = df_all_seasons['position'].apply(lambda x: 1 if x > 20 else 0)  # Asumimos que posiciones > 20 son DNF
df_features_seasonal = df_all_seasons.groupby(['season', 'driver']).agg({
    'points': 'sum',
    'position': 'mean',
    'dnf_count': 'sum'
}).reset_index()

# Cambiamos el nombre de la columna 'position' a 'average_position'
df_features_seasonal.rename(columns={'position': 'average_position'}, inplace=True)

# Calculamos la posición final en el campeonato para cada temporada
df_features_seasonal['final_position'] = df_features_seasonal.groupby('season')['points'].rank(ascending=False)

df_features_seasonal.head()


Unnamed: 0,season,driver,points,average_position,dnf_count,final_position
0,2015,Alonso,0.0,15.25,0,18.5
1,2015,Bottas,42.0,7.4,0,5.0
2,2015,Button,0.0,15.6,0,18.5
3,2015,Ericsson,5.0,13.0,0,14.0
4,2015,Grosjean,16.0,9.4,0,8.0


In [77]:
#Entrenamiento del Modelo de predicción

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Definimos X e y
X = df_features_seasonal[['points', 'average_position', 'dnf_count']]
y = df_features_seasonal['final_position']

# Dividimos los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inicializamos el modelo de Random Forest
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)

# Realizamos predicciones y calculamos el MAE para evaluar el rendimiento
y_pred_rf = random_forest.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"MAE (Random Forest): {mae_rf}")


MAE (Random Forest): 0.7347794117647058


In [79]:
#Obtener datos de 2023 y 2024

In [80]:
# Obtener datos desde 2015 hasta 2024
df_all_seasons = get_multiple_season_data(2015, 2024)

# Volver a calcular las características como hicimos antes
df_all_seasons['dnf_count'] = df_all_seasons['position'].apply(lambda x: 1 if x > 20 else 0)  # Asumimos que posiciones > 20 son DNF
df_features_seasonal = df_all_seasons.groupby(['season', 'driver']).agg({
    'points': 'sum',
    'position': 'mean',
    'dnf_count': 'sum'
}).reset_index()

# Cambiamos el nombre de la columna 'position' a 'average_position'
df_features_seasonal.rename(columns={'position': 'average_position'}, inplace=True)

# Calculamos la posición final en el campeonato para cada temporada
df_features_seasonal['final_position'] = df_features_seasonal.groupby('season')['points'].rank(ascending=False)

df_features_seasonal.head()


Unnamed: 0,season,driver,points,average_position,dnf_count,final_position
0,2015,Alonso,0.0,15.25,0,18.5
1,2015,Bottas,42.0,7.4,0,5.0
2,2015,Button,0.0,15.6,0,18.5
3,2015,Ericsson,5.0,13.0,0,14.0
4,2015,Grosjean,16.0,9.4,0,8.0


In [81]:
#Reentrenar el modelo con los nuevos datos

In [82]:
# Definir X e y nuevamente
X = df_features_seasonal[['points', 'average_position', 'dnf_count']]
y = df_features_seasonal['final_position']

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo de Random Forest
random_forest.fit(X_train, y_train)

# Evaluar el modelo con MAE
y_pred_rf = random_forest.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"MAE (Random Forest): {mae_rf}")


MAE (Random Forest): 1.028467970521542


In [83]:
#Calcular datos promedio de cada piloto

In [84]:
# Calcular los datos promedio para cada piloto hasta la última temporada
df_pilot_avg = df_features_seasonal.groupby('driver').agg({
    'points': 'mean',
    'average_position': 'mean',
    'dnf_count': 'mean'
}).reset_index()

df_pilot_avg.head()


Unnamed: 0,driver,points,average_position,dnf_count
0,Albon,8.6,11.6,0.0
1,Alonso,18.75,11.03125,0.125
2,Bearman,6.0,7.0,0.0
3,Bottas,44.9,7.96,0.0
4,Button,1.5,14.4,0.0


In [85]:
#Modificacion de la funcion para usar el nombre del piloto

In [86]:
# Nueva función de predicción basada en el nombre del piloto
def predict_final_position(pilot_name):
    # Obtener los datos del piloto
    pilot_data = df_pilot_avg[df_pilot_avg['driver'] == pilot_name]
    if pilot_data.empty:
        return "Piloto no encontrado."
    
    # Extraer puntos, posición promedio y conteo de DNF
    input_data = pilot_data[['points', 'average_position', 'dnf_count']].values
    prediction = random_forest.predict(input_data)
    return round(prediction[0])


In [87]:
#Obtener lista de pilotos activos en 2024

In [88]:
import requests

def get_current_drivers(season=2024):
    url = f"http://ergast.com/api/f1/{season}/drivers.json"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        drivers = [driver['familyName'] for driver in data['MRData']['DriverTable']['Drivers']]
        return drivers
    else:
        print(f"Error {response.status_code}")
        return []

# Obtener la lista de pilotos activos en 2024
active_drivers_2024 = get_current_drivers()
print(active_drivers_2024)


['Albon', 'Alonso', 'Bearman', 'Bottas', 'Colapinto', 'Gasly', 'Hamilton', 'Hülkenberg', 'Lawson', 'Leclerc', 'Magnussen', 'Norris', 'Ocon', 'Pérez', 'Piastri', 'Ricciardo', 'Russell', 'Sainz', 'Sargeant', 'Stroll', 'Tsunoda', 'Verstappen', 'Zhou']


In [89]:
#Filtrar el Dataframe de pilotos

In [90]:
# Filtrar df_pilot_avg para incluir solo los pilotos activos en 2024
df_pilot_avg = df_pilot_avg[df_pilot_avg['driver'].isin(active_drivers_2024)]

# Actualizar la lista de nombres de pilotos para el dropdown
pilot_names = df_pilot_avg['driver'].tolist()


In [91]:
!pip install gradio


Defaulting to user installation because normal site-packages is not writeable


In [92]:
#interfaz de  Gradio

In [103]:
import gradio as gr

# Interfaz de Gradio actualizada
interface = gr.Interface(
    fn=predict_final_position,
    inputs=gr.Dropdown(choices=pilot_names, label="Selecciona un piloto"),
    outputs="number",
    title="F1 Final Position Predictor",
    description="Selecciona un piloto para predecir su posición final en la temporada 2024."
)

# Ejecutar la interfaz
interface.launch()


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




