In [None]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,  roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl.metadata (89 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl.metadata (60 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl (12.6 MB)
Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl (6.9 MB)
Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Using cached pytz-2025.2-py2.py3-none-a

In [3]:
# Cargamos los datasets principales a usar.
results = pd.read_csv('results.csv')
races = pd.read_csv('races.csv')
drivers = pd.read_csv('drivers.csv')
constructors = pd.read_csv('constructors.csv')
qualifying = pd.read_csv('qualifying.csv')
status = pd.read_csv('status.csv')

In [4]:
# Inspeccionar las columnas y las primeras filas de cada dataset
print("Resultados:")
print(results.head())
print(results.columns)

print("\nCarreras:")
print(races.head())
print(races.columns)

print("\nPilotos:")
print(drivers.head())
print(drivers.columns)

print("\nConstructores:")
print(constructors.head())
print(constructors.columns)

print("\nClasificación:")
print(qualifying.head())
print(qualifying.columns)

print("\nEstado:")
print(status.head())
print(status.columns)

Resultados:
   resultId  raceId  driverId  constructorId number  grid position  \
0         1      18         1              1     22     1        1   
1         2      18         2              2      3     5        2   
2         3      18         3              3      7     7        3   
3         4      18         4              4      5    11        4   
4         5      18         5              1     23     3        5   

  positionText  positionOrder  points  laps         time milliseconds  \
0            1              1    10.0    58  1:34:50.616      5690616   
1            2              2     8.0    58       +5.478      5696094   
2            3              3     6.0    58       +8.163      5698779   
3            4              4     5.0    58      +17.181      5707797   
4            5              5     4.0    58      +18.014      5708630   

  fastestLap rank fastestLapTime fastestLapSpeed  statusId  
0         39    2       1:27.452         218.300         1  
1     

In [5]:
#Combinamos los resultados de las carreras.
merged_df = pd.merge(results, races, on='raceId', how='inner')

# Combinamos con los pilotos.
merged_df = pd.merge(merged_df, drivers, on='driverId', how='inner')

# Combinamos con los constructores.
merged_df = pd.merge(merged_df, constructors, on='constructorId', how='inner')

#Verificamos el dataframe combinado.
print(merged_df.head())
print(merged_df.columns)

   resultId  raceId  driverId  constructorId number_x  grid position  \
0         1      18         1              1       22     1        1   
1         2      18         2              2        3     5        2   
2         3      18         3              3        7     7        3   
3         4      18         4              4        5    11        4   
4         5      18         5              1       23     3        5   

  positionText  positionOrder  points  ...  code  forename     surname  \
0            1              1    10.0  ...   HAM     Lewis    Hamilton   
1            2              2     8.0  ...   HEI      Nick    Heidfeld   
2            3              3     6.0  ...   ROS      Nico     Rosberg   
3            4              4     5.0  ...   ALO  Fernando      Alonso   
4            5              5     4.0  ...   KOV    Heikki  Kovalainen   

          dob nationality_x                                           url_y  \
0  1985-01-07       British     http://en.w

In [6]:
merged_df['name_x'].head()

0    Australian Grand Prix
1    Australian Grand Prix
2    Australian Grand Prix
3    Australian Grand Prix
4    Australian Grand Prix
Name: name_x, dtype: object

In [7]:
# Seleccionamos las columnas relevantes para el modelo.
df_model = merged_df[[
    'raceId', 'driverId', 'constructorId', 'grid', 'position','points',
    'name_x', 'name_y', 'circuitId', 'surname', 'positionOrder'
]].copy()

# Renombramos columnas para mayor claridad. 
df_model.rename(columns={
    'name_x': 'circuit_name',
    'surname': 'driver_name',
    'name_y': 'team_name'
}, inplace=True)

df_model.head()

Unnamed: 0,raceId,driverId,constructorId,grid,position,points,circuit_name,team_name,circuitId,driver_name,positionOrder
0,18,1,1,1,1,10.0,Australian Grand Prix,McLaren,1,Hamilton,1
1,18,2,2,5,2,8.0,Australian Grand Prix,BMW Sauber,1,Heidfeld,2
2,18,3,3,7,3,6.0,Australian Grand Prix,Williams,1,Rosberg,3
3,18,4,4,11,4,5.0,Australian Grand Prix,Renault,1,Alonso,4
4,18,5,1,3,5,4.0,Australian Grand Prix,McLaren,1,Kovalainen,5


In [8]:
# Creamos la variable objetivo.
df_model['podium'] = df_model['positionOrder'].apply(lambda x: 1 if x in [1,2,3] else 0)

In [9]:
# Codificar los nombres de pilotos, equipos y circuitos
le_driver = LabelEncoder()
le_team = LabelEncoder()
le_circuit = LabelEncoder()

df_model['driver_encoded'] = le_driver.fit_transform(df_model['driver_name'])
df_model['team_encoded'] = le_team.fit_transform(df_model['team_name'])
df_model['circuit_encoded'] = le_circuit.fit_transform(df_model['circuit_name'])

# Historial del piloto en el circuito
pilot_history = df_model.groupby(['driver_encoded', 'circuit_encoded'])['positionOrder'].mean().reset_index()
pilot_history.rename(columns={'positionOrder': 'avg_position'}, inplace=True)
df_model = df_model.merge(pilot_history, on=['driver_encoded', 'circuit_encoded'], how='left')

# Historial del equipo en el circuito
team_history = df_model.groupby(['team_encoded', 'circuit_encoded'])['positionOrder'].mean().reset_index()
team_history.rename(columns={'positionOrder': 'team_avg_position'}, inplace=True)
df_model = df_model.merge(team_history, on=['team_encoded', 'circuit_encoded'], how='left')

# Escalar la columna 'points'
scaler = StandardScaler()
df_model['points_scaled'] = scaler.fit_transform(df_model[['points']])
df_model['grid_scaled'] = scaler.fit_transform(df_model[['grid']])

#Aumentar el peso de pasion de salida
df_model['grid_weighted'] = df_model['grid_scaled'] * 10
df_model['points_scaled'] = df_model['points_scaled']/10

In [10]:
# Crear una interacción entre grid y avg_position (ya existente)
df_model['grid_avg_interaction'] = df_model['grid_scaled'] * df_model['avg_position']

# Crear nuevas interacciones
df_model['grid_weighted_squared'] = df_model['grid_scaled'] ** 5  # Aumentar el peso de grid_scaled
df_model['grid_points_interaction'] = df_model['grid_scaled'] * df_model['points_scaled']  # Interacción entre grid_scaled y points_scaled
df_model['grid_weighted_inverse'] = 1 / (df_model['grid_scaled'] + 1)  # Inverso de grid_scaled para capturar no linealidades

# Reducir el peso de points_scaled
df_model['points_scaled_log'] = np.log1p(df_model['points_scaled'])  # Logaritmo de points_scaled para reducir su impacto

#Nuevas interacciones relacionadas con el historial del piloto en cada cirucito.
df_model['pilot_circuit_experience'] = df_model['avg_position'] / (df_model['team_avg_position'] +1)
df_model['grid_pilot_circuit_interaction'] = df_model['grid_scaled'] *df_model['pilot_circuit_experience']

# Verificar las nuevas columnas
print(df_model[['grid_scaled', 'points_scaled', 'grid_avg_interaction', 'grid_weighted_squared', 
                'grid_points_interaction', 'grid_weighted_inverse', 'points_scaled_log', 
                'pilot_circuit_experience', 'grid_pilot_circuit_interaction']].head())

   grid_scaled  points_scaled  grid_avg_interaction  grid_weighted_squared  \
0    -1.407078       0.184145             -8.090698          -5.515575e+00   
1    -0.851733       0.138180             -8.827047          -4.482459e-01   
2    -0.574060       0.092214             -4.857430          -6.234278e-02   
3    -0.018715       0.069232             -0.122137          -2.295605e-09   
4    -1.129405       0.046249            -15.623440          -1.837592e+00   

   grid_points_interaction  grid_weighted_inverse  points_scaled_log  \
0                -0.259106              -2.456532           0.169021   
1                -0.117692               6.744571           0.129430   
2                -0.052937               2.347748           0.088207   
3                -0.001296               1.019071           0.066941   
4                -0.052234              -7.727659           0.045212   

   pilot_circuit_experience  grid_pilot_circuit_interaction  
0                  0.581117         

In [11]:
# Actualizar X con las nuevas interacciones
X = df_model[['grid_scaled', 'avg_position', 'team_avg_position',
                'grid_avg_interaction', 'driver_encoded','team_encoded', 'circuit_encoded',
                'grid_weighted_squared', 'grid_points_interaction', 'grid_weighted_inverse', 
              'pilot_circuit_experience', 'grid_pilot_circuit_interaction']]
y = df_model['podium']

In [12]:
from sklearn.preprocessing import StandardScaler

 # Escalar las características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Guardar el escalador
joblib.dump(scaler, 'scaler.pkl')
print("Escalador actualizado y guardado como 'scaler.pkl'")

Escalador actualizado y guardado como 'scaler.pkl'


In [13]:
# Dividimos los datos en train y test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# configuracion de GridSeachCV
model = RandomForestClassifier(random_state=42)
model.fit(X_scaled, y)


In [14]:
# Calcular el AUC
y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc:.2f}")

ROC AUC: 0.83




In [15]:
feature_importances = model.feature_importances_
features = ['grid_scaled', 'avg_position', 'team_avg_position', 'grid_avg_interaction', 
              'grid_weighted_squared', 'grid_weighted_inverse', 
              'pilot_circuit_experience', 'grid_pilot_circuit_interaction', 
              'driver_encoded', 'team_encoded', 'circuit_encoded']
for feature, importance in zip(features, feature_importances):
    print(f"{feature}: {importance:.4f}")

grid_scaled: 0.0480
avg_position: 0.0578
team_avg_position: 0.0515
grid_avg_interaction: 0.0472
grid_weighted_squared: 0.0357
grid_weighted_inverse: 0.0353
pilot_circuit_experience: 0.0237
grid_pilot_circuit_interaction: 0.0498
driver_encoded: 0.4786
team_encoded: 0.0957
circuit_encoded: 0.0346


In [16]:
correlation = df_model[['points_scaled_log', 'podium']].corr()
print(correlation)

                   points_scaled_log    podium
points_scaled_log           1.000000  0.719062
podium                      0.719062  1.000000


In [None]:
joblib.dump(model, 'f1_predictions_model.pkl')

['f1_predictions_model.pkl']