In [None]:
import sys
import os
import pandas as pd
from pathlib import Path
import mlflow
from catboost import CatBoostRegressor, Pool

# Put the parent of the current working directory on the import path so the
# project-level `config` module imported below can be resolved.
# (A previous, never-used `sys_path` variable computed the same path; it was
# dead code and has been removed.)
project_root = Path(os.path.abspath(os.path.join('..')))
sys.path.insert(0, str(project_root))

from config import PROJECT_ROOT, DATA_RAW_DIR, SEASONS, START_YEAR, END_YEAR, TEST_SIZE, RANDOM_STATE, MODELS_DIR

# All runs logged by this notebook are grouped under one MLflow experiment.
# mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("F1_Advanced_Analysis")

#import plotly.express as px
#import plotly.graph_objects as go

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, ParameterGrid
from xgboost import XGBRegressor
#import numpy as np
# Load the raw race results for the configured season range.
df = pd.read_csv(DATA_RAW_DIR / f"all_races_{START_YEAR}_{END_YEAR}.csv")
# Print column dtypes / non-null counts first, then leave `df.head()` as the
# last expression of the cell: a notebook only renders the value of the final
# expression, so a `head()` call in the middle of the cell is silently discarded.
df.info()
df.head()

In [None]:
# Keep the raw frame `df` untouched; all cleaning happens on an independent deep copy.
df_clean = df.copy(deep=True)

In [None]:
# Start from a fresh copy of the raw data so this cell can be re-run safely.
df_clean = df.copy()

# Rows with GridPosition == 0 are re-numbered to the first free slot at or
# after this position (presumably these are cars that did not take a regular
# grid slot, e.g. pit-lane starts — TODO confirm against the data source).
FIRST_REPLACEMENT_POSITION = 21

for (year, gp_name) in df_clean.groupby(['Year', 'Grand Prix']).groups.keys():
    # Rows belonging to this specific year and Grand Prix
    mask = (df_clean['Year'] == year) & (df_clean['Grand Prix'] == gp_name)
    zero_positions = df_clean.loc[mask & (df_clean['GridPosition'] == 0)].index

    if len(zero_positions) == 0:
        continue

    # Positions already occupied in this race; a replacement must never
    # collide with one of them or with another replacement.
    taken_positions = set(df_clean.loc[mask, 'GridPosition'])
    assigned_positions = []
    candidate = FIRST_REPLACEMENT_POSITION

    for idx in zero_positions:
        # Walk upward until a free slot is found. Unlike a fixed [21, 22]
        # candidate list this cannot run out of slots (which previously raised
        # IndexError when both were occupied) and never hands the same
        # position to two cars when there are more zeros than candidates.
        while candidate in taken_positions:
            candidate += 1
        df_clean.loc[idx, 'GridPosition'] = candidate
        taken_positions.add(candidate)
        assigned_positions.append(candidate)

    print(f"{year} {gp_name}: заменили {len(zero_positions)} нулей → {assigned_positions}")


2022 Austrian Grand Prix: заменили 1 нулей → [21]
2022 Belgian Grand Prix: заменили 2 нулей → [21, 22]
2022 Emilia Romagna Grand Prix: заменили 1 нулей → [21]
2022 Hungarian Grand Prix: заменили 1 нулей → [21]
2022 Miami Grand Prix: заменили 2 нулей → [21, 22]
2022 Singapore Grand Prix: заменили 1 нулей → [21]
2022 São Paulo Grand Prix: заменили 1 нулей → [21]
2022 United States Grand Prix: заменили 1 нулей → [21]
2023 Dutch Grand Prix: заменили 1 нулей → [21]
2023 Mexico City Grand Prix: заменили 1 нулей → [21]
2024 Mexico City Grand Prix: заменили 1 нулей → [21]


In [None]:
# Coerce 'ClassifiedPosition' to numbers (non-numeric entries become NaN)
# and keep only the rows where the conversion succeeded.
df_clean = (
    df_clean
    .assign(ClassifiedPosition=pd.to_numeric(df_clean['ClassifiedPosition'], errors='coerce'))
    .dropna(subset=['ClassifiedPosition'])
)

In [None]:
# Snapshot the cleaned frame before one-hot encoding: CatBoost is trained on
# the raw categorical columns, so it needs its own independent deep copy.
df_clean_cat = df_clean.copy(deep=True)

In [None]:
# One-hot encode the categorical columns one after another, dropping the first
# level of each. A prefix of None keeps pandas' default (the column name),
# which is what the 'FullName' encoding (added later than the other two) uses.
for column, prefix in (('TeamName', 'Team'), ('Grand Prix', 'GP'), ('FullName', None)):
    df_clean = pd.get_dummies(df_clean, columns=[column], drop_first=True, prefix=prefix)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1636 entries, 0 to 1857
Data columns (total 76 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Position                        1636 non-null   float64
 1   GridPosition                    1636 non-null   float64
 2   Status                          1636 non-null   object 
 3   Points                          1636 non-null   float64
 4   ClassifiedPosition              1636 non-null   float64
 5   Laps                            1636 non-null   float64
 6   Time                            1512 non-null   object 
 7   Year                            1636 non-null   int64  
 8   Team_AlphaTauri                 1636 non-null   bool   
 9   Team_Alpine                     1636 non-null   bool   
 10  Team_Aston Martin               1636 non-null   bool   
 11  Team_Ferrari                    1636 non-null   bool   
 12  Team_Haas F1 Team               1636 no

In [None]:
# Convert the boolean dummy columns produced by one-hot encoding to integers.
bools_cols = df_clean.select_dtypes(include=['bool']).columns
df_clean[bools_cols] = df_clean[bools_cols].astype('int64')

# No frame-wide `replace({True: 1, False: 0})` is needed afterwards: the cast
# above already covers every bool column, and such a replace would rescan the
# whole frame and compare 1/0 against True/False in the numeric columns too.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1636 entries, 0 to 1857
Data columns (total 76 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Position                        1636 non-null   float64
 1   GridPosition                    1636 non-null   float64
 2   Status                          1636 non-null   object 
 3   Points                          1636 non-null   float64
 4   ClassifiedPosition              1636 non-null   float64
 5   Laps                            1636 non-null   float64
 6   Time                            1512 non-null   object 
 7   Year                            1636 non-null   int64  
 8   Team_AlphaTauri                 1636 non-null   int64  
 9   Team_Alpine                     1636 non-null   int64  
 10  Team_Aston Martin               1636 non-null   int64  
 11  Team_Ferrari                    1636 non-null   int64  
 12  Team_Haas F1 Team               1636 no

In [None]:
# Final frame for the LinearRegression / RF / XGB models: the encoded data
# without the 'Time', 'Status', 'Laps' and 'Points' columns.
df_final = df_clean.drop(columns=['Time', 'Status', 'Laps', 'Points'])

In [None]:
# Target: the 'Position' column. Features: every remaining column except
# 'Position' itself and 'ClassifiedPosition'.
y = df_final.loc[:, 'Position']
x = df_final.drop(columns=['Position', 'ClassifiedPosition'])

In [None]:
# Year-based hold-out: seasons 2022-2024 (inclusive) train the models,
# season 2025 is the test set.
train_mask = (df_final['Year'] >= 2022) & (df_final['Year'] <= 2024)
test_mask = df_final['Year'].eq(2025)

x_train, x_test = x.loc[train_mask], x.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

# Original row labels of each partition.
indices_train = df_final.loc[train_mask].index
indices_test = df_final.loc[test_mask].index

In [None]:
# Baseline: fit a plain linear regression on the training years, score it on
# the test year and record everything in a single MLflow run.
LinearRegression_model_v2 = LinearRegression()
with mlflow.start_run(run_name="LinearRegression_YearSplit"):
    # Run description, logged in one batch.
    mlflow.log_params({
        "model": "LinearRegression",
        "split_type": "year_split",
        "train_years": "2022-2024",
        "test_year": "2025",
    })

    LinearRegression_model_v2.fit(x_train, y_train)
    LinearRegression_pred = LinearRegression_model_v2.predict(x_test)

    # Hold-out metrics on the test year.
    LinearRegression_mae_v2 = mean_absolute_error(y_test, LinearRegression_pred)
    LinearRegression_r2_v2 = r2_score(y_test, LinearRegression_pred)
    mlflow.log_metrics({
        "LinearRegression_mae_v2": LinearRegression_mae_v2,
        "LinearRegression_r2_v2": LinearRegression_r2_v2,
    })

    mlflow.sklearn.log_model(LinearRegression_model_v2, name="LinearRegression_model_v2")

    print(f"MAE: {LinearRegression_mae_v2:.4f}, R²: {LinearRegression_r2_v2:.4f}")

MAE: 2.9771, R²: 0.4943


In [None]:
# CatBoost frame: remove the columns that are not used as model inputs and
# name the columns CatBoost must treat as categorical.
cat_features = ['FullName', 'TeamName', 'Grand Prix']
df_clean_cat = df_clean_cat.drop(columns=['Status', 'Points', 'Time', 'ClassifiedPosition', 'Laps'])
df_clean_cat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1636 entries, 0 to 1857
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Position      1636 non-null   float64
 1   GridPosition  1636 non-null   float64
 2   FullName      1636 non-null   object 
 3   TeamName      1636 non-null   object 
 4   Year          1636 non-null   int64  
 5   Grand Prix    1636 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 89.5+ KB


In [None]:
# Define features and target variable for CatBoost
x_cat = df_clean_cat.drop(columns=['Position'])
y_cat = df_clean_cat['Position']

# Same year-based hold-out as the other models: 2022-2024 train, 2025 test.
train_mask_cat = df_clean_cat['Year'].between(2022, 2024)
test_mask_cat = df_clean_cat['Year'] == 2025

x_train_cat = x_cat[train_mask_cat]
x_test_cat = x_cat[test_mask_cat]

# Targets must be selected with the masks built from df_clean_cat itself.
# Using masks derived from another frame only works while both frames share
# exactly the same index, and silently couples this cell to a different cell.
y_train_cat = y_cat[train_mask_cat]
y_test_cat = y_cat[test_mask_cat]

indices_train_cat = df_clean_cat[train_mask_cat].index
indices_test_cat = df_clean_cat[test_mask_cat].index

In [16]:
# Random Forest: grid search scored by (negated) MAE over 3 time-ordered CV
# splits, then evaluation of the refit best estimator on the test year.
# NOTE(review): TimeSeriesSplit assumes the rows of x_train are in
# chronological order — confirm for this dataset.
tscv = TimeSeriesSplit(n_splits=3)
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)
rf_grid_v1 = GridSearchCV(
    estimator=RandomForestRegressor(random_state=RANDOM_STATE),
    param_grid=rf_params,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=-1,
).fit(x_train, y_train)
best_rf = rf_grid_v1.best_estimator_

with mlflow.start_run(run_name="RandomForest_TimeSeriesCV_Best"):
    # Winning hyper-parameters plus run description; best_score_ is a negated
    # MAE, so the sign is flipped before logging.
    mlflow.log_params({
        **rf_grid_v1.best_params_,
        "cv_type": "TimeSeriesSplit_3",
        "cv_mae": -rf_grid_v1.best_score_,
        "model": "RandomForest_Best",
    })

    RF_tscv_pred = best_rf.predict(x_test)
    RF_tscv_mae = mean_absolute_error(y_test, RF_tscv_pred)
    RF_tscv_r2 = r2_score(y_test, RF_tscv_pred)
    mlflow.log_metrics({"RF_tscv_mae": RF_tscv_mae, "RF_tscv_r2": RF_tscv_r2})
    mlflow.sklearn.log_model(best_rf, name="rf_tscv_best")

    print(f"RF MAE: {RF_tscv_mae:.4f}, R²: {RF_tscv_r2:.4f}")

RF MAE: 2.5618, R²: 0.5752


In [17]:
# XGBoost: grid search scored by (negated) MAE over 3 time-ordered CV splits,
# then evaluation of the refit best estimator on the test year.
# NOTE(review): TimeSeriesSplit assumes the rows of x_train are in
# chronological order — confirm for this dataset.
tscv = TimeSeriesSplit(n_splits=3)
xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

xgb_grid_v1 = GridSearchCV(
    estimator=XGBRegressor(random_state=RANDOM_STATE, objective='reg:squarederror'),
    param_grid=xgb_params,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=-1,
).fit(x_train, y_train)
best_xgb = xgb_grid_v1.best_estimator_

with mlflow.start_run(run_name="XGBoost_TimeSeriesCV_Best"):
    # Winning hyper-parameters plus run description; best_score_ is a negated
    # MAE, so the sign is flipped before logging.
    mlflow.log_params({
        **xgb_grid_v1.best_params_,
        "cv_type": "TimeSeriesSplit_3",
        "cv_mae": -xgb_grid_v1.best_score_,
        "model": "XGBoost_Best",
    })

    xgb_tscv_pred = best_xgb.predict(x_test)
    xgb_tscv_mae = mean_absolute_error(y_test, xgb_tscv_pred)
    xgb_tscv_r2 = r2_score(y_test, xgb_tscv_pred)
    mlflow.log_metrics({"xgb_tscv_mae": xgb_tscv_mae, "xgb_tscv_r2": xgb_tscv_r2})
    mlflow.xgboost.log_model(best_xgb, name="xgb_tscv_best")

    print(f"XGB MAE: {xgb_tscv_mae:.4f}, R²: {xgb_tscv_r2:.4f}")



XGB MAE: 2.6278, R²: 0.5680


In [None]:
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np
import mlflow
import zlib

# Manual grid search for CatBoost: every parameter combination is evaluated on
# the same time-ordered CV splits and logged as a nested MLflow run under one
# parent run; the combination with the lowest mean validation MAE wins.
# NOTE(review): TimeSeriesSplit assumes the rows of x_train_cat are in
# chronological order — confirm for this dataset.

# Parameters and splits
cat_params = {
    'iterations': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}
param_grid = list(ParameterGrid(cat_params))
tscv = TimeSeriesSplit(n_splits=3)
# Materialize the splits once so every trial is scored on identical folds.
splits = list(tscv.split(x_train_cat))

# PARENT RUN
with mlflow.start_run(run_name="CatBoost_ParameterGrid_TSCV"):
    # n_splits is read from the splitter so the logged value cannot drift
    # from the value actually used.
    mlflow.log_params({"total_combinations": len(param_grid), "n_splits": tscv.n_splits, "cv_type": "TimeSeriesSplit"})

    best_mae = float('inf')
    best_params = None

    # Iterate over each combination of parameters
    for i, params in enumerate(param_grid):
        # Short tag derived from the parameter values. The built-in hash() of
        # a str is salted per interpreter process, so it would produce a
        # different run name after every kernel restart; CRC32 is stable.
        params_tag = zlib.crc32(str(params).encode("utf-8")) % 1000
        with mlflow.start_run(nested=True, run_name=f"trial_{i+1}_{params_tag}"):
            # Log parameters of THIS trial
            mlflow.log_params(params)
            mlflow.log_param("trial_number", i+1)

            fold_maes = []

            for fold, (train_idx, val_idx) in enumerate(splits):
                # The split indices are positional, hence .iloc.
                X_tr = x_train_cat.iloc[train_idx]
                y_tr = y_train_cat.iloc[train_idx]
                X_val = x_train_cat.iloc[val_idx]
                y_val = y_train_cat.iloc[val_idx]

                model = CatBoostRegressor(
                    cat_features=cat_features, random_seed=42, verbose=0,
                    early_stopping_rounds=50, loss_function='MAE', **params
                )
                # NOTE(review): the validation fold is used both as eval_set
                # (early stopping / best-iteration selection) and for scoring
                # below, so the reported CV MAE is likely optimistic.
                model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

                val_pred = model.predict(X_val)
                fold_mae = mean_absolute_error(y_val, val_pred)
                fold_maes.append(fold_mae)

            cv_mae = np.mean(fold_maes)

            # Log metrics of THIS trial
            mlflow.log_metric("cv_mae", cv_mae)
            for j, fold_mae in enumerate(fold_maes):
                mlflow.log_metric(f"fold_{j+1}_mae", fold_mae)

            print(f"Trial {i+1}: {params} → CV MAE: {cv_mae:.4f}")

            # Strict '<' keeps the earliest combination on ties.
            if cv_mae < best_mae:
                best_mae = cv_mae
                best_params = params.copy()

    # Log best info to parent run
    mlflow.log_param("best_params", best_params)
    mlflow.log_metric("best_cv_mae", best_mae)

# Final model (separate run): refit CatBoost on the whole training period with
# the best parameters found by the search, then score it on the test year.
with mlflow.start_run(run_name="CatBoost_Final_Best"):
    best_cat = CatBoostRegressor(
        cat_features=cat_features, random_seed=42, verbose=0,
        early_stopping_rounds=50, loss_function='MAE', **best_params
    )
    # NOTE(review): no eval_set is passed to this fit, so early_stopping_rounds
    # presumably has nothing to monitor and the model trains for the full
    # `iterations` (unlike the CV trials, which kept the best iteration) —
    # confirm against the CatBoost docs.
    best_cat.fit(x_train_cat, y_train_cat)

    pred = best_cat.predict(x_test_cat)
    test_mae = mean_absolute_error(y_test_cat, pred)
    test_r2 = r2_score(y_test_cat, pred)

    mlflow.log_params(best_params)
    mlflow.log_param("cv_mae", best_mae)
    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("test_r2", test_r2)
    # Pass the model name by keyword, as the sklearn / xgboost cells of this
    # notebook do, instead of relying on the second positional parameter.
    mlflow.catboost.log_model(best_cat, name="catboost_best")

    print(f"FINAL Test MAE: {test_mae:.4f}, R²: {test_r2:.4f}")


Trial 1: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.01} → CV MAE: 2.5891
Trial 2: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.05} → CV MAE: 2.5416
Trial 3: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.1} → CV MAE: 2.5555
Trial 4: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.01} → CV MAE: 2.5741
Trial 5: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.05} → CV MAE: 2.5754
Trial 6: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.1} → CV MAE: 2.5243
Trial 7: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.01} → CV MAE: 2.5791
Trial 8: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.05} → CV MAE: 2.5849
Trial 9: {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 5, 'learning_rate': 0.1} → CV MAE: 2.5545
Trial 10: {'depth': 6, 'iterations': 1000, 'l2_leaf_reg': 1, 'learning_rate': 0.01} → CV MAE: 2.5779
Trial 11: {



FINAL Test MAE: 2.6490, R²: 0.5389


In [None]:
# Persist each trained model next to the ordered list of feature columns it
# was fitted on.
import joblib
from config import MODELS_DIR

MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Column order seen at fit time: RF and XGBoost share the one-hot encoded
# frame, CatBoost uses the frame with raw categorical columns.
rf_features = x_train.columns.tolist()
xgb_features = x_train.columns.tolist()
cat_features_list = x_train_cat.columns.tolist()

# RandomForest and XGBoost: estimator and feature list pickled via joblib.
for artifact, filename in (
    (best_rf, 'rf_model_v1.pkl'),
    (rf_features, 'rf_features_v1.pkl'),
    (best_xgb, 'xgboost_model_v1.pkl'),
    (xgb_features, 'xgboost_features_v1.pkl'),
):
    joblib.dump(artifact, MODELS_DIR / filename)

# CatBoost: model saved through its own save_model() to a .cbm file, feature
# list via joblib.
best_cat.save_model(str(MODELS_DIR / 'catboost_model_v1.cbm'))
joblib.dump(cat_features_list, MODELS_DIR / 'catboost_features_v1.pkl')
