In [3]:
import os

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pycaret.regression import *

In [4]:
# Location of the IndyCar dataset CSV. Setting the INDYCAR_DATASET environment
# variable overrides the path, so the notebook can run on machines other than the
# author's; when it is unset or empty, the original local path is used unchanged.
DATASET_PATH = (
    os.environ.get("INDYCAR_DATASET")
    or "C:\\Users\\ripa_\\Desktop\\Programing\\IndyCar_Project\\datasets\\IndyCar_dataset_v11.csv"
)
df = pd.read_csv(DATASET_PATH)

In [5]:
# Convert EventDate to datetimes and order the rows chronologically; the
# date-based cutoff computed further down compares against this column.
df = df.assign(EventDate=pd.to_datetime(df["EventDate"])).sort_values(by="EventDate")

In [6]:
# Sanity check: the first three rows per DriverID (in the current, date-sorted
# order), limited to the first 30 of those rows.
preview_cols = ["DriverID", "NormalizedPositionFinish", "DRFAvg"]
first_rows_per_driver = df[preview_cols].groupby("DriverID").head(3)
print(first_rows_per_driver.head(30))

    DriverID  NormalizedPositionFinish  DRFAvg
0       3625                      0.76     NaN
25      3667                      0.24     NaN
24      3622                      0.00     NaN
23      3616                      0.16     NaN
22      4276                      0.20     NaN
21      4215                      0.40     NaN
20      3811                      0.84     NaN
19      3736                      0.72     NaN
17      3680                      0.28     NaN
16      3648                      0.96     NaN
15      3620                      0.68     NaN
14      3672                      0.60     NaN
13      4401                      0.88     NaN
18      4407                      0.64     NaN
11      4236                      1.00     NaN
12      4021                      0.80     NaN
2       3645                      0.08     NaN
3       3813                      0.92     NaN
4       4216                      0.12     NaN
5       3675                      0.56     NaN
1       3608 

In [7]:
# Display the first five rows of the date-sorted frame.
df.head(n=5)

Unnamed: 0,DriverName,DriverID,Rookie,DRFAvg,DTAvg,DTTAvg,DNFRate,TDNFRate,DriverElo,DriverTElo,...,EventDate,EventDateFormatted,EventID,Era,EraID,Status,StatusID,FieldSize,PositionFinish,NormalizedPositionFinish
0,Mike Conway,3625,0,,,,,,1500.0,1500.0,...,2012-03-25,"Sunday, March 25, 2012",2380,DW12 Era 2012-2017,0,DNF,1,26,20,0.76
25,Will Power,3667,0,,,,,,1500.0,1500.0,...,2012-03-25,"Sunday, March 25, 2012",2380,DW12 Era 2012-2017,0,Running,0,26,7,0.24
24,Helio Castroneves,3622,0,,,,,,1500.0,1500.0,...,2012-03-25,"Sunday, March 25, 2012",2380,DW12 Era 2012-2017,0,Running,0,26,1,0.0
23,Ryan Briscoe,3616,0,,,,,,1500.0,1500.0,...,2012-03-25,"Sunday, March 25, 2012",2380,DW12 Era 2012-2017,0,Running,0,26,5,0.16
22,Simon Pagenaud,4276,0,,,,,,1500.0,1500.0,...,2012-03-25,"Sunday, March 25, 2012",2380,DW12 Era 2012-2017,0,Running,0,26,6,0.2


In [8]:
# Columns removed before modelling: name/label columns, the starting position,
# event and date descriptors, and the finishing status / raw finishing position.
drop_cols = [
    "DriverName",
    "PositionStart",
    "TeamName",
    "EventName",
    "Track",
    "EventTrackType",
    "EventDate",
    "EventDateFormatted",
    "EventID",
    "Era",
    "Status",
    "StatusID",
    "PositionFinish",
]

# Chronological split: rows dated before the 0.95 quantile of EventDate go to
# `data`; rows on or after it are kept aside as `data_unseen`.
cutoff = df["EventDate"].quantile(0.95)
is_before_cutoff = df["EventDate"] < cutoff
is_from_cutoff = df["EventDate"] >= cutoff
data = df.loc[is_before_cutoff].drop(columns=drop_cols)
data_unseen = df.loc[is_from_cutoff].drop(columns=drop_cols)

# Correlation of every numeric column in `data` with the target, ascending.
target_corr = data.corr(numeric_only=True)["NormalizedPositionFinish"]
print(target_corr.sort_values())

DriverElo                  -3.981294e-01
TeamElo                    -3.296387e-01
TeamTElo                   -2.501114e-01
DriverTElo                 -2.471319e-01
TeamID                     -1.376309e-01
Rookie                     -2.423952e-02
TrackID                    -2.617858e-03
FieldSize                  -2.349151e-13
EventTrackTypeID            7.174214e-13
EraID                       1.687663e-12
TeamDNFRate                 3.193332e-02
TDNFRate                    6.753540e-02
DriverID                    1.024562e-01
DNFRate                     1.683375e-01
DTAvg                       2.638985e-01
TTP                         3.109530e-01
TRP                         3.136064e-01
TeamRitmo                   3.370879e-01
DRFAvg                      3.609328e-01
DTTAvg                      3.662953e-01
DriverRitmo                 3.975282e-01
NormalizedPositionFinish    1.000000e+00
Name: NormalizedPositionFinish, dtype: float64


In [9]:
# Remove the non-feature columns from the full frame as well.
# errors="ignore" makes this cell safe to re-run: without it, a second execution
# raises KeyError because the columns were already dropped on the first run.
# Trade-off: a misspelled name in drop_cols is silently skipped here, so rely on
# the column listing printed in the next cell to confirm what remains.
df = df.drop(columns=drop_cols, errors="ignore")

In [10]:
# List the column names that remain after the drop.
print(list(df.columns))

['DriverID', 'Rookie', 'DRFAvg', 'DTAvg', 'DTTAvg', 'DNFRate', 'TDNFRate', 'DriverElo', 'DriverTElo', 'DriverRitmo', 'TeamID', 'TRP', 'TTP', 'TeamDNFRate', 'TeamElo', 'TeamTElo', 'TeamRitmo', 'TrackID', 'EventTrackTypeID', 'EraID', 'FieldSize', 'NormalizedPositionFinish']


In [11]:
# PyCaret experiment configuration: target column, fixed session id, and a
# time-series fold strategy with shuffling disabled for both the train/test
# split and the folds, so the date ordering of `data` is respected.
# NOTE(review): the setup summary reports 21 numeric features, i.e. the ID
# columns (DriverID, TeamID, TrackID, ...) are treated as numeric - confirm
# that this is intended.
setup_options = dict(
    data=data,
    target="NormalizedPositionFinish",
    session_id=123,
    fold_strategy="timeseries",
    data_split_shuffle=False,
    fold_shuffle=False,
)
exp = setup(**setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,NormalizedPositionFinish
2,Target type,Regression
3,Original data shape,"(5236, 22)"
4,Transformed data shape,"(5236, 22)"
5,Transformed train set shape,"(3665, 22)"
6,Transformed test set shape,"(1571, 22)"
7,Numeric features,21
8,Rows with missing values,29.1%
9,Preprocess,True


In [16]:
# Compare the candidate regressors on the experiment configured by setup().
# The return value is not stored; only the displayed comparison table is used.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,0.2335,0.0778,0.2788,0.1426,0.1902,0.9808,0.01
lasso,Lasso Regression,0.2347,0.0778,0.2789,0.1421,0.1905,0.9975,0.01
llar,Lasso Least Angle Regression,0.2347,0.0778,0.2789,0.1421,0.1905,0.9975,0.01
br,Bayesian Ridge,0.2321,0.0779,0.279,0.1408,0.1902,0.9614,0.011
ridge,Ridge Regression,0.2344,0.0779,0.279,0.1405,0.1914,1.0053,0.011
omp,Orthogonal Matching Pursuit,0.2338,0.0784,0.2798,0.136,0.1907,0.97,0.01
lr,Linear Regression,0.2361,0.0802,0.2828,0.1147,0.1925,1.01,0.012
ada,AdaBoost Regressor,0.2422,0.0804,0.2835,0.1131,0.1954,1.0884,0.038
et,Extra Trees Regressor,0.2412,0.0821,0.2866,0.0941,0.197,1.0535,0.193
rf,Random Forest Regressor,0.2425,0.0823,0.2867,0.0927,0.1969,1.0579,0.398


In [17]:
# Create the "gbr" model (reported as Gradient Boosting Regressor in the
# prediction outputs) in the active experiment.
gbr = create_model(estimator="gbr")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2576,0.0959,0.3097,-0.0726,0.2118,1.0435
1,0.2578,0.0957,0.3093,-0.032,0.2128,1.0873
2,0.2493,0.0906,0.301,-0.0091,0.2066,1.0438
3,0.2426,0.0822,0.2868,0.0918,0.197,1.0279
4,0.2411,0.082,0.2864,0.1065,0.1961,1.0189
5,0.227,0.0748,0.2735,0.1714,0.1865,0.9599
6,0.2244,0.0751,0.274,0.1741,0.1855,0.8632
7,0.2334,0.0802,0.2833,0.1153,0.1925,0.9327
8,0.2209,0.0693,0.2632,0.2247,0.1787,0.9189
9,0.2209,0.0712,0.2668,0.2209,0.1816,0.8948


In [18]:
# Hyperparameter search starting from `gbr` (the log reports 10 candidates x 10 folds).
# NOTE(review): `gbr_tune` is not referenced by any later cell visible here; the
# predict_model cells that follow score the untuned `gbr` - confirm intent.
gbr_tune = tune_model(estimator=gbr)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2428,0.0809,0.2843,0.0958,0.1954,1.089
1,0.2481,0.0846,0.2908,0.0881,0.2007,1.0906
2,0.2381,0.0793,0.2816,0.1164,0.1935,1.0398
3,0.2422,0.0791,0.2813,0.1263,0.1928,1.0219
4,0.2418,0.081,0.2846,0.1179,0.1957,1.0398
5,0.2277,0.0723,0.269,0.1984,0.1858,1.0302
6,0.2304,0.0754,0.2746,0.1703,0.1881,0.9755
7,0.2339,0.0752,0.2741,0.1713,0.1868,0.9928
8,0.2277,0.0714,0.2671,0.2014,0.1829,0.995
9,0.2256,0.0703,0.2652,0.2301,0.1812,0.979


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [24]:
# Score `gbr` with predict_model's default data (presumably the experiment's
# hold-out split - confirm); the trailing semicolon hides the returned frame.
predict_model(estimator=gbr);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.2121,0.0669,0.2586,0.2537,0.1757,0.8674


In [27]:
# Score `gbr` on the rows held back by the date cutoff (`data_unseen`).
newpred1 = predict_model(estimator=gbr, data=data_unseen)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.2193,0.0748,0.2735,0.1665,0.1855,0.968


In [12]:
# Create the "catboost" model (reported as CatBoost Regressor in the prediction
# outputs) in the active experiment.
cat = create_model(estimator="catboost")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2541,0.0913,0.3021,-0.0207,0.2077,1.1456
1,0.2539,0.0926,0.3043,0.0012,0.2094,1.0918
2,0.2534,0.0916,0.3027,-0.0207,0.2087,1.0834
3,0.2522,0.0895,0.2992,0.0117,0.2059,1.0648
4,0.2514,0.0905,0.3008,0.0145,0.2058,1.0571
5,0.2384,0.0838,0.2896,0.071,0.2006,1.1243
6,0.2338,0.0793,0.2816,0.128,0.1922,0.9378
7,0.2428,0.085,0.2916,0.0624,0.1986,0.9705
8,0.2418,0.0819,0.2863,0.0829,0.1948,0.9866
9,0.2352,0.0797,0.2822,0.1281,0.1921,0.9661


In [13]:
# Hyperparameter search starting from `cat` (the log reports 10 candidates x 10 folds).
# NOTE(review): `cat_tune` is not referenced by any later cell visible here; the
# prediction and save_model cells use the untuned `cat` - confirm intent.
cat_tune = tune_model(estimator=cat)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2456,0.083,0.2881,0.0718,0.198,1.1231
1,0.2534,0.0883,0.2972,0.0473,0.2044,1.0807
2,0.2402,0.0823,0.2868,0.0834,0.1978,1.0618
3,0.2404,0.0795,0.282,0.1217,0.1936,1.0229
4,0.2438,0.0829,0.288,0.0965,0.1965,1.0058
5,0.2254,0.0725,0.2692,0.197,0.1858,1.0309
6,0.2288,0.0751,0.274,0.1741,0.1878,0.9539
7,0.232,0.0769,0.2773,0.1522,0.188,0.9376
8,0.231,0.0754,0.2747,0.1555,0.1872,0.9658
9,0.2261,0.0727,0.2697,0.2039,0.1836,0.9765


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [14]:
# Score `cat` with predict_model's default data (presumably the experiment's
# hold-out split - confirm); the trailing semicolon hides the returned frame.
predict_model(estimator=cat);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.2267,0.0744,0.2727,0.1701,0.1866,1.006


In [15]:
# Score `cat` on the rows held back by the date cutoff (`data_unseen`).
newpred2 = predict_model(estimator=cat, data=data_unseen)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.2387,0.0837,0.2893,0.0674,0.1971,1.0718


In [32]:
# Create the "lightgbm" model (reported as Light Gradient Boosting Machine in
# the prediction outputs) in the active experiment.
lgbm = create_model(estimator="lightgbm")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2562,0.0949,0.3081,-0.0616,0.2095,1.0452
1,0.2678,0.102,0.3194,-0.1005,0.2183,1.0914
2,0.2503,0.0945,0.3074,-0.0526,0.2107,0.9922
3,0.2388,0.0823,0.287,0.0907,0.1974,0.9652
4,0.2447,0.0871,0.2952,0.0508,0.2019,1.0224
5,0.2272,0.0752,0.2743,0.1663,0.1875,0.9304
6,0.2258,0.0779,0.2791,0.1435,0.1893,0.8504
7,0.2304,0.0801,0.2831,0.1165,0.1913,0.9019
8,0.2313,0.0777,0.2788,0.1301,0.1899,0.9779
9,0.2229,0.0737,0.2715,0.1929,0.1845,0.8816


In [33]:
# Hyperparameter search starting from `lgbm` (the log reports 10 candidates x 10 folds).
# NOTE(review): `lgbm_tune` is not referenced by any later cell visible here; the
# predict_model cells that follow score the untuned `lgbm` - confirm intent.
lgbm_tune = tune_model(estimator=lgbm)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.235,0.0772,0.2778,0.1368,0.189,0.9983
1,0.2448,0.083,0.288,0.1053,0.1977,1.0412
2,0.2369,0.0807,0.284,0.1011,0.1953,1.0352
3,0.2344,0.076,0.2756,0.1612,0.1874,0.9486
4,0.2355,0.0793,0.2816,0.1361,0.1919,0.9623
5,0.2183,0.0682,0.2611,0.2448,0.1782,0.9401
6,0.2223,0.0725,0.2692,0.2027,0.1828,0.8903
7,0.2263,0.0743,0.2726,0.1807,0.1842,0.9063
8,0.2223,0.0703,0.2651,0.2135,0.1803,0.9279
9,0.2203,0.0691,0.2628,0.2441,0.1791,0.9374


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [35]:
# Score `lgbm` with predict_model's default data (presumably the experiment's
# hold-out split - confirm); the trailing semicolon hides the returned frame.
predict_model(estimator=lgbm);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.2124,0.0679,0.2606,0.242,0.1766,0.8595


In [36]:
# Score `lgbm` on the rows held back by the date cutoff (`data_unseen`).
newpred3 = predict_model(estimator=lgbm, data=data_unseen)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.221,0.0753,0.2745,0.1605,0.1857,0.9474


In [38]:
# Persist the preprocessing pipeline together with the CatBoost model; the
# output reports the file as indycar_cat_model_v2.pkl.
# NOTE(review): this saves the untuned `cat`, not `cat_tune`. The pipeline printed
# below also lists 'PositionStart', which drop_cols removes, so that output looks
# like it came from an earlier run - re-run from a fresh kernel before relying on it.
save_model(model=cat, model_name="indycar_cat_model_v2")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Rookie', 'DRFAvg', 'DTAvg',
                                              'DTTAvg', 'DNFRate', 'TDNFRate',
                                              'DriverElo', 'DriverTElo',
                                              'DriverRitmo', 'PositionStart',
                                              'TRP', 'TTP', 'TeamDNFRate',
                                              'TeamElo', 'TeamTElo', 'TeamRitmo',
                                              'FieldSize'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('trained_model',
                  <catboost.core.CatBoostRegressor object at 0x00000258AD660E50>)]),
 'indycar_cat_model_v2.pkl')