In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

In [2]:
# Read the lap data from CSV, replace every missing value with 0 and discard
# the 'Location' column. Built as one chain so re-running the cell always
# rebuilds `df` from the file instead of mutating an existing frame.
df = (
    pd.read_csv("hopes2.csv")
    .fillna(0)
    .drop(columns=['Location'])
)

In [3]:
# `df` is already a DataFrame, so the bare name renders the same (truncated) table
df


Unnamed: 0,LapTime,LapNumber,Stint,Sector1Time,Sector2Time,Sector3Time,SpeedI1,SpeedI2,SpeedFL,SpeedST,Compound,TyreLife,TrackStatus,Position,Pit,PitCompound
0,107.6,1,1,18.0,44.7,42.4,278.0,301,216,298.0,MEDIUM,4,2,3,0,MEDIUM
1,103.3,2,1,18.0,43.9,41.3,276.0,284,214,295.0,MEDIUM,5,1,3,0,MEDIUM
2,103.2,3,1,18.1,43.9,41.3,272.0,285,214,293.0,MEDIUM,6,1,3,0,MEDIUM
3,103.3,4,1,18.0,44.0,41.3,271.0,283,215,293.0,MEDIUM,7,1,3,0,MEDIUM
4,103.5,5,1,18.1,44.1,41.3,270.0,282,215,291.0,MEDIUM,8,1,3,0,MEDIUM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6106,92.9,46,2,34.2,29.3,29.4,292.0,307,299,311.0,HARD,39,1,1,0,HARD
6107,92.8,47,2,34.4,29.0,29.3,290.0,304,297,309.0,HARD,40,1,1,0,HARD
6108,92.6,48,2,34.1,29.1,29.3,292.0,305,296,310.0,HARD,41,1,1,0,HARD
6109,92.2,49,2,33.8,29.1,29.2,293.0,303,296,309.0,HARD,42,1,1,0,HARD


In [4]:
# Separate model inputs from the label by column position.
# In the frame displayed above the last two columns are 'Pit' and 'PitCompound':
# 'Pit' (second to last) becomes the target, and both are kept out of the features.
feature_frame = df.iloc[:, :-2]
target_column = df.iloc[:, -2]

X = feature_frame.to_numpy()
y = target_column.to_numpy()

In [5]:
# X is a plain NumPy array; wrap it in a DataFrame only to get the tabular rendering
pd.DataFrame(data=X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,107.6,1,1,18.0,44.7,42.4,278.0,301,216,298.0,MEDIUM,4,2,3
1,103.3,2,1,18.0,43.9,41.3,276.0,284,214,295.0,MEDIUM,5,1,3
2,103.2,3,1,18.1,43.9,41.3,272.0,285,214,293.0,MEDIUM,6,1,3
3,103.3,4,1,18.0,44.0,41.3,271.0,283,215,293.0,MEDIUM,7,1,3
4,103.5,5,1,18.1,44.1,41.3,270.0,282,215,291.0,MEDIUM,8,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6106,92.9,46,2,34.2,29.3,29.4,292.0,307,299,311.0,HARD,39,1,1
6107,92.8,47,2,34.4,29.0,29.3,290.0,304,297,309.0,HARD,40,1,1
6108,92.6,48,2,34.1,29.1,29.3,292.0,305,296,310.0,HARD,41,1,1
6109,92.2,49,2,33.8,29.1,29.2,293.0,303,296,309.0,HARD,42,1,1


In [6]:
# One-hot encode the categorical feature and build the train/test split.
#
# The split is done BEFORE the encoder is fitted, so the encoder only learns
# categories from the training rows (no information from the test rows leaks
# into preprocessing). The raw `X` from the earlier cell is left untouched,
# which makes this cell safe to re-run: previously `X` was overwritten with the
# encoded matrix, so a second run would have encoded column 10 of already
# encoded data.
categorical_features = [10]  # Adjust this index as per your data (column 10 holds the compound strings in the table above)

# Split the raw data into training and testing sets (same seed and test size as before)
# NOTE(review): the classifiers below use heavy positive-class weights, which
# suggests an imbalanced target — consider `stratify=y` here; not applied
# because it would change the recorded metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

ct = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category absent from the training rows is
        # encoded as all zeros at transform time instead of raising.
        ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
    # Always return a dense array; np.array() on a SciPy sparse matrix would
    # silently produce a 0-d object array instead of a 2-D feature matrix.
    sparse_threshold=0,
)

# Fit on the training rows only, then apply the fitted encoder to the test rows
X_train = np.array(ct.fit_transform(X_train_raw))
X_test = np.array(ct.transform(X_test_raw))

# Save the fitted ColumnTransformer for future use
with open('column_transformer.pkl', 'wb') as f:
    pickle.dump(ct, f)



In [7]:
# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data and print a short report.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        The fitted estimator to evaluate.
    X_test : array-like
        Feature rows passed to ``model.predict``.
    y_test : array-like
        True labels compared against the predictions.

    Returns
    -------
    tuple
        ``(f1, accuracy)`` as returned by ``f1_score`` and ``accuracy_score``
        called with their default settings.

    Side effects
    ------------
    Prints the confusion matrix, the F1 score and the accuracy to stdout.
    """
    predictions = model.predict(X_test)

    # Compute both summary metrics first; they are also the return value.
    scores = (
        f1_score(y_test, predictions),
        accuracy_score(y_test, predictions),
    )

    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    for label, value in zip(("F1 Score:", "Accuracy Score:"), scores):
        print(label, value)

    return scores

# Random Forest with fixed, predefined hyperparameters.
# class_weight gives class 1 a weight of 30 against 1 for class 0.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    class_weight={0: 1, 1: 30},
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained
rf_classifier = RandomForestClassifier(**rf_params).fit(X_train, y_train)

print("Random Forest:")
# Last expression of the cell: prints the report and displays the (f1, accuracy) tuple
evaluate_model(rf_classifier, X_test, y_test)



Random Forest:
Confusion Matrix:
 [[1173    9]
 [  24   17]]
F1 Score: 0.5074626865671642
Accuracy Score: 0.9730171708912511


(np.float64(0.5074626865671642), 0.9730171708912511)

In [8]:
# Train and evaluate an XGBoost model with predefined parameters.
#
# `tree_method='gpu_hist'` and `gpu_id` are deprecated in XGBoost 2.x (the
# warning recorded under this cell recommends `tree_method="hist"` with
# `device="cuda"`). The GPU is now selected through `device`; 'cuda:0' keeps
# the explicit ordinal 0 that `gpu_id=0` used to request.
# Change `device` to 'cpu' to train on a machine without a CUDA GPU.
xgb_classifier = xgb.XGBClassifier(
    colsample_bytree=0.9,
    learning_rate=0.2,
    max_depth=5,
    n_estimators=300,
    scale_pos_weight=20,
    subsample=0.9,
    objective='binary:logistic',
    random_state=42,
    tree_method='hist',
    device='cuda:0',
)
xgb_classifier.fit(X_train, y_train)
print("XGBoost:")
# NOTE(review): predicting on host (NumPy) arrays with a CUDA booster appears to
# be what triggers the "mismatched devices" fallback warning seen in the output;
# confirm and, if it matters, set the device to match the data before predicting.
evaluate_model(xgb_classifier, X_test, y_test)




    E.g. tree_method = "hist", device = "cuda"



XGBoost:
Confusion Matrix:
 [[1179    3]
 [  20   21]]
F1 Score: 0.6461538461538462
Accuracy Score: 0.9811937857726901


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




(np.float64(0.6461538461538462), 0.9811937857726901)

In [9]:
# Persist both fitted classifiers to disk as pickle files for later reuse
for pickle_path, fitted_model in (
    ('rf_classifier.pkl', rf_classifier),
    ('xgb_classifier.pkl', xgb_classifier),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_model, f)