In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score,r2_score,mean_absolute_error,mean_squared_error,accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from xgboost import XGBRegressor
np.set_printoptions(precision=4)


In [2]:
# Load the prepared dataset; the path is relative, so it is resolved against
# the notebook's working directory.
data = pd.read_csv('final_df.csv')

In [3]:
# Preview the last rows as a quick sanity check of the loaded columns.
data.tail()

Unnamed: 0,season,round,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,nationality,constructor,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,driver_age
25022,2022,4,imola,False,False,False,True,True,latifi,Canadian,williams,19,16,0,0,20,0,0,0,26
25023,2022,4,imola,False,False,False,True,True,mick_schumacher,German,haas,10,17,0,0,17,0,0,0,23
25024,2022,4,imola,False,False,False,True,True,ricciardo,Australian,mclaren,6,18,8,0,11,0,0,0,32
25025,2022,4,imola,False,False,False,True,True,alonso,Spanish,alpine,9,19,2,0,14,0,0,0,40
25026,2022,4,imola,False,False,False,True,True,sainz,Spanish,ferrari,4,20,33,0,3,0,0,0,27


In [4]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per categorical column, so every column keeps its
# own code <-> label mapping and can be decoded on its own later.
# NOTE(review): le5 is created but not used in the visible cells.
le1, le2, le3, le4, le5 = (LabelEncoder() for _ in range(5))


In [5]:
# Replace each categorical text column with integer codes, one encoder per
# column. Columns that are already numeric are skipped so that re-running this
# cell does not refit an encoder on its own integer output (after which
# inverse_transform would return codes instead of the original names).
for column, encoder in (('driver', le1),
                        ('circuit_id', le2),
                        ('nationality', le3),
                        ('constructor', le4)):
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = encoder.fit_transform(data[column])

In [21]:
# Training setup shared by the regression models and score_regression.
df = data.copy()

# Train on every season before 2022; target is the `podium` column.
train = df[df.season < 2022]
y_train = train['podium']
X_train = train.drop(columns=['podium'])

# Standardise the features. The fitted scaler is kept at module level because
# score_regression reuses it to transform the 2022 rows.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Accumulates one entry per evaluated model under each key.
comparison_dict = {key: [] for key in ('model', 'score', 'podium_score')}

def score_regression(model):
    """Score a fitted regressor on every 2022 round, one race at a time.

    Relies on module-level state: ``df`` (the encoded dataset), ``scaler``
    (fitted on the training features) and the label encoders ``le1``
    (driver) and ``le2`` (circuit_id).

    For each 2022 round the model's outputs are sorted ascending to produce a
    predicted finishing order (rank 1 = lowest predicted value). The driver
    ranked first is the predicted winner; the actual winner is the row whose
    ``podium`` equals 1. Those two 0/1 vectors are compared with ``r2_score``
    and ``precision_score``.

    Side effect: displays a table with one row per driver per round
    (``circuit_id``, ``driver``, ``podium``, ``predicted``), ordered by
    predicted rank within each round.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.

    Returns
    -------
    tuple
        ``(model_score, podium_score)``: the r2 score and the winner precision,
        each averaged over the rounds that were scored.

    Raises
    ------
    ValueError
        If ``df`` contains no rows for the 2022 season.
    """
    season_2022 = df[df.season == 2022]
    rounds = season_2022['round'].unique()
    if len(rounds) == 0:
        raise ValueError('No 2022 rounds found in df; nothing to score.')

    per_round_tables = []
    score = 0
    podium_score = 0

    for race_round in rounds:
        test = season_2022[season_2022['round'] == race_round]
        X_test = test.drop(['podium'], axis=1)

        # scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Labels are decoded from the unscaled rows. Round-tripping them
        # through the scaler yields floats (e.g. 4.9999) that an integer cast
        # would truncate to the wrong code. Every column is built in the same
        # (test) row order, so driver, podium and prediction stay aligned.
        prediction_df = pd.DataFrame({
            'circuit_id': le2.inverse_transform(test['circuit_id'].to_numpy()),
            'driver': le1.inverse_transform(test['driver'].to_numpy()),
            'podium': test['podium'].to_numpy(),
            'results': np.ravel(model.predict(X_test)),
        })

        # Rank drivers by predicted value; a stable sort keeps ties in a
        # deterministic order.
        prediction_df = (
            prediction_df
            .sort_values('results', ascending=True, kind='stable')
            .reset_index(drop=True)
        )
        prediction_df['predicted'] = prediction_df.index + 1

        # Scoring: winner-vs-rest indicator vectors.
        actual_winner = (prediction_df['podium'] == 1).astype(int)
        predicted_winner = (prediction_df['predicted'] == 1).astype(int)
        score += r2_score(actual_winner, predicted_winner)
        podium_score += precision_score(actual_winner, predicted_winner)

        per_round_tables.append(
            prediction_df[['circuit_id', 'driver', 'podium', 'predicted']]
        )

    appended_data = pd.concat(per_round_tables)
    display(appended_data)

    # Average over the number of rounds actually scored. The largest round
    # number only equals that count when rounds run 1..N without gaps.
    n_rounds = len(rounds)
    model_score = score / n_rounds
    podium_score = podium_score / n_rounds

    return model_score, podium_score








Unnamed: 0,circuit_id,driver,podium,predicted
0,bahrain,leclerc,1,1
1,bahrain,sainz,2,2
2,bahrain,hamilton,3,3
3,bahrain,perez,18,4
4,bahrain,kevin_magnussen,5,5
...,...,...,...,...
15,imola,mick_schumacher,17,16
16,imola,zhou,15,17
17,imola,albon,11,18
18,imola,latifi,16,19


IndexError: Column(s) score already selected

In [23]:
# Model comparison. Only XGBoost is fitted and scored; the Linear Regression
# and Random Forest blocks below are disabled.
# NOTE(review): score_regression returns a (score, podium_score) pair, so the
# disabled blocks would need to unpack both values (and append podium_score)
# before being re-enabled.

# Linear Regression
# model1 = LinearRegression()
# model1.fit(X_train, y_train)
# model_score = score_regression(model1)
# comparison_dict['model'].append('linear_regression')
# comparison_dict['score'].append(model_score)

# Random Forest
# model2 = RandomForestRegressor(n_estimators = 100, random_state = 0)
# model2.fit(X_train, y_train)
# model_score = score_regression(model2)
# comparison_dict['model'].append('random_forest_regressor')
# comparison_dict['score'].append(model_score)

# XGBoost
xgb_settings = {
    'n_estimators': 100,
    'max_depth': 50,
    'eta': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
}
model3 = XGBRegressor(**xgb_settings)
model3.fit(X_train, y_train)
model_score, podium_score = score_regression(model3)

# Record the run under each key of the comparison table.
for field, value in zip(('model', 'score', 'podium_score'),
                        ('XGBoost', model_score, podium_score)):
    comparison_dict[field].append(value)

comparison_dict

{'model': ['XGBoost'], 'score': [0.4736842105263156], 'podium_score': [0.75]}
