In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics



# Importing data

In [34]:
lapData  = pd.read_table('./racedata.csv',sep=',').sample(frac=1)
fastestLaps = pd.read_table('./fastestLaps.csv',sep=',').sample(frac=1)
# Preview
lapData['constructor'] = lapData['constructor'].astype(np.str0)
lapData['avglap']  = lapData['avglap'].astype(np.float64)
lapData['year']    = lapData['year'].astype(np.int64)
lapData['circuit'] = lapData['circuit'].astype(np.str0)
lapData['podium']  = lapData['podium'].astype(np.int8)

fastestLaps['avglap']       = fastestLaps['avglap'].astype(np.str0)
fastestLaps['year']         = fastestLaps['year'].astype(np.int64)
fastestLaps['circuit']      = fastestLaps['circuit'].astype(np.str0)
fastestLaps['podium']       = fastestLaps['podium'].astype(np.int8)

data = pd.concat([lapData,fastestLaps])
# scramble data
data


Unnamed: 0,constructor,avglap,year,circuit,podium,fastestLap,lap
1781,Red Bull,105611.49,2020,Yas Marina Circuit,0,,
898,Mercedes,105225.32,2016,Circuit of the Americas,1,,
1618,McLaren,106884.55,2019,Sochi Autodrom,0,,
179,Mercedes,142032.88,2020,Autodromo Internazionale del Mugello,1,,
645,Renault,79482.64,2018,Circuit de Monaco,0,,
...,...,...,...,...,...,...,...
60,,74820,2017,Circuit de Monaco,1,76.0,76.0
97,,74279,2019,Circuit de Monaco,1,72.0,72.0
123,,65619,2020,Red Bull Ring,1,68.0,68.0
113,,78833,2020,Autodromo Internazionale del Mugello,1,58.0,58.0


### Split test / train data

In [40]:
YData = data['podium']
# XData0 =  pd.get_dummies(data,prefix=['constructor','circuit'],columns=['constructor','circuit']).astype('float')

lapData1 = data[['avglap','year','circuit']]
XData1 =  pd.get_dummies(lapData1,prefix=['circuit'],columns=['circuit']).astype('float')

lapData2 = data[['avglap','circuit']]
XData2 =  pd.get_dummies(lapData2,prefix=['circuit'],columns=['circuit']).astype('float')


In [41]:
# XTrain0, XTest0, YTrain0, YTest0 = train_test_split(XData0, YData, test_size=0.3, random_state=2)

XTrain1, XTest1, YTrain1, YTest1 = train_test_split(XData1, YData, test_size=0.3, random_state=4)

XTrain2, XTest2, YTrain2, YTest2 = train_test_split(XData2, YData, test_size=0.3, random_state=6)

# Model Creation + Training

#### Utillities

In [23]:
from sklearn.model_selection import GridSearchCV

paramGrid = [
    {
        'penalty' : ['l1','l2','elasticnet','none'],
        'C' : np.logspace(-4,4,20),
        'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
        'max_iter' : [100,1000,2500,5000]
    }
]

### Fully featured model

In [5]:
model = LogisticRegression()
model.fit(XTrain0,YTrain0)
print(model.score(XTest0,YTest0))

1.0


In [42]:
from sklearn.tree import DecisionTreeClassifier
tree =  DecisionTreeClassifier().fit(XTrain1,YTrain1)
tree.score(XTest1,YTest1)

0.8151260504201681

In [43]:
predictionsT = tree.predict(XTest1)
print(metrics.confusion_matrix(YTest1,predictionsT))

[[336  49]
 [ 61 149]]


In [44]:
tree2 =  DecisionTreeClassifier().fit(XTrain2,YTrain2)
print(tree2.score(XTest2,YTest2))

predictionsT1 = tree2.predict(XTest2)
print(metrics.confusion_matrix(YTest2,predictionsT1))

0.7747899159663866
[[307  58]
 [ 76 154]]


In [6]:
predictions0 = model.predict(XTest0)
print(metrics.confusion_matrix(YTest0,predictions0))

[[375   0]
 [  0 169]]


### Year, Circuit and Avg Lap model

In [13]:
# Creation and grid search
model1 = LogisticRegression()

clf =  GridSearchCV(model1,param_grid=paramGrid, cv=3, verbose=True, n_jobs=1)


In [None]:
bestClf = clf.fit(XTrain1,YTrain1)

In [16]:
bestClf.best_estimator_

LogisticRegression(C=0.0001, penalty='l1', solver='liblinear')

In [32]:
# Optimizing hyperparameters
model1 = LogisticRegression(C=0.0001, penalty='l1', solver='liblinear')
model1.fit(XTrain1,YTrain1)
print(model1.score(XTest1,YTest1))

0.6134453781512605


In [33]:
predictions1 = model1.predict(XTest1)
print(metrics.confusion_matrix(YTest1,predictions1))

[[365   0]
 [230   0]]


### Circuit and Avg Lap model

In [10]:
model2 = LogisticRegression()
model2.fit(XTrain2,YTrain2)
print(model2.score(XTest2,YTest2))

0.6893382352941176


In [11]:
predictions2 = model2.predict(XTest2)
print(metrics.confusion_matrix(YTest2,predictions2))

[[375   0]
 [169   0]]


# Testing