In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Importing data

In [20]:
lapData  = pd.read_table('./racedata.csv',sep=',')
fastestLaps = pd.read_table('./fastestLaps.csv',sep=',')
# Preview
lapData['constructor'] = lapData['constructor'].astype(np.str0)
lapData['avglap']  = lapData['avglap'].astype(np.float64)
lapData['year']    = lapData['year'].astype(np.int64)
lapData['circuit'] = lapData['circuit'].astype(np.str0)
lapData['podium']  = lapData['podium'].astype(np.int8)

fastestLaps['avglap']       = fastestLaps['avglap'].astype(np.str0)
fastestLaps['year']         = fastestLaps['year'].astype(np.int64)
fastestLaps['circuit']      = fastestLaps['circuit'].astype(np.str0)
fastestLaps['podium']       = fastestLaps['podium'].astype(np.int8)
lapData.head()
fastestLaps.head()

Unnamed: 0,year,circuit,fastestLap,lap,avglap,podium
0,2014,Albert Park Grand Prix Circuit,19,19,92478,1
1,2014,Autódromo José Carlos Pace,62,62,73555,1
2,2014,Autodromo Nazionale di Monza,29,29,88004,1
3,2014,Bahrain International Circuit,49,49,97020,1
4,2014,Circuit de Barcelona-Catalunya,55,55,88918,1


### Split test / train data

In [21]:
YData = lapData['podium']
XData0 =  pd.get_dummies(lapData,prefix=['constructor','circuit'],columns=['constructor','circuit']).astype('float')

lapData1 = lapData[['avglap','year','circuit']]
XData1 =  pd.get_dummies(lapData1,prefix=['circuit'],columns=['circuit']).astype('float')

lapData2 = lapData[['avglap','circuit']]
XData2 =  pd.get_dummies(lapData2,prefix=['circuit'],columns=['circuit']).astype('float')


In [22]:
XTrain0, XTest0, YTrain0, YTest0 = train_test_split(XData0, YData, test_size=0.3, random_state=2)

XTrain1, XTest1, YTrain1, YTest1 = train_test_split(XData1, YData, test_size=0.3, random_state=4)

XTrain2, XTest2, YTrain2, YTest2 = train_test_split(XData2, YData, test_size=0.3, random_state=6)

# Model Creation + Training

#### Utillities

In [23]:
from sklearn.model_selection import GridSearchCV

paramGrid = [
    {
        'penalty' : ['l1','l2','elasticnet','none'],
        'C' : np.logspace(-4,4,20),
        'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
        'max_iter' : [100,1000,2500,5000]
    }
]

### Fully featured model

In [5]:
model = LogisticRegression()
model.fit(XTrain0,YTrain0)
print(model.score(XTest0,YTest0))

1.0


In [6]:
predictions0 = model.predict(XTest0)
print(metrics.confusion_matrix(YTest0,predictions0))

[[375   0]
 [  0 169]]


### Year, Circuit and Avg Lap model

In [13]:
# Creation and grid search
model1 = LogisticRegression()

clf =  GridSearchCV(model1,param_grid=paramGrid, cv=3, verbose=True, n_jobs=1)


In [None]:
bestClf = clf.fit(XTrain1,YTrain1)

In [16]:
bestClf.best_estimator_

LogisticRegression(C=0.0001, penalty='l1', solver='liblinear')

In [24]:
# Optimizing hyperparameters
model1 = LogisticRegression(C=0.0001, penalty='l1', solver='liblinear')
model1.fit(XTrain1,YTrain1)
print(model1.score(XTest1,YTest1))

0.6893382352941176


In [25]:
predictions1 = model1.predict(XTest1)
print(metrics.confusion_matrix(YTest1,predictions1))

[[375   0]
 [169   0]]


### Circuit and Avg Lap model

In [10]:
model2 = LogisticRegression()
model2.fit(XTrain2,YTrain2)
print(model2.score(XTest2,YTest2))

0.6893382352941176


In [11]:
predictions2 = model2.predict(XTest2)
print(metrics.confusion_matrix(YTest2,predictions2))

[[375   0]
 [169   0]]


# Testing