In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Importing data

In [2]:
lapData  = pd.read_table('./racedata.csv',sep=',')
# Preview
lapData['constructor'] = lapData['constructor'].astype(np.str0)
lapData['avglap']  = lapData['avglap'].astype(np.float64)
lapData['year']    = lapData['year'].astype(np.int64)
lapData['circuit'] = lapData['circuit'].astype(np.str0)
lapData['podium']  = lapData['podium'].astype(np.int8)
lapData.head()

Unnamed: 0,constructor,avglap,year,circuit,podium
0,Mercedes,97872.11,2014,Albert Park Grand Prix Circuit,1
1,McLaren,98341.88,2014,Albert Park Grand Prix Circuit,1
2,McLaren,98398.89,2014,Albert Park Grand Prix Circuit,1
3,Ferrari,98491.12,2014,Albert Park Grand Prix Circuit,0
4,Williams,98707.88,2014,Albert Park Grand Prix Circuit,0


### Split test / train data

In [3]:
YData = lapData['podium']
XData0 =  pd.get_dummies(lapData,prefix=['constructor','circuit'],columns=['constructor','circuit']).astype('float')

lapData1 = lapData[['avglap','year','circuit']]
XData1 =  pd.get_dummies(lapData1,prefix=['circuit'],columns=['circuit']).astype('float')

lapData2 = lapData[['avglap','circuit']]
XData2 =  pd.get_dummies(lapData2,prefix=['circuit'],columns=['circuit']).astype('float')


In [4]:
XTrain0, XTest0, YTrain0, YTest0 = train_test_split(XData0, YData, test_size=0.3, random_state=2)

XTrain1, XTest1, YTrain1, YTest1 = train_test_split(XData1, YData, test_size=0.3, random_state=2)

XTrain2, XTest2, YTrain2, YTest2 = train_test_split(XData2, YData, test_size=0.3, random_state=2)

# Model Creation + Training

#### Utillities

In [12]:
from sklearn.model_selection import GridSearchCV

paramGrid = [
    {
        'penalty' : ['l1','l2','elasticnet','none'],
        'C' : np.logspace(-4,4,20),
        'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
        'max_iter' : [100,1000,2500,5000]
    }
]

### Fully featured model

In [5]:
model = LogisticRegression()
model.fit(XTrain0,YTrain0)
print(model.score(XTest0,YTest0))

1.0


In [6]:
predictions0 = model.predict(XTest0)
print(metrics.confusion_matrix(YTest0,predictions0))

[[375   0]
 [  0 169]]


### Year, Circuit and Avg Lap model

In [13]:
# Creation and grid search
model1 = LogisticRegression()

clf =  GridSearchCV(model1,param_grid=paramGrid, cv=3, verbose=True, n_jobs=1)


In [14]:
bestClf = clf.fit(XTrain1,YTrain1)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


2160 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Mich\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Mich\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Mich\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------

In [16]:
bestClf.best_estimator_

LogisticRegression(C=0.0001, penalty='l1', solver='liblinear')

In [17]:
# Optimizing hyperparameters
model1 = LogisticRegression(C=0.0001, penalty='l1', solver='liblinear')
model1.fit(XTrain1,YTrain1)
print(model1.score(XTest1,YTest1))

0.6893382352941176


In [18]:
predictions1 = model1.predict(XTest1)
print(metrics.confusion_matrix(YTest1,predictions1))

[[375   0]
 [169   0]]


### Circuit and Avg Lap model

In [10]:
model2 = LogisticRegression()
model2.fit(XTrain2,YTrain2)
print(model2.score(XTest2,YTest2))

0.6893382352941176


In [11]:
predictions2 = model2.predict(XTest2)
print(metrics.confusion_matrix(YTest2,predictions2))

[[375   0]
 [169   0]]


# Testing