In [21]:
import numpy as np
import pandas as pd
import datetime
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [2]:
np.set_printoptions(precision=4)

In [18]:
data = pd.read_csv('final_df_merged.csv')

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9984 entries, 0 to 9983
Data columns (total 85 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   season                     9984 non-null   int64  
 1   round                      9984 non-null   int64  
 2   driver                     9984 non-null   object 
 3   grid                       9984 non-null   int64  
 4   points                     9984 non-null   float64
 5   podium                     9984 non-null   int64  
 6   constructor_points         9984 non-null   int64  
 7   constructor_wins           9984 non-null   int64  
 8   constructor_standings_pos  9984 non-null   int64  
 9   driver_points              9984 non-null   int64  
 10  driver_wins                9984 non-null   int64  
 11  driver_standings_pos       9984 non-null   int64  
 12  qualifying_time            9984 non-null   float64
 13  weather_warm               9984 non-null   bool 

# 1- Dataset d'entraînement 

In [10]:
df = data.copy()
df.podium = df.podium.map(lambda x: 1 if x == 1 else 0)

train = df[df.season<2020]
X_train = train.drop(['driver', 'podium'], axis = 1)
y_train = train.podium

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

In [None]:
comparison_dict ={'model':[],
                  'params': [],
                  'score': []}

# 2- Classification

In [5]:
def score_classification(model):
    score = 0
    for circuit in df[df.season == 2020]['round'].unique():

        test = df[(df.season == 2020) & (df['round'] == circuit)]
        X_test = test.drop(['driver', 'podium'], axis = 1)
        y_test = test.podium

        #scaling
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

        # make predictions
        prediction_df = pd.DataFrame(model.predict_proba(X_test), columns = ['proba_0', 'proba_1'])
        prediction_df['actual'] = y_test.reset_index(drop = True)
        prediction_df.sort_values('proba_1', ascending = False, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)

        score += precision_score(prediction_df.actual, prediction_df.predicted)

    model_score = score / df[df.season == 2020]['round'].unique().max()
    return model_score

## SVM Classifier

Assez lent à faire tourner

In [8]:
params={'gamma': np.logspace(-4, -1, 20),
        'C': np.logspace(-2, 1, 20),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid']} 

for gamma in params['gamma']:
    for c in params['C']:
        for kernel in params['kernel']:
            model_params = (gamma, c, kernel)
            model = svm.SVC(probability = True, gamma = gamma, C = c, kernel = kernel )
            model.fit(X_train, y_train)
            
            model_score = score_classification(model)
            
            comparison_dict['model'].append('svm_classifier')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)

In [14]:
# Le score maximal est 
print(max(comparison_dict["score"]))
 
## En déduire les paramétres correspondant au score optimal 

0.8235294117647058


# 3- Régression

Le but sera de tester plusieurs combinaison de variable explicative et choisir celle qui permettra d'avoir le meilleur 
score. On prendra comme critère (comme score) le pourcentage de course correctement prédites en 2020.

In [31]:
# la fonction score: 

def score_regression(model,xvar):
    score=0
    les_circuits=df[df['season']==2020]["round"].unique()
    for circuit in les_circuits:
        test=df[(df['season']==2020) & (df['round']==circuit)]
        X_test=test[xvar]
        y_test=test["podium"]
        
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
        
        #On fait les prédictions
        
        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['podium'] = y_test.reset_index(drop = True)
        prediction_df['actual'] = prediction_df['podium'].map(lambda x: 1 if x == 1 else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)
        
        score += precision_score(prediction_df['actual'],prediction_df['predicted'])
    
    model_score = score / df[df.season == 2020]['round'].unique().max()
    return model_score

## 3.1 Régression linéaire

Nous faisons une regression linéaire en utilisant toutes les features dont nous disposons. L'idée sera ensuite de comparer les résultats de ce modèle avec le SVM classifier.  

In [None]:
model = LinearRegression(fit_intercept = 'True')
model.fit(X_train, y_train)
model_score=score_regression(model,train.columns.drop(["driver","podium"])
comparison_dict['model'].append('regression_lineaire')
comparison_dict['params'].append('True')
comparison_dict['score'].append(model_score)

Maintenant nous pouvons nous interesser à la regression linéaire en prenant comme variables nous aidant à prédire des combinaisons de variable. Cela nous permettra de comprendre quelles variables sont les plus importantes pour prédire notre target, c'est à dire le gagnant de la course. 

In [33]:
xvar_total=df.columns.drop(["driver",'podium'])
xvar_series=pd.Series(xvar_total)

xvar_weather=list(xvar_series[xvar_series.str.contains("weather")])
xvar_nationality=list(xvar_series[xvar_series.str.contains("nationality")])
xvar_circuit=list(xvar_series[xvar_series.str.contains("circuit")])
xvar_constructor=list(xvar_series[xvar_series.str.contains("constructor")])
xvar_grid=list(["grid","constructor_standings_pos","driver_standings_pos","qualifying_time"])

xvar_comparison={'xvar':[],'score':[]}

xvar_combinaison=[xvar_weather,xvar_nationality,xvar_circuit,xvar_constructor,xvar_grid]
    
for xvar in xvar_combinaison:
    X_train_2 = train[xvar]
    y_train_2 = train["podium"]

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train_2), columns = X_train_2.columns)
    
    model = LinearRegression(fit_intercept = 'True')
    model.fit(X_train_2, y_train_2)
    
    model_score=score_regression(model,xvar)
    
    
    xvar_comparison['xvar'].append(xvar)
    xvar_comparison['score'].append(model_score)

In [35]:
def keywithmaxval(d):
     v=list(d.values())
     k=list(d.keys())
     return k[v.index(max(v))]
    
return 

  max(xvar_comparison, key=xvar_comparison.get)


TypeError: '>' not supported between instances of 'numpy.ndarray' and 'list'

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9984 entries, 0 to 9983
Data columns (total 85 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   season                     9984 non-null   int64  
 1   round                      9984 non-null   int64  
 2   driver                     9984 non-null   object 
 3   grid                       9984 non-null   int64  
 4   points                     9984 non-null   float64
 5   podium                     9984 non-null   int64  
 6   constructor_points         9984 non-null   int64  
 7   constructor_wins           9984 non-null   int64  
 8   constructor_standings_pos  9984 non-null   int64  
 9   driver_points              9984 non-null   int64  
 10  driver_wins                9984 non-null   int64  
 11  driver_standings_pos       9984 non-null   int64  
 12  qualifying_time            9984 non-null   float64
 13  weather_warm               9984 non-null   bool 