In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score
import sklearn.metrics
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [2]:
scaler = StandardScaler()

In [3]:
data = pd.read_csv('final_df_merged.csv')
df=data.copy()

In [4]:
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9984 entries, 0 to 9983
Data columns (total 85 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   season                     9984 non-null   int64  
 1   round                      9984 non-null   int64  
 2   driver                     9984 non-null   object 
 3   grid                       9984 non-null   int64  
 4   points                     9984 non-null   float64
 5   podium                     9984 non-null   int64  
 6   constructor_points         9984 non-null   int64  
 7   constructor_wins           9984 non-null   int64  
 8   constructor_standings_pos  9984 non-null   int64  
 9   driver_points              9984 non-null   int64  
 10  driver_wins                9984 non-null   int64  
 11  driver_standings_pos       9984 non-null   int64  
 12  qualifying_time            9984 non-null   float64
 13  weather_warm               9984 non-null   bool 

Unnamed: 0,season,round,driver,grid,points,podium,constructor_points,constructor_wins,constructor_standings_pos,driver_points,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
0,1990,1,berger,1,0.0,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1990,1,martini,2,0.0,7,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1990,1,alesi,4,6.0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1990,1,senna,5,9.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1990,1,piquet,6,3.0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Le but sera de tester plusieurs combinaison de variable explicative et choisir celle qui permettra d'avoir le meilleur 
score. On prendra comme critère (comme score) le pourcentage de course correctement prédites en 2020. 

In [13]:
# la fonction score: 

def score_regression(model,xvar):
    score=0
    les_circuits=df[df['season']==2020]["round"].unique()
    for circuit in les_circuits:
        test=df[(df['season']==2020) & (df['round']==circuit)]
        X_test=test[xvar]
        y_test=test["podium"]
        
        X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
        
        #On fait les prédictions
        
        prediction_df = pd.DataFrame(model.predict(X_test), columns = ['results'])
        prediction_df['podium'] = y_test.reset_index(drop = True)
        prediction_df['actual'] = prediction_df['podium'].map(lambda x: 1 if x == 1 else 0)
        prediction_df.sort_values('results', ascending = True, inplace = True)
        prediction_df.reset_index(inplace = True, drop = True)
        prediction_df['predicted'] = prediction_df.index
        prediction_df['predicted'] = prediction_df.predicted.map(lambda x: 1 if x == 0 else 0)
        
        score += precision_score(prediction_df['actual'],prediction_df['predicted'])
    
    model_score = score / df[df.season == 2020]['round'].unique().max()
    return model_score

In [None]:
print()

## Régression linéaire 

In [19]:
# On va calculer le score du modèle pour plusieurs combinaisons de features ( on peut l'écrire comme fonction du paramètre k)

xvar_total=df.columns.drop(["driver",'podium'])
xvar_random=[]
xvar_comparison={'xvar':[],'score':[]}

for i in range(10):
    xvar_random.append(random.choices(xvar_total,k=10))
    
for xvar in xvar_random:
    train = df[df["season"] <2020]
    X_train = train[xvar]
    y_train = train["podium"]

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    
    model = LinearRegression(fit_intercept = 'True')
    model.fit(X_train, y_train)
    
    model_score=score_regression(model,xvar)
    
    
    xvar_comparison['xvar'].append(xvar)
    xvar_comparison['score'].append(model_score)
    



On peut faire notre regression en prenant en compte toutes les features (Cependant il y en a surement certaines dont on aura pas la valeur avant la course pour la prédiction) 

In [13]:
train = df[df["season"] <2020]
X_train = train.drop(columns=["driver","podium"])
y_train = train["podium"]
    
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

model = LinearRegression(fit_intercept = 'True')
model.fit(X_train, y_train)

print(score_regression(model,train.columns.drop(["driver","podium"])))

0.8235294117647058


On Obtient 82% ce qui me paraît énorme... 

## SVM Classifier 

L'intérêt est de comparer cette méthode dans le cadre d'une regression avec celle utilisée dans le cadre d'une classification 

In [15]:
# On va calculer le score du modèle pour plusieurs combinaisons de features ( on peut l'écrire comme fonction du paramètre k)

train = df[df["season"] <2020]
X_train = train.drop(columns=["driver","podium"])
y_train = train["podium"]
    
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)


comparison_dict ={'model':[],
                  'params': [],
                  'score': []}

xvar_total=df.columns.drop(["driver",'podium'])
xvar=random.choices(xvar_total,k=2)


params={'gamma': np.logspace(-4, -1, 2),
        'C': np.logspace(-2, 1, 2),
       'kernel': ['linear', 'poly', 'rbf', 'sigmoid']} 

X_train = train[xvar]
y_train = train["podium"]

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

for gamma in params['gamma']:
    for c in params['C']:
        for kernel in params['kernel']:
            model_params = (gamma, c, kernel)
            model = svm.SVC(probability = True, gamma = gamma, C = c, kernel = kernel )
            model.fit(X_train, y_train)
            
            model_score = score_regression(model,xvar)
            
            comparison_dict['model'].append('svm_classifier')
            comparison_dict['params'].append(model_params)
            comparison_dict['score'].append(model_score)
    

Index(['season', 'round', 'grid', 'points', 'constructor_points',
       'constructor_wins', 'constructor_standings_pos', 'driver_points',
       'driver_wins', 'driver_standings_pos', 'qualifying_time',
       'weather_warm', 'weather_cold', 'weather_dry', 'weather_wet',
       'weather_cloudy', 'age', 'circuit_id_adelaide',
       'circuit_id_albert_park', 'circuit_id_americas', 'circuit_id_bahrain',
       'circuit_id_catalunya', 'circuit_id_estoril', 'circuit_id_galvez',
       'circuit_id_hockenheimring', 'circuit_id_hungaroring',
       'circuit_id_imola', 'circuit_id_indianapolis', 'circuit_id_interlagos',
       'circuit_id_istanbul', 'circuit_id_magny_cours',
       'circuit_id_marina_bay', 'circuit_id_monaco', 'circuit_id_monza',
       'circuit_id_nurburgring', 'circuit_id_osterreichring',
       'circuit_id_red_bull_ring', 'circuit_id_rodriguez', 'circuit_id_sepang',
       'circuit_id_shanghai', 'circuit_id_silverstone', 'circuit_id_sochi',
       'circuit_id_spa', 'circui

NameError: name 'train' is not defined

In [90]:
print(comparison_dict)

{'model': ['svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier', 'svm_classifier'], 'params': [(0.0001, 0.01, 'linear'), (0.0001, 0.01, 'poly'), (0.0001, 0.01, 'rbf'), (0.0001, 0.01, 'sigmoid'), (0.0001, 10.0, 'linear'), (0.0001, 10.0, 'poly'), (0.0001, 10.0, 'rbf'), (0.0001, 10.0, 'sigmoid'), (0.1, 0.01, 'linear'), (0.1, 0.01, 'poly'), (0.1, 0.01, 'rbf'), (0.1, 0.01, 'sigmoid'), (0.1, 10.0, 'linear'), (0.1, 10.0, 'poly'), (0.1, 10.0, 'rbf'), (0.1, 10.0, 'sigmoid')], 'score': [0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619, 0.