# Partie prédictive:
## Vous mettrez en place un modèle de régression linéaire qui permet d'estimer le temps de trajet en fonction de sa distance.

In [1]:
import sys
sys.path.append("../src")

import pandas as pd

from functions import get_model_metrics, get_r2_rmse
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Render floats in DataFrame output with thousands separators and 4 decimals.
pd.set_option("display.float_format", "{:,.4f}".format)

# Training data used by every model in this notebook.
# Alternative input kept for comparison: "../data/02_intermediate/dirty_train.csv"
df = pd.read_csv("../data/02_intermediate/train.csv")

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded data,
# used as a quick sanity check on value ranges before modelling.
df.describe()

Unnamed: 0,vendor_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_dayofweek,pickup_part4h,distance_km,km_per_hour
count,1432760.0,1432760.0,1432760.0,1432760.0,1432760.0,1432760.0,1432760.0,1432760.0,1432760.0,1432760.0
mean,1.5339,-73.9754,40.752,-73.9744,40.7522,803.7836,3.0508,3.0192,3.1304,14.1368
std,0.4989,0.0682,0.0285,0.0677,0.033,623.9046,1.9526,1.5744,3.1486,7.489
min,1.0,-121.9333,34.3597,-121.9333,34.3597,1.0,0.0,0.0,0.0,0.0
25%,1.0,-73.992,40.738,-73.9914,40.7363,393.0,1.0,2.0,1.2201,9.0
50%,2.0,-73.9819,40.7545,-73.9799,40.7546,652.0,3.0,3.0,2.0575,13.0
75%,2.0,-73.9679,40.7685,-73.9634,40.7699,1045.0,5.0,4.0,3.7257,18.0
max,2.0,-61.3355,43.9118,-61.3355,43.9118,39770.0,6.0,5.0,19.9999,150.0


### Régression linéaire à un paramètre

In [3]:
# Single-feature setup: trip duration explained by distance only.
df_features = ["distance_km"]
X = df.loc[:, df_features]
y = df["trip_duration"]

In [4]:
# Baseline: plain linear regression on distance alone. The project helper prints
# the coefficients, the intercept and the train/test R2 and RMSE (see cell output).
# NOTE(review): the train/test split presumably happens inside the helper - confirm.
get_model_metrics(LinearRegression(), X, y)


β1 de distance_km : 139.341
β0 (intercept_) : 367.681

Training set : R2 = 0.493, RMSE = 445.235
Testing  set : R2 = 0.496, RMSE = 439.209


## Modèle multi linéaire
### Modèle 1
#### Hypothèse à vérifier : absence de colinéarité

In [5]:
# Pairwise correlations between the target and the three candidate features.
corr_columns = ["trip_duration", "distance_km", "pickup_dayofweek", "pickup_part4h"]
df[corr_columns].corr()


Unnamed: 0,trip_duration,distance_km,pickup_dayofweek,pickup_part4h
trip_duration,1.0,0.7025,-0.0327,0.0299
distance_km,0.7025,1.0,0.0136,-0.0249
pickup_dayofweek,-0.0327,0.0136,1.0,-0.0853
pickup_part4h,0.0299,-0.0249,-0.0853,1.0


La matrice de corrélation ci-dessus nous apprend que :
- Les coefficients de corrélation entre variables explicatives sont très faibles (au plus 0.0853 en valeur absolue), donc largement inférieurs au seuil de 0.8.
- La distance est la variable la plus corrélée à la durée du trajet (0.7025), tandis que les deux autres variables ont un impact assez faible (|r| ≈ 0.03).
- **Hypothèse vérifiée** : absence de colinéarité entre les variables explicatives

### Choix des features :
- *distance* : indicateur principal pour calculer la durée du trajet
- *pickup_dayofweek* : on suppose que le jour a une influence sur l'état du trafic et donc sur la durée du trajet
- *pickup_part4h* : on suppose que le moment de la journée a une influence sur l'état du trafic et donc sur la durée du trajet

In [6]:
# Model 1: distance plus both time-related features.
df_features = ["distance_km", "pickup_dayofweek", "pickup_part4h"]
X = df[df_features]
y = df["trip_duration"]

## Évaluation du modèle
### LinearRegression()

In [7]:
# Unregularised reference fit for the current feature set; the helper prints
# one coefficient per feature, the intercept, and train/test R2 and RMSE.
get_model_metrics(LinearRegression(), X, y)


β1 de distance_km : 139.664
β1 de pickup_dayofweek : -12.314
β1 de pickup_part4h : 17.583
β0 (intercept_) : 351.128

Training set : R2 = 0.497, RMSE = 443.596
Testing  set : R2 = 0.499, RMSE = 437.603


#### Liste des alpha à utiliser pour les Lasso() et les Ridge()

In [8]:
# Regularisation strengths shared by every Lasso/Ridge sweep: 1, 201, ..., 1001.
alphas = list(range(1, 1002, 200))

### Lasso() avec différents alpha

In [9]:
# Sweep the Lasso penalty strength and print the metrics for each value.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    lasso = Lasso(alpha=alpha)
    get_model_metrics(lasso, X, y)
    print("\n")


Avec Alpha = 1
β1 de distance_km : 139.557
β1 de pickup_dayofweek : -12.075
β1 de pickup_part4h : 17.199
β0 (intercept_) : 351.896

Training set : R2 = 0.497, RMSE = 443.597
Testing  set : R2 = 0.499, RMSE = 437.601


Avec Alpha = 201
β1 de distance_km : 119.089
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 431.078

Training set : R2 = 0.483, RMSE = 449.783
Testing  set : R2 = 0.486, RMSE = 443.472


Avec Alpha = 401
β1 de distance_km : 98.938
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 494.16

Training set : R2 = 0.451, RMSE = 463.072
Testing  set : R2 = 0.455, RMSE = 456.558


Avec Alpha = 601
β1 de distance_km : 78.787
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 557.241

Training set : R2 = 0.4, RMSE = 484.383
Testing  set : R2 = 0.403, RMSE = 477.74


Avec Alpha = 801
β1 de distance_km : 58.636
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 620.323

Training set : R2 =

### Ridge() avec différents alpha

In [10]:
# Sweep the Ridge penalty strength and print the metrics for each value.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    ridge = Ridge(alpha=alpha)
    get_model_metrics(ridge, X, y)
    print("\n")

Avec Alpha = 1
β1 de distance_km : 139.664
β1 de pickup_dayofweek : -12.314
β1 de pickup_part4h : 17.583
β0 (intercept_) : 351.128

Training set : R2 = 0.497, RMSE = 443.596
Testing  set : R2 = 0.499, RMSE = 437.603


Avec Alpha = 201
β1 de distance_km : 139.662
β1 de pickup_dayofweek : -12.313
β1 de pickup_part4h : 17.581
β0 (intercept_) : 351.138

Training set : R2 = 0.497, RMSE = 443.596
Testing  set : R2 = 0.499, RMSE = 437.603


Avec Alpha = 401
β1 de distance_km : 139.659
β1 de pickup_dayofweek : -12.313
β1 de pickup_part4h : 17.58
β0 (intercept_) : 351.148

Training set : R2 = 0.497, RMSE = 443.596
Testing  set : R2 = 0.499, RMSE = 437.602


Avec Alpha = 601
β1 de distance_km : 139.657
β1 de pickup_dayofweek : -12.312
β1 de pickup_part4h : 17.579
β0 (intercept_) : 351.158

Training set : R2 = 0.497, RMSE = 443.596
Testing  set : R2 = 0.499, RMSE = 437.602


Avec Alpha = 801
β1 de distance_km : 139.654
β1 de pickup_dayofweek : -12.312
β1 de pickup_part4h : 17.578
β0 (intercept_) 

 # Modèle 2

In [11]:
# Model 2: distance and day of week (time-of-day feature left out).
df_features = ["distance_km", "pickup_dayofweek"]
X = df[df_features]
y = df["trip_duration"]

## Évaluation du modèle
### LinearRegression()

In [12]:
# Unregularised reference fit for model 2 (distance + day of week); the helper
# prints the coefficients, the intercept, and train/test R2 and RMSE.
get_model_metrics(LinearRegression(), X, y)

β1 de distance_km : 139.456
β1 de pickup_dayofweek : -13.514
β0 (intercept_) : 408.545

Training set : R2 = 0.495, RMSE = 444.452
Testing  set : R2 = 0.497, RMSE = 438.43


### Lasso() avec différents alpha

In [13]:
# Model 2 under Lasso: one fit and metrics report per penalty strength.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    lasso = Lasso(alpha=alpha)
    get_model_metrics(lasso, X, y)
    print("\n")


Avec Alpha = 1
β1 de distance_km : 139.353
β1 de pickup_dayofweek : -13.249
β0 (intercept_) : 408.06

Training set : R2 = 0.495, RMSE = 444.453
Testing  set : R2 = 0.498, RMSE = 438.428


Avec Alpha = 201
β1 de distance_km : 119.089
β1 de pickup_dayofweek : -0.0
β0 (intercept_) : 431.078

Training set : R2 = 0.483, RMSE = 449.783
Testing  set : R2 = 0.486, RMSE = 443.472


Avec Alpha = 401
β1 de distance_km : 98.938
β1 de pickup_dayofweek : -0.0
β0 (intercept_) : 494.16

Training set : R2 = 0.451, RMSE = 463.072
Testing  set : R2 = 0.455, RMSE = 456.558


Avec Alpha = 601
β1 de distance_km : 78.787
β1 de pickup_dayofweek : -0.0
β0 (intercept_) : 557.241

Training set : R2 = 0.4, RMSE = 484.383
Testing  set : R2 = 0.403, RMSE = 477.74


Avec Alpha = 801
β1 de distance_km : 58.636
β1 de pickup_dayofweek : -0.0
β0 (intercept_) : 620.323

Training set : R2 = 0.328, RMSE = 512.717
Testing  set : R2 = 0.331, RMSE = 506.004


Avec Alpha = 1001
β1 de distance_km : 38.486
β1 de pickup_dayofweek

### Ridge() avec différents alpha

In [14]:
# Model 2 under Ridge: one fit and metrics report per penalty strength.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    ridge = Ridge(alpha=alpha)
    get_model_metrics(ridge, X, y)
    print("\n")

Avec Alpha = 1
β1 de distance_km : 139.456
β1 de pickup_dayofweek : -13.514
β0 (intercept_) : 408.545

Training set : R2 = 0.495, RMSE = 444.452
Testing  set : R2 = 0.497, RMSE = 438.43


Avec Alpha = 201
β1 de distance_km : 139.453
β1 de pickup_dayofweek : -13.513
β0 (intercept_) : 408.55

Training set : R2 = 0.495, RMSE = 444.452
Testing  set : R2 = 0.497, RMSE = 438.429


Avec Alpha = 401
β1 de distance_km : 139.451
β1 de pickup_dayofweek : -13.513
β0 (intercept_) : 408.556

Training set : R2 = 0.495, RMSE = 444.452
Testing  set : R2 = 0.497, RMSE = 438.429


Avec Alpha = 601
β1 de distance_km : 139.448
β1 de pickup_dayofweek : -13.512
β0 (intercept_) : 408.562

Training set : R2 = 0.495, RMSE = 444.452
Testing  set : R2 = 0.497, RMSE = 438.429


Avec Alpha = 801
β1 de distance_km : 139.446
β1 de pickup_dayofweek : -13.511
β0 (intercept_) : 408.567

Training set : R2 = 0.495, RMSE = 444.452
Testing  set : R2 = 0.497, RMSE = 438.429


Avec Alpha = 1001
β1 de distance_km : 139.443
β1 

# Modèle 3

In [15]:
# Model 3: distance and time-of-day slot (day of week left out).
df_features = ["distance_km", "pickup_part4h"]
X = df[df_features]
y = df["trip_duration"]

## Évaluation du modèle
### LinearRegression()

In [16]:
# Unregularised reference fit for model 3 (distance + time-of-day slot); the
# helper prints the coefficients, the intercept, and train/test R2 and RMSE.
get_model_metrics(LinearRegression(), X, y)

β1 de distance_km : 139.576
β1 de pickup_part4h : 18.877
β0 (intercept_) : 309.934

Training set : R2 = 0.495, RMSE = 444.243
Testing  set : R2 = 0.498, RMSE = 438.246


### Lasso() avec différents alpha

In [17]:
# Model 3 under Lasso: one fit and metrics report per penalty strength.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    lasso = Lasso(alpha=alpha)
    get_model_metrics(lasso, X, y)
    print("\n")


Avec Alpha = 1
β1 de distance_km : 139.47
β1 de pickup_part4h : 18.468
β0 (intercept_) : 311.501

Training set : R2 = 0.495, RMSE = 444.244
Testing  set : R2 = 0.498, RMSE = 438.244


Avec Alpha = 201
β1 de distance_km : 119.089
β1 de pickup_part4h : 0.0
β0 (intercept_) : 431.078

Training set : R2 = 0.483, RMSE = 449.783
Testing  set : R2 = 0.486, RMSE = 443.472


Avec Alpha = 401
β1 de distance_km : 98.938
β1 de pickup_part4h : 0.0
β0 (intercept_) : 494.16

Training set : R2 = 0.451, RMSE = 463.072
Testing  set : R2 = 0.455, RMSE = 456.558


Avec Alpha = 601
β1 de distance_km : 78.787
β1 de pickup_part4h : 0.0
β0 (intercept_) : 557.241

Training set : R2 = 0.4, RMSE = 484.383
Testing  set : R2 = 0.403, RMSE = 477.74


Avec Alpha = 801
β1 de distance_km : 58.636
β1 de pickup_part4h : 0.0
β0 (intercept_) : 620.323

Training set : R2 = 0.328, RMSE = 512.717
Testing  set : R2 = 0.331, RMSE = 506.004


Avec Alpha = 1001
β1 de distance_km : 38.486
β1 de pickup_part4h : 0.0
β0 (intercept_) 

### Ridge() avec différents alpha

In [18]:
# Model 3 under Ridge: one fit and metrics report per penalty strength.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    ridge = Ridge(alpha=alpha)
    get_model_metrics(ridge, X, y)
    print("\n")

Avec Alpha = 1
β1 de distance_km : 139.576
β1 de pickup_part4h : 18.877
β0 (intercept_) : 309.935

Training set : R2 = 0.495, RMSE = 444.243
Testing  set : R2 = 0.498, RMSE = 438.246


Avec Alpha = 201
β1 de distance_km : 139.573
β1 de pickup_part4h : 18.875
β0 (intercept_) : 309.947

Training set : R2 = 0.495, RMSE = 444.243
Testing  set : R2 = 0.498, RMSE = 438.246


Avec Alpha = 401
β1 de distance_km : 139.571
β1 de pickup_part4h : 18.874
β0 (intercept_) : 309.959

Training set : R2 = 0.495, RMSE = 444.243
Testing  set : R2 = 0.498, RMSE = 438.246


Avec Alpha = 601
β1 de distance_km : 139.568
β1 de pickup_part4h : 18.872
β0 (intercept_) : 309.971

Training set : R2 = 0.495, RMSE = 444.243
Testing  set : R2 = 0.498, RMSE = 438.246


Avec Alpha = 801
β1 de distance_km : 139.566
β1 de pickup_part4h : 18.871
β0 (intercept_) : 309.983

Training set : R2 = 0.495, RMSE = 444.243
Testing  set : R2 = 0.498, RMSE = 438.246


Avec Alpha = 1001
β1 de distance_km : 139.563
β1 de pickup_part4h :

# Modèle 4

In [19]:
# Model 4: the two time-related features only, without distance.
df_features = ["pickup_dayofweek", "pickup_part4h"]
X = df[df_features]
y = df["trip_duration"]

## Évaluation du modèle
### LinearRegression()

In [20]:
# Unregularised reference fit for model 4 (day of week + time-of-day slot, no
# distance); the helper prints the coefficients, the intercept, and train/test
# R2 and RMSE.
get_model_metrics(LinearRegression(), X, y)

β1 de pickup_dayofweek : -9.686
β1 de pickup_part4h : 10.902
β0 (intercept_) : 800.502

Training set : R2 = 0.002, RMSE = 624.685
Testing  set : R2 = 0.002, RMSE = 617.928


### Lasso() avec différents alpha

In [21]:
# Model 4 under Lasso: one fit and metrics report per penalty strength.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    lasso = Lasso(alpha=alpha)
    get_model_metrics(lasso, X, y)
    print("\n")


Avec Alpha = 1
β1 de pickup_dayofweek : -9.45
β1 de pickup_part4h : 10.523
β0 (intercept_) : 800.925

Training set : R2 = 0.002, RMSE = 624.686
Testing  set : R2 = 0.002, RMSE = 617.928


Avec Alpha = 201
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 803.882

Training set : R2 = 0.0, RMSE = 625.251
Testing  set : R2 = -0.0, RMSE = 618.488


Avec Alpha = 401
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 803.882

Training set : R2 = 0.0, RMSE = 625.251
Testing  set : R2 = -0.0, RMSE = 618.488


Avec Alpha = 601
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 803.882

Training set : R2 = 0.0, RMSE = 625.251
Testing  set : R2 = -0.0, RMSE = 618.488


Avec Alpha = 801
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (intercept_) : 803.882

Training set : R2 = 0.0, RMSE = 625.251
Testing  set : R2 = -0.0, RMSE = 618.488


Avec Alpha = 1001
β1 de pickup_dayofweek : -0.0
β1 de pickup_part4h : 0.0
β0 (in

### Ridge() avec différents alpha

In [22]:
# Model 4 under Ridge: one fit and metrics report per penalty strength.
for alpha in alphas:
    print(f"Avec Alpha = {alpha}")
    ridge = Ridge(alpha=alpha)
    get_model_metrics(ridge, X, y)
    print("\n")

Avec Alpha = 1
β1 de pickup_dayofweek : -9.686
β1 de pickup_part4h : 10.902
β0 (intercept_) : 800.502

Training set : R2 = 0.002, RMSE = 624.685
Testing  set : R2 = 0.002, RMSE = 617.928


Avec Alpha = 201
β1 de pickup_dayofweek : -9.685
β1 de pickup_part4h : 10.901
β0 (intercept_) : 800.503

Training set : R2 = 0.002, RMSE = 624.685
Testing  set : R2 = 0.002, RMSE = 617.928


Avec Alpha = 401
β1 de pickup_dayofweek : -9.685
β1 de pickup_part4h : 10.9
β0 (intercept_) : 800.504

Training set : R2 = 0.002, RMSE = 624.685
Testing  set : R2 = 0.002, RMSE = 617.928


Avec Alpha = 601
β1 de pickup_dayofweek : -9.685
β1 de pickup_part4h : 10.9
β0 (intercept_) : 800.505

Training set : R2 = 0.002, RMSE = 624.685
Testing  set : R2 = 0.002, RMSE = 617.928


Avec Alpha = 801
β1 de pickup_dayofweek : -9.684
β1 de pickup_part4h : 10.899
β0 (intercept_) : 800.506

Training set : R2 = 0.002, RMSE = 624.685
Testing  set : R2 = 0.002, RMSE = 617.928


Avec Alpha = 1001
β1 de pickup_dayofweek : -9.684
β

## Conclusion

- On remarque que seule la distance a vraiment des répercussions sur la durée du trajet.
- Cela peut être dû au fait que les distances sont calculées à vol d'oiseau.
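- De manière générale, avec un alpha supérieur à 10 le modèle devient moins fiable — à confirmer : la grille testée (1, 201, 401, …) ne contient aucune valeur proche de 10. On observe en revanche que Lasso se dégrade quand alpha augmente (R² de test de 0.499 pour alpha = 1 à 0.331 pour alpha = 801), alors que Ridge reste quasiment insensible à alpha sur la plage testée.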