# <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Régression linéaire</div></b>

In [10]:
import pandas as pd, numpy as np, seaborn as sns, warnings, os, sys, time, copy as cp
from datetime import datetime as dt
from matplotlib import pyplot as plt

import matplotlib.font_manager as fm
import plotly.express as px
import plotly.graph_objs as go

font1 = fm.FontProperties(size=20)
font2 = fm.FontProperties(size=24)

warnings.filterwarnings(action="ignore")

if int(str(sns.__version__).split('.')[1]) > 8 : 
    plt.style.use('seaborn-v0_8-darkgrid')
else:
    plt.style.use('seaborn-darkgrid')
    # plt.style.use('seaborn-poster')
sns.set(font_scale=3)
np.random.seed(123456789)

In [11]:
donnees = pd.read_csv('../donnees/Regressions/50_Startups.csv')

In [12]:
all_states = donnees['State'].unique()

In [13]:
donnees = pd.concat([
    donnees.drop(columns='State'),
    pd.get_dummies(donnees['State'], drop_first=True, prefix='State_', dtype=int)
], axis=1)

In [15]:
donnees.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State__Florida,State__New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
cible = 'Profit'
X, y = donnees.drop(columns=cible), donnees[cible]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Régression Linéaire</div></b>

Voici le code équivalent lorsqu’on utilise **Scikit-Learn**

L’équation normale calcule l’inverse de $X^{T} ⋅ X$, qui est une matrice $(n + 1) × (n + 1)$ (où n est le nombre de variables). La complexité algorithmique d’une inversion de matrice de taille n × n se situe entre $O(n^{2,4})$ et $O(n^{3})$, selon l’algorithme d’inversion utilisé. Autrement dit, si vous doublez le nombre de variables, le temps de calcul est grosso modo multiplié par un facteur compris entre $2^{2,4} = 5,3$ et $2^{3} = 8$.

In [20]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lin_reg.intercept_, lin_reg.coef_

(np.float64(42554.16761773237),
 array([ 7.73467193e-01,  3.28845975e-02,  3.66100259e-02, -9.59284160e+02,
         6.99369053e+02]))

In [21]:
lin_reg.intercept_ + np.dot(X_test.iloc[0], lin_reg.coef_)

np.float64(103015.2015979618)

In [22]:
y_pred = lin_reg.predict(X_test)

## <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Choix de la métrique de performance</div></b>

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Erreur moyenne absolute</div></b>

$MAE = \frac{1}{n} \sum_{i=1}^n \left| y_i - f(x_i)\right|$

In [23]:
from sklearn.metrics import median_absolute_error
median_absolute_error(y, lin_reg.predict(X))

np.float64(4639.750009102532)

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Erreur quadratique moyenne(mean absolute error)</div></b>

$MSE = \frac{1}{n} \sum_{i=1}^n (y_i-f(x_i))^2$

In [24]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y, lin_reg.predict(X))

6576.087338393607

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Racine quarrée de l'erreur quadratique moyenne(mean squared error)</div></b>

$RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^n (y_i-f(x_i))^2}$

In [25]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y, lin_reg.predict(X))

81957374.24713448


### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>Erreur logarithmique quadratique moyenne(mean squared logarithmic error)</div></b>


$RMSLE = \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(y_i+1) - \log(f(x_i)+1))^2}$

In [26]:
from sklearn.metrics import mean_squared_log_error
mean_squared_log_error(y, lin_reg.predict(X))

0.035901084392585554

### <b><div style='padding:15px;background-color:#d8dcd6;color:#030aa7;font-size:120%;text-align: left'>$R^{2}$ coefficient de détermination linéaire de Pearson</div></b>

$RSE = \frac{\sum_{i=1}^n (y_i - f(x_i))^2}{\sum_{i=1}^n (y_i - \bar{y})^2}$ avec $\bar{y} = \frac{1}{n} \sum_{i=1}^n y_i$  

$R = \frac{\sum_{i=1}^n (y_i - \bar y) (f(x_i) - \overline{f(x)})}{\sqrt{\sum_{i=1}^n (y_i - \bar y)^2} \sqrt{\sum_{i=1}^n (f(x_i) - \overline{f(x)})^2 }}$

In [27]:
from sklearn.metrics import r2_score
print(r2_score(y, lin_reg.predict(X)))
print(r2_score(y, lin_reg.predict(X), multioutput='variance_weighted'))
print(r2_score(y, lin_reg.predict(X), multioutput='uniform_average'))
print(r2_score(y, lin_reg.predict(X), multioutput='raw_values'))
print(np.sqrt(r2_score(y, lin_reg.predict(X))))

0.9485223547171558
0.948522354717156
0.9485223547171558
[0.94852235]
0.9739211234577243
