# Structurer un workflow pour le déployer en production

<img src="../images/workflow.png" alt="image workflow">

In [None]:
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Load the "tips" example dataset through seaborn.
data = sns.load_dataset('tips')

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
data.info()

In [None]:
# Preview the first two rows.
data.head(2)

In [None]:
# Regression target: the "tip" column.
y = data.loc[:, 'tip']

In [None]:
# Features: every column except the target.
X = data.drop('tip', axis=1)

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lr = LinearRegression()

# Iteration 1 - categorical features (get_dummies)

In [None]:
# Display the training features (notebook cell output).
X_train

In [None]:
# Display the training target (notebook cell output).
y_train

In [None]:
# Left disabled on purpose -- presumably because X_train still holds the raw
# categorical columns (sex, smoker, day, time), which LinearRegression cannot
# fit directly; the features are dummy-encoded further down before fitting.
#lr.fit(X_train, y_train)

In [None]:
# Show 5 randomly drawn training rows.
X_train.sample(5)

In [None]:
# One-hot encode the training features with pandas.
X_train_dummies = pd.get_dummies(X_train)

In [None]:
# Show 4 randomly drawn rows of the encoded training features.
X_train_dummies.sample(4)

In [None]:
# Fit the regression on the dummy-encoded training features.
# fit() returns the estimator itself, so lr_train is just another name for lr.
lr.fit(X_train_dummies, y_train)
lr_train = lr

In [None]:
# Encode the test features, then align them on the training columns so the
# model always receives the same features in the same order. A dummy column
# that does not appear in the test split is filled with 0, and a column the
# model never saw during training is dropped.
X_test_dummies = pd.get_dummies(X_test).reindex(
    columns=X_train_dummies.columns, fill_value=0
)

In [None]:
# Coefficient of determination (R^2) of the fitted model on the test split.
lr_train.score(X_test_dummies, y_test)

# Iteration 2 - categorical + numeric features

In [None]:
# Fresh 70/30 split for this iteration. The fixed seed makes the split, and
# therefore the scaler/encoder statistics and the final score, reproducible.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Histogram of total_bill on the training split, with a KDE curve overlaid.
# The trailing semicolon hides the textual repr of the returned object.
sns.histplot(data=X_train_2[["total_bill"]], kde=True);

In [None]:
# Histogram of size on the training split, with a KDE curve overlaid.
sns.histplot(data=X_train_2[["size"]], kde=True)

## Scaler

### Numerical features

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn the min/max of the numeric columns on the training split only, then
# rescale those same columns. fit() returns the scaler, so it can be chained.
numeric_train = X_train_2[["size", "total_bill"]]
scaler = MinMaxScaler().fit(numeric_train)
X_train_2_scaled = scaler.transform(numeric_train)

In [None]:
# Wrap the scaled array in a DataFrame; column order follows the order used
# when scaling (size first, total_bill second).
X_train_2_scaled_df = pd.DataFrame(
    X_train_2_scaled,
    columns=["size_scaled", "total_bill_scaled"],
)

In [None]:
# Preview the first 20 raw training rows (before scaling).
X_train_2.head(20)

In [None]:
# Display the scaled numeric features.
X_train_2_scaled_df

### Categorical Features

In [None]:
# Count the distinct values of each categorical column in the training split.
X_train_2[["sex","smoker","day","time"]].nunique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder with scikit-learn's default settings.
ohe = OneHotEncoder()

In [None]:
# Fit the encoder on the training categorical columns and display the encoded
# result; the result is not stored here.
ohe.fit_transform(X_train_2[["sex","smoker","day","time"]])

In [None]:
# Fit the encoder on the training categorical columns, then encode them.
categorical_train = X_train_2[["sex", "smoker", "day", "time"]]
X_train_2_ohe = ohe.fit(categorical_train).transform(categorical_train)

In [None]:
# Wrap the sparse one-hot matrix in a DataFrame with sparse columns; the
# columns are labelled by position (0, 1, ...).
X_train_2_ohe_df = pd.DataFrame.sparse.from_spmatrix(X_train_2_ohe)

In [145]:
# Preview the first two one-hot encoded rows.
X_train_2_ohe_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# Put the one-hot columns and the scaled numeric columns side by side.
X_train_2_prep_df = pd.concat([X_train_2_ohe_df, X_train_2_scaled_df], axis=1)
# The one-hot columns are labelled with integers and the scaled ones with
# strings. scikit-learn only supports feature names when every column label is
# a string (mixed types raise a TypeError from version 1.2), so cast them all.
X_train_2_prep_df.columns = X_train_2_prep_df.columns.astype(str)

In [None]:
# Display the fully prepared training features.
X_train_2_prep_df

In [None]:
# New, unfitted regression model for the second iteration.
lr_2 = LinearRegression()

In [None]:
# Fit on the prepared (one-hot + scaled) training features.
lr_2.fit(X_train_2_prep_df,y_train_2)

In [147]:
# Prepare the test split with the transformers already fitted on the training
# split: only transform() is called here, never fit(), so no statistic is
# learned from the test data (no data leakage).
# TODO: factor the train/test preparation into shared helper functions.
X_test_2_scaled = scaler.transform(X_test_2[["size", "total_bill"]])
X_test_2_scaled_df = pd.DataFrame(
    X_test_2_scaled, columns=["size_scaled", "total_bill_scaled"]
)
X_test_2_ohe = ohe.transform(X_test_2[["sex", "smoker", "day", "time"]])
X_test_2_ohe_df = pd.DataFrame.sparse.from_spmatrix(X_test_2_ohe)
X_test_2_prep_df = pd.concat([X_test_2_ohe_df, X_test_2_scaled_df], axis=1)
# The one-hot columns are labelled with integers and the scaled ones with
# strings. scikit-learn only supports feature names when every column label is
# a string (mixed types raise a TypeError from version 1.2), so cast them all.
X_test_2_prep_df.columns = X_test_2_prep_df.columns.astype(str)
X_test_2_prep_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,size_scaled,total_bill_scaled
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.939394
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.2,0.053949
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.2,0.264692
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.2,0.230946
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.2,0.349862
...,...,...,...,...,...,...,...,...,...,...,...,...
69,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.6,0.303030
70,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.6,0.943067
71,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.2,-0.034435
72,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.2,0.055785


In [148]:
# Coefficient of determination (R^2) of lr_2 on the prepared test split.
lr_2.score(X_test_2_prep_df, y_test_2)



0.4517390901359032

In [None]:
# Afficher un nuage de points qui révèle les informations suivantes : y_true vs y_predict en fonction de total_bill


In [None]:
# Re-tester avec un StandardScaler => https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html



In [None]:
# Afficher l'hétéroscédasticité ou l'homoscédasticité des erreurs.

In [None]:
# Afficher la distribution des erreurs. (Normalité des erreurs).

In [None]:
# Créer un package à l'aide de setuptools : https://pypi.org/project/setuptools/