
## Table des matières

1. [Importation des Bibliothèques](#importation-des-bibliothèques)
2. [Chargement des Données](#chargement-des-données)
3. [Préparation des Données](#préparation-des-données)
4. [Transformation des Données](#transformation-des-données)
5. [Division des Données](#division-des-données)
6. [Sélection des Caractéristiques](#sélection-des-caractéristiques)
7. [Filtrage des Caractéristiques](#filtrage-des-caractéristiques)
8. [Entraînement et Enregistrement des Modèles](#entra%C3%AEnement-et-enregistrement-des-mod%C3%A8les)
9. [Validation du Modèle](#validation-du-mod%C3%A8le)

---

## Importation des Bibliothèques

Nous commençons par importer toutes les bibliothèques nécessaires pour le traitement des données, la modélisation et l'enregistrement des résultats.


In [1]:
import datetime

In [2]:
start_time = datetime.datetime.now()

print(start_time)

2024-08-07 11:42:13.263803


In [3]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from joblib import parallel_backend
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from mlflow.models import validate_serving_input, convert_input_example_to_serving_input
import math

import warnings
warnings.filterwarnings('ignore')

## Chargement des Données
Définissons une fonction pour charger les données depuis le fichier CSV.

In [4]:
data_path = "C:/Users/Asus_M/Desktop/data.csv"

def load_data(data_path):
    data = pd.read_csv(data_path, index_col=0)
    return data

data = load_data(data_path)
data.head()

Unnamed: 0,PASSENGERS,FREIGHT,MAIL,DISTANCE,UNIQUE_CARRIER,AIRLINE_ID,UNIQUE_CARRIER_NAME,REGION,CARRIER,CARRIER_NAME,...,DEST,DEST_CITY_NAME,DEST_COUNTRY,DEST_COUNTRY_NAME,DEST_WAC,YEAR,QUARTER,MONTH,DISTANCE_GROUP,CLASS
0,0.0,0.0,0.0,29.0,AMQ,20201,Ameristar Air Cargo,I,AMQ,Ameristar Air Cargo,...,YQG,"Windsor, Canada",CA,Canada,936,2010,2,6,1,P
1,0.0,0.0,0.0,29.0,AMQ,20201,Ameristar Air Cargo,I,AMQ,Ameristar Air Cargo,...,YIP,"Detroit, MI",US,United States,43,2010,1,3,1,P
2,0.0,0.0,0.0,29.0,AMQ,20201,Ameristar Air Cargo,I,AMQ,Ameristar Air Cargo,...,YIP,"Detroit, MI",US,United States,43,2010,2,6,1,P
3,0.0,0.0,0.0,29.0,AMQ,20201,Ameristar Air Cargo,I,AMQ,Ameristar Air Cargo,...,YIP,"Detroit, MI",US,United States,43,2010,3,8,1,P
4,0.0,0.0,0.0,29.0,AMQ,20201,Ameristar Air Cargo,I,AMQ,Ameristar Air Cargo,...,YIP,"Detroit, MI",US,United States,43,2010,3,9,1,P


---

## Préparation des Données
Préparons les données en nettoyant et en ajustant les types de données.

In [5]:
def prepare_data(data):
    data.columns = data.columns.str.replace(' ', '', regex=False)
    colonnes_avec_ID = [col for col in data.columns if 'ID' in col]
    data.drop(columns=colonnes_avec_ID, axis=1, inplace=True)
    
    # Conversion des colonnes catégorielles en objets
    cat_columns = ['CARRIER_GROUP', 'CARRIER_GROUP_NEW', 'ORIGIN_WAC', 'DEST_WAC', 'YEAR', 'QUARTER', 'MONTH', 'DISTANCE_GROUP']
    data[cat_columns] = data[cat_columns].astype(object)
    
    # Suppression des lignes avec PASSENGERS, FREIGHT, et MAIL tous égaux à 0
    lignes_zero_valeurs = data[(data['PASSENGERS'] == 0) & (data['FREIGHT'] == 0) & (data['MAIL'] == 0)].index
    data.drop(lignes_zero_valeurs, inplace=True)
    
    # Suppression des valeurs manquantes
    data.dropna(inplace=True)
    
    return data

prepared_data = prepare_data(data)
prepared_data.head()
prepared_data.shape

(54942, 26)

---

## Transformation des Données
Transformons les données en encodant les variables catégorielles et en appliquant une transformation logarithmique aux données numériques.

In [6]:
def transform_data(prepared_data):
    label_encoder = LabelEncoder()
    cat_data = prepared_data.select_dtypes(include='object')
    num_data = prepared_data.select_dtypes(exclude='object')
    
    # Transformation logarithmique
    num_data['Log_PASSENGERS'] = np.log1p(num_data['PASSENGERS'])
    num_data['Log_FREIGHT'] = np.log1p(num_data['FREIGHT'])
    num_data['Log_MAIL'] = np.log1p(num_data['MAIL'])
    num_data['Log_DISTANCE'] = np.log1p(num_data['DISTANCE'])
    num_data.drop(columns=['PASSENGERS', 'FREIGHT', 'MAIL', 'DISTANCE'], inplace=True)
    
    # Encodage des variables catégorielles
    for column in cat_data.columns:
        cat_data[column] = label_encoder.fit_transform(cat_data[column])
    
    data = pd.concat([num_data, cat_data], axis=1)
    return data

transformed_data = transform_data(prepared_data)
transformed_data.head()
transformed_data.shape
transformed_data.columns

Index(['Log_PASSENGERS', 'Log_FREIGHT', 'Log_MAIL', 'Log_DISTANCE',
       'UNIQUE_CARRIER', 'UNIQUE_CARRIER_NAME', 'REGION', 'CARRIER',
       'CARRIER_NAME', 'CARRIER_GROUP', 'CARRIER_GROUP_NEW', 'ORIGIN',
       'ORIGIN_CITY_NAME', 'ORIGIN_COUNTRY', 'ORIGIN_COUNTRY_NAME',
       'ORIGIN_WAC', 'DEST', 'DEST_CITY_NAME', 'DEST_COUNTRY',
       'DEST_COUNTRY_NAME', 'DEST_WAC', 'YEAR', 'QUARTER', 'MONTH',
       'DISTANCE_GROUP', 'CLASS'],
      dtype='object')

---

## Division des Données
Divisons les données en ensembles d'entraînement et de test.

In [7]:
def split_data(transformed_data):
    X = transformed_data.drop(columns=["Log_PASSENGERS", "Log_FREIGHT", "Log_MAIL"])
    y = transformed_data[["Log_PASSENGERS", "Log_FREIGHT", "Log_MAIL"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(transformed_data)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(43953, 23)
(43953, 3)
(10989, 23)
(10989, 3)


---

## Sélection des Caractéristiques
Utilisons la méthode SequentialFeatureSelector pour sélectionner les meilleures caractéristiques.

In [8]:
def select_features(X_train, y_train, k_features='best', n_jobs=-1, cv=3, scoring='neg_mean_squared_error', verbose=2):
    etr = ExtraTreesRegressor(n_jobs=n_jobs,random_state=42)
    sfs = SFS(
        etr,
        k_features=k_features,
        forward=False,
        floating=False,
        verbose=verbose,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs
    )
    
    # Perform feature selection using joblib for parallel processing
    with parallel_backend('threading', n_jobs=n_jobs):
        sfs = sfs.fit(X_train, y_train)
    
    # Get the selected feature indices
    selected_feature_indices = sfs.k_feature_idx_
    
    if isinstance(X_train, pd.DataFrame):
        feature_names = X_train.columns[list(selected_feature_indices)]
        return list(selected_feature_indices), feature_names.tolist()
    else:
        return list(selected_feature_indices)

selected_indices, selected_feature_names = select_features(X_train, y_train)
print("Indices des caractéristiques sélectionnées :", selected_indices)
print("Noms des caractéristiques sélectionnées :", selected_feature_names)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  23 | elapsed:  2.7min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done  23 out of  23 | elapsed:  4.1min finished

[2024-08-07 11:46:37] Features: 22/1 -- score: -0.749452776786056[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  22 | elapsed:  1.8min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  22 out of  22 | elapsed:  3.2min finished

[2024-08-07 11:49:49] Features: 21/1 -- score: -0.7465249047195583[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:  1.8min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  3.0min finished

[2024-08-07 11:52:47] Features: 20/1 -- score: -0.7443903165484197[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs

Indices des caractéristiques sélectionnées : [0, 3, 5, 7, 8, 9, 12, 13, 14, 18, 20, 21, 22]
Noms des caractéristiques sélectionnées : ['Log_DISTANCE', 'REGION', 'CARRIER_NAME', 'CARRIER_GROUP_NEW', 'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_WAC', 'DEST', 'DEST_CITY_NAME', 'YEAR', 'MONTH', 'DISTANCE_GROUP', 'CLASS']


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.3s finished

[2024-08-07 12:13:25] Features: 1/1 -- score: -3.8728223161012885

---

## Filtrage des Caractéristiques
Filtrons les données d'entraînement et de test pour ne conserver que les caractéristiques sélectionnées.

In [9]:
def filter_features(X_train, X_test, selected_indices):
    if isinstance(X_train, pd.DataFrame):
        return X_train.iloc[:, selected_indices], X_test.iloc[:, selected_indices]
    else:
        return X_train[:, selected_indices], X_test[:, selected_indices]

X_train_selected, X_test_selected = filter_features(X_train, X_test, selected_indices)
print("X_train avec les caractéristiques sélectionnées :", X_train_selected.shape)
print("X_test avec les caractéristiques sélectionnées :", X_test_selected.shape)
print("y_train :", y_train.shape)
print("y_test :", y_test.shape)

X_train avec les caractéristiques sélectionnées : (43953, 13)
X_test avec les caractéristiques sélectionnées : (10989, 13)
y_train : (43953, 3)
y_test : (10989, 3)


## Enregistrement des features dans un fichier TXT :
Aprés l'etape de selection ,nous enregistrons les données avec les features selectionnées dans un fichier txt.

In [10]:
columns = X_train_selected.columns.tolist()

with open('features.txt', 'w') as file:
    for column in columns:
        file.write(f"{column}\n")

print("Les noms des colonnes ont été enregistrés dans features.txt.")    

Les noms des colonnes ont été enregistrés dans features.txt.


---

## Entraînement et Enregistrement des Modèles
Entraînons plusieurs modèles et enregistrons-les avec MLflow.

In [11]:
# Définir l'URI de suivi et l'expérience MLflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment('experience1')

def train_and_log_model(model_name, model, X_train_selected, y_train, X_test_selected, y_test):
    with mlflow.start_run(run_name=model_name):
        # Entraîner le modèle
        model.fit(X_train_selected, y_train)
        
        # Prédictions
        predictions = model.predict(X_test_selected)
        
        # Calculer l'erreur quadratique moyenne et R2
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)  # Calculer RMSE
        r2 = r2_score(y_test, predictions)
        
        # Loguer les paramètres et les résultats
        mlflow.log_params(model.get_params())
        mlflow.log_metric("mean_squared_error", mse)
        mlflow.log_metric("root_mean_squared_error", rmse)
        mlflow.log_metric("R2", r2)
        
        # Loguer le modèle
        mlflow.sklearn.log_model(model, model_name)
        
        return r2, rmse

models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'ExtraTreesRegressor': ExtraTreesRegressor()
}

best_model_name = None
best_r2 = -np.inf
best_rmse = np.inf

for model_name, model in models.items():
    r2, rmse = train_and_log_model(model_name, model, X_train_selected, y_train, X_test_selected, y_test)
    print(f"Modèle: {model_name}, R2: {r2:.4f}, RMSE: {rmse:.4f}")
    
    # Mettre à jour le meilleur modèle basé sur R2 et RMSE
    if r2 > best_r2 and rmse < best_rmse:
        best_r2 = r2
        best_rmse = rmse
        best_model_name = model_name

if best_model_name:
    print(f"\nLe meilleur modèle est '{best_model_name}' avec R2 = {best_r2:.4f} et RMSE = {best_rmse:.4f}")

    # Enregistrer le meilleur modèle
    with mlflow.start_run(run_name="Best_Model_Production") as best_run:
        best_model = models[best_model_name]
        mlflow.sklearn.log_model(best_model, "best_model")
        mlflow.log_metric("R2", best_r2)
        mlflow.log_metric("root_mean_squared_error", best_rmse)
        
        # Enregistrer le modèle dans le registre MLflow
        model_uri = f"runs:/{best_run.info.run_id}/best_model"
        mlflow.register_model(model_uri, "Best_Model")

        # Déplacer le modèle vers le stage de production
        client = MlflowClient()
        latest_version = client.get_latest_versions("Best_Model", stages=["None"])[0].version
        client.transition_model_version_stage(
            name="Best_Model",
            version=latest_version,
            stage="Production"
        )

    print(f"Le modèle '{best_model_name}' a été enregistré et mis en production.")
else:
    print("Aucun modèle n'a été sélectionné.")


2024/08/07 12:13:28 INFO mlflow.tracking.fluent: Experiment with name 'experience1' does not exist. Creating a new experiment.
2024/08/07 12:13:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run LinearRegression at: http://localhost:5000/#/experiments/995444263188306398/runs/9b436f8930a743d3adfc4541593898b3.
2024/08/07 12:13:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/995444263188306398.


Modèle: LinearRegression, R2: 0.1832, RMSE: 2.9935


2024/08/07 12:14:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestRegressor at: http://localhost:5000/#/experiments/995444263188306398/runs/2ebdf0293b6e4e99a0ab0d715ffbf9b5.
2024/08/07 12:14:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/995444263188306398.


Modèle: RandomForestRegressor, R2: 0.9263, RMSE: 0.8188


2024/08/07 12:14:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNeighborsRegressor at: http://localhost:5000/#/experiments/995444263188306398/runs/ec5dac052297454b8e9e599f66e7e199.
2024/08/07 12:14:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/995444263188306398.


Modèle: KNeighborsRegressor, R2: 0.8440, RMSE: 1.2745


2024/08/07 12:14:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run ExtraTreesRegressor at: http://localhost:5000/#/experiments/995444263188306398/runs/7804aba9c89a4213ba39fa23753ea05f.
2024/08/07 12:14:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/995444263188306398.


Modèle: ExtraTreesRegressor, R2: 0.9262, RMSE: 0.8243

Le meilleur modèle est 'RandomForestRegressor' avec R2 = 0.9263 et RMSE = 0.8188


Successfully registered model 'Best_Model'.
2024/08/07 12:14:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best_Model, version 1
Created version '1' of model 'Best_Model'.
2024/08/07 12:14:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run Best_Model_Production at: http://localhost:5000/#/experiments/995444263188306398/runs/c54223f5e2cd4eeaafa34bdd9b77f31a.
2024/08/07 12:14:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/995444263188306398.


Le modèle 'RandomForestRegressor' a été enregistré et mis en production.


---

## Validation du Modèle
Vérifions que le modèle enregistré fonctionne correctement avant de le déployer.

In [12]:
import json
import pandas as pd
import numpy as np
import mlflow
from mlflow.tracking import MlflowClient

# Define MLflow tracking URI
mlflow.set_tracking_uri("http://localhost:5000")

def get_model_uri_from_stage(model_name, stage="Production"):
    client = MlflowClient()
    # Get the latest version of the model in the specified stage
    model_versions = client.get_latest_versions(model_name, stages=[stage])
    if not model_versions:
        raise ValueError(f"No version of model '{model_name}' found in stage '{stage}'.")
    
    # Get the model URI
    latest_version = model_versions[0].version
    model_uri = f"models:/{model_name}/{latest_version}"
    return model_uri

def convert_input_example_to_serving_input(input_example):
    # Convert DataFrame to NumPy array
    if isinstance(input_example, pd.DataFrame):
        input_example = input_example.values
    
    # Return as a list of lists (2D array) which is often expected by models
    return input_example.tolist()

def validate_serving_input(model_uri, serving_payload):
    # Load the model
    model = mlflow.pyfunc.load_model(model_uri)
    
    # Make a prediction
    prediction = model.predict(serving_payload)
    
    # Print the prediction
    prediction_original = np.exp(prediction)
    passengers = prediction_original[0][0]
    freight = prediction_original[0][1]
    mail = prediction_original[0][2]

    print(f"Le nombre de passagers prévu est : {math.floor(passengers)}")
    print(f"La quantité de Fret prévue : {math.floor(freight)}")
    print(f"La quantité de Courrier prévue est : {math.floor(mail)}")


# Example input data
INPUT_EXAMPLE = {
    "Log_DISTANCE": 7.723120,
    "REGION": 1,
    "CARRIER_NAME": 7,
    "CARRIER_GROUP_NEW": 2,
    "ORIGIN": 326,
    "ORIGIN_WAC": 48,
    "DEST": 412,
    "DEST_CITY_NAME": 351,
    "DEST_WAC": 58,
    "YEAR": 2,
    "MONTH": 6,
    "DISTANCE_GROUP": 4,
    "CLASS": 0
}


# Convert input example to DataFrame
input_df = pd.DataFrame([INPUT_EXAMPLE])

# Obtain the model URI
model_name = "Best_Model"  # Model name
model_uri = get_model_uri_from_stage(model_name, stage="Production")
print(f"Model URI in production: {model_uri}")

# Convert the input example to a serving input format
serving_payload = convert_input_example_to_serving_input(input_df)

# Validate the model with the example input
validate_serving_input(model_uri, serving_payload)


Model URI in production: models:/Best_Model/1


Downloading artifacts: 100%|██████████| 5/5 [00:46<00:00,  9.35s/it]


Le nombre de passagers prévu est : 50
La quantité de Fret prévue : 1
La quantité de Courrier prévue est : 1


In [13]:
end_time = datetime.datetime.now()

print(end_time)

2024-08-07 12:15:32.396282


In [14]:
temps_total_pour_execution = start_time - end_time

print(temps_total_pour_execution)

-1 day, 23:26:40.867521
