# Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn

ModuleNotFoundError: No module named 'tensorflow'

# Data

In [None]:
# Load the Titanic passenger table from the project's data directory.
df = pd.read_csv('../data/Titanic.csv')
# Preview only the first rows instead of rendering the whole frame.
df.head()

# Preprocessing

In [None]:
# Column groups: each group is routed to its own preprocessing sub-pipeline.
numeric_features = ['age', 'sibsp', 'parch', 'fare']
categorical_features = ['sex', 'embarked', 'class', 'who', 'alone']

# Predictor columns and the label to learn.
features = df[['sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'alone']]
target = df['survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Model 

In [None]:
# Candidate classifiers, keyed by the name used in reports and MLflow runs.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    # Seeded so the forest is reproducible from run to run, in line with the
    # seeded train/test split (random_state=42).
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC()
}

In [None]:
# Hyper-parameter grids, one per model name. The 'classifier__' prefix targets
# the pipeline step named 'classifier'.
param_grids = dict(
    LogisticRegression=dict(
        classifier__C=[0.01, 0.1, 1, 10, 100],
    ),
    RandomForest=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__max_depth=[None, 10, 20, 30],
    ),
    SVM=dict(
        classifier__C=[0.1, 1, 10],
        classifier__gamma=[1, 0.1, 0.01],
        classifier__kernel=['rbf', 'poly', 'sigmoid'],
    ),
)

In [None]:
# Point MLflow at the tracking server and select the experiment that the
# model-comparison runs are recorded under.
# NOTE(review): '<workspace_url>' looks like an unfilled placeholder — confirm
# the real tracking URI before running.
mlflow.set_tracking_uri('azureml://<workspace_url>')
mlflow.set_experiment('Titanic_Survival_Prediction_Comparison')

In [None]:
# Accumulators for the comparison loop: tuned pipelines keyed by model name,
# and one summary row per model.
best_models, results = dict(), list()

In [None]:
# Reset the accumulators here so that re-running this cell rebuilds the results
# from scratch instead of appending a second copy of every row.
best_models = {}
results = []

for model_name in models:
    # Chain preprocessing and the classifier so the grid search tunes and
    # cross-validates them as a single estimator.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', models[model_name])
    ])

    # 5-fold cross-validated grid search over this model's parameter grid.
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_models[model_name] = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best cross-validation accuracy for {model_name}: {best_score}")

    # Evaluate the tuned pipeline on the held-out test split.
    y_pred = best_models[model_name].predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy for {model_name}: {test_accuracy}")
    print(f"Classification report for {model_name}:\n {classification_report(y_test, y_pred)}")

    # Record one summary row per model.
    results.append({
        'Model': model_name,
        'Best Parameters': best_params,
        'Cross-validation Accuracy': best_score,
        'Test Accuracy': test_accuracy
    })

    # Track the tuned pipeline, its parameters and its scores in an MLflow run
    # named after the model.
    with mlflow.start_run(run_name=model_name):
        mlflow.sklearn.log_model(best_models[model_name], model_name)
        mlflow.log_params(best_params)
        mlflow.log_metric('cv_accuracy', best_score)
        mlflow.log_metric('test_accuracy', test_accuracy)
        print(f"Logged {model_name} to MLFlow with test accuracy: {test_accuracy}")

# Summary table: one row per model.
results_df = pd.DataFrame(results)
results_df

# MLflow configuration

In [None]:
# Point MLflow at the tracking server and select the experiment for this run.
# NOTE(review): '<workspace_url>' looks like an unfilled placeholder — confirm
# the real tracking URI before running.
mlflow.set_tracking_uri('azureml://<workspace_url>')
mlflow.set_experiment('Titanic_Survival_Prediction')

# Start a new MLflow run; the model and metrics below are logged to it.
# NOTE(review): no cell above this one defines `model`, and the fit arguments
# (epochs, batch_size, validation_data) look like the Keras API — confirm where
# the model is meant to be built. X_train/X_test here are the raw split frames,
# not the output of `preprocessor`, and the test split serves as both the
# validation data and the final evaluation set — verify that this is intended.
with mlflow.start_run():
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))
    mlflow.tensorflow.log_model(model, 'model')
    loss, accuracy = model.evaluate(X_test, y_test)
    mlflow.log_metric('loss', loss)
    mlflow.log_metric('accuracy', accuracy)

# Azure deployment

In [None]:
from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig

# Connect to the Azure ML workspace described by the local configuration.
ws = Workspace.from_config()

# Register the model artifact in the workspace.
# NOTE(review): 'path/to/model' looks like an unfilled placeholder — confirm the
# real artifact path before running.
model = Model.register(workspace=ws, model_name='titanic_survival_model', model_path='path/to/model')

# Build the inference environment. 'azureml-defaults' is included because Azure ML
# needs it in the image to host the scoring script as a web service.
env = Environment(name='titanic-env')
python_packages = ['azureml-defaults', 'tensorflow', 'scikit-learn', 'pandas']
for package in python_packages:
    env.python.conda_dependencies.add_pip_package(package)

# Inference configuration: entry script plus the environment it runs in.
inference_config = InferenceConfig(entry_script='score.py', environment=env)

# Deployment target: an Azure Container Instance with 1 CPU core and 1 GB of memory.
aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the web service and block until the deployment finishes.
service = Model.deploy(workspace=ws, name='titanic-survival-service', models=[model], inference_config=inference_config, deployment_config=aci_config)
service.wait_for_deployment(show_output=True)


# Scoring

In [None]:
import json
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from azureml.core.model import Model

def init():
    """Load the scoring model into the module-level ``model`` variable.

    The path of 'titanic_survival_model' is resolved with
    ``Model.get_model_path`` and the model is loaded with
    ``tf.keras.models.load_model``.
    """
    global model
    model = tf.keras.models.load_model(Model.get_model_path('titanic_survival_model'))

def run(raw_data):
    """Score a JSON request with the module-level ``model``.

    Args:
        raw_data: JSON text with a 'data' entry, which is converted to a
            NumPy array.

    Returns:
        JSON text holding the output of ``model.predict`` as nested lists.
    """
    data = np.array(json.loads(raw_data)['data'])
    # NOTE(review): a new StandardScaler is fitted on the request payload itself,
    # so the scaling statistics come from the request rather than from the
    # training data (a single-row request would be centred to all zeros).
    # Confirm whether a scaler fitted at training time should be loaded and
    # applied here instead.
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    predictions = model.predict(data)
    return json.dumps(predictions.tolist())


In [None]:

# Candidate classifiers, keyed by the name used in reports and MLflow runs.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    # Seeded so the forest is reproducible from run to run, in line with the
    # seeded train/test split (random_state=42).
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC()
}

# Hyper-parameter grids, one per model name. The 'classifier__' prefix targets
# the pipeline step named 'classifier'.
param_grids = dict(
    LogisticRegression=dict(
        classifier__C=[0.01, 0.1, 1, 10, 100],
    ),
    RandomForest=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__max_depth=[None, 10, 20, 30],
    ),
    SVM=dict(
        classifier__C=[0.1, 1, 10],
        classifier__gamma=[1, 0.1, 0.01],
        classifier__kernel=['rbf', 'poly', 'sigmoid'],
    ),
)

# Set the MLflow tracking URI and select the experiment that the
# model-comparison runs are recorded under.
# NOTE(review): '<workspace_url>' looks like an unfilled placeholder — confirm
# the real tracking URI before running.
mlflow.set_tracking_uri('azureml://<workspace_url>')
mlflow.set_experiment('Titanic_Survival_Prediction_Comparison')

# Iterate over the candidate models and run a grid search for each one.
# NOTE(review): this repeats the model comparison already run by earlier cells
# of this notebook, so running both repeats the grid search and logs a second
# set of MLflow runs — confirm whether both copies are needed.
best_models = {}
results = []

for model_name in models:
    # Chain preprocessing and the classifier so the grid search tunes and
    # cross-validates them as a single estimator.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', models[model_name])
    ])

    # 5-fold cross-validated grid search over this model's parameter grid.
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_models[model_name] = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best cross-validation accuracy for {model_name}: {best_score}")

    # Evaluate the tuned pipeline on the held-out test split.
    y_pred = best_models[model_name].predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy for {model_name}: {test_accuracy}")
    print(f"Classification report for {model_name}:\n {classification_report(y_test, y_pred)}")

    # Record one summary row per model.
    results.append({
        'Model': model_name,
        'Best Parameters': best_params,
        'Cross-validation Accuracy': best_score,
        'Test Accuracy': test_accuracy
    })

    # Track the tuned pipeline, its parameters and its scores in an MLflow run
    # named after the model.
    with mlflow.start_run(run_name=model_name):
        mlflow.sklearn.log_model(best_models[model_name], model_name)
        mlflow.log_params(best_params)
        mlflow.log_metric('cv_accuracy', best_score)
        mlflow.log_metric('test_accuracy', test_accuracy)
        print(f"Logged {model_name} to MLFlow with test accuracy: {test_accuracy}")

# Show the detailed results as a rich table (last expression of the cell)
# rather than as printed plain text.
results_df = pd.DataFrame(results)
results_df
