In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def load_and_process_data(data1, data2, data3, data4, common_key, target_column):
    """Load four semicolon-separated CSV files, join them and split off the target.

    The four tables are inner-joined left to right on ``common_key``. Columns
    that contain only NaN, or at most one distinct non-null value, are then
    discarded.

    NOTE(review): every join uses ``common_key`` alone. If two tables have
    several rows per key, the join yields every pairing of those rows --
    confirm this is intended for the input files.

    Args:
        data1, data2, data3, data4: Sources handed to ``pd.read_csv``, in join
            order.
        common_key: Name of the column used as the join key in every table.
        target_column: Name of the column returned as the target.

    Returns:
        Tuple ``(X, y)``: ``X`` is the merged frame without ``target_column``
        and ``common_key``; ``y`` is the ``target_column`` Series.
    """
    frames = [
        pd.read_csv(source, sep=";", low_memory=False)
        for source in (data1, data2, data3, data4)
    ]

    # Chain the inner joins in the order the files were given.
    merged_df = frames[0]
    for right in frames[1:]:
        merged_df = merged_df.merge(right, on=common_key, how="inner")

    print(f"Colonnes disponibles après fusion : {merged_df.columns.tolist()}")

    # Discard columns with no data at all, then those that never vary.
    merged_df = merged_df.dropna(axis=1, how='all')
    is_informative = merged_df.nunique() > 1
    merged_df = merged_df.loc[:, is_informative]

    target = merged_df[target_column]
    features = merged_df.drop(columns=[target_column, common_key])
    return features, target

def correlation_matrix(X, y, threshold=0.1):
    """Pick the numeric columns of ``X`` that correlate with the target.

    For each numeric column the correlation coefficient with ``y`` is computed
    with ``np.corrcoef``; a column is kept when the absolute value of that
    coefficient is strictly greater than ``threshold``.

    Args:
        X: Feature DataFrame.
        y: Target values, one per row of ``X``.
        threshold: Minimum absolute correlation (exclusive).

    Returns:
        List of the retained column names, in the column order of ``X``.
    """
    numeric_columns = X.select_dtypes(include=[np.number]).columns
    correlations = {
        name: np.corrcoef(X[name], y)[0, 1] for name in numeric_columns
    }
    selected = [
        name for name in numeric_columns if abs(correlations[name]) > threshold
    ]
    print(f"Colonnes numériques retenues (corrélation > {threshold}): {selected}")
    return selected

def categorical_analysis(X, y, threshold=0.2):
    """Pick the object-dtype columns whose categories separate the target.

    For each object column the target mean is computed per category; the column
    is kept when the variance of those per-category means is strictly greater
    than ``threshold``.

    Args:
        X: Feature DataFrame.
        y: Target values, attached to a copy of ``X`` as a ``'target'`` column.
        threshold: Minimum variance of the per-category target means (exclusive).

    Returns:
        List of the retained column names, in the column order of ``X``.
    """
    # Work on a copy carrying the target so the caller's frame is untouched.
    labelled = X.assign(target=y)

    selected = [
        name
        for name in X.select_dtypes(include='object').columns
        if labelled.groupby(name)['target'].mean().var() > threshold
    ]
    print(f"Colonnes catégorielles retenues (variance > {threshold}): {selected}")
    return selected

def low_variance_filter(X, threshold=0.01):
    """Pick the numeric columns of ``X`` whose variance exceeds ``threshold``.

    Args:
        X: Feature DataFrame; non-numeric columns are ignored.
        threshold: Minimum variance (exclusive).

    Returns:
        List of the retained column names, in the column order of ``X``.
    """
    variances = X.select_dtypes(include=[np.number]).var()
    selected = [name for name, value in variances.items() if value > threshold]
    print(f"Colonnes numériques retenues (variance > {threshold}): {selected}")
    return selected


def auto_handle_nan(df, nan_threshold_delete=0.5, nan_threshold_impute=0.1):
    """Drop or impute missing values column by column, logging each decision.

    Columns whose missing ratio is strictly above ``nan_threshold_delete`` are
    dropped. Each remaining column that still has missing values is filled:

    * object column, ratio above ``nan_threshold_impute``: the literal
      ``"Manquant"``;
    * object column, otherwise: the most frequent value;
    * other column, ratio above ``nan_threshold_impute``: the median;
    * other column, otherwise: the mean.

    Args:
        df: Input DataFrame.
        nan_threshold_delete: Missing ratio above which a column is dropped.
        nan_threshold_impute: Missing ratio separating the two fill strategies.

    Returns:
        The frame obtained after dropping and filling.
    """
    print("Analyse des NaN dans le dataset...\n")

    missing_ratio = df.isnull().mean()
    print("Pourcentage de valeurs manquantes par colonne :")
    print(missing_ratio)

    too_sparse = missing_ratio[missing_ratio > nan_threshold_delete].index
    print(f"\nColonnes supprimées (trop de NaN > {nan_threshold_delete*100}%): {list(too_sparse)}")
    df = df.drop(columns=too_sparse)

    for name in df.columns:
        series = df[name]
        if not series.isnull().sum() > 0:
            continue  # nothing to fill in this column

        heavily_missing = missing_ratio[name] > nan_threshold_impute
        if series.dtype == 'object':
            if heavily_missing:
                print(f"Colonne '{name}' : Imputation avec 'Manquant' (catégorielle)")
                fill_value = "Manquant"
            else:
                print(f"Colonne '{name}' : Imputation avec la valeur la plus fréquente (mode)")
                fill_value = series.mode()[0]
        elif heavily_missing:
            print(f"Colonne '{name}' : Imputation avec la médiane (numérique)")
            fill_value = series.median()
        else:
            print(f"Colonne '{name}' : Imputation avec la moyenne (numérique)")
            fill_value = series.mean()

        df[name] = series.fillna(fill_value)

    print("\nTraitement des NaN terminé.")
    return df


In [3]:
def preprocess_data(data1, data2, data3, data4, common_key, target_column):
    """Run the full preparation pipeline and return the selected features and target.

    Steps: load and merge the four CSV files, drop duplicated feature rows,
    handle missing values, then keep the numeric columns that pass the variance
    filter plus the categorical columns that pass the categorical analysis.

    Args:
        data1, data2, data3, data4: CSV sources forwarded to
            ``load_and_process_data``.
        common_key: Join key shared by the four tables.
        target_column: Name of the target column.

    Returns:
        Tuple ``(X_filtered, y)`` with matching rows: the selected feature
        columns (in their original column order) and the target Series.
    """
    print("Chargement et fusion des données...")
    X, y = load_and_process_data(data1, data2, data3, data4, common_key, target_column)

    print("\nSuppression des doublons...")
    print("Nombre de doublons avant suppression :", X.duplicated().sum())
    X = X.drop_duplicates()
    # Keep the target aligned with the surviving rows; otherwise X and y end up
    # with different lengths as soon as a duplicate is removed.
    y = y.loc[X.index]
    print("Nombre de doublons après suppression :", X.duplicated().sum())

    print("\nTraitement des valeurs NaN...")
    X = auto_handle_nan(X)

    print("\nSélection des colonnes numériques importantes...")
    # The correlation report is printed for information only: as before, its
    # result does not take part in the final column selection.
    correlation_matrix(X, y, threshold=0.1)
    numeric_cols_var = low_variance_filter(X, threshold=0.1)

    print("\nSélection des colonnes catégorielles importantes...")
    categorical_cols = categorical_analysis(X, y, threshold=2)

    # Follow X's own column order so the result is identical from run to run
    # (iterating a set of strings depends on per-process hash randomisation).
    wanted = set(numeric_cols_var) | set(categorical_cols)
    selected_columns = [col for col in X.columns if col in wanted]
    print(f"\nColonnes finales sélectionnées : {selected_columns}")

    X_filtered = X[selected_columns]

    print("\nRésumé des colonnes importantes pour la prédiction :")
    print(f"Nombre de colonnes finales : {len(X_filtered.columns)}")
    print(X_filtered.head())
    return X_filtered, y

In [None]:
import optuna
import numpy as np

class NeuralNetwork:
    """Feed-forward network with one hidden layer and sigmoid activations.

    Training is full-batch gradient descent on the squared error between the
    sigmoid output and the target. Weights are drawn from NumPy's global random
    generator, so seed it beforehand for reproducible runs.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden units.
        output_size: Number of output units.
        learning_rate: Step size applied to the summed batch gradient.
        epochs: Number of full passes over the training data.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01, epochs=500):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.epochs = epochs

        # Small random weights, zero biases.
        self.weights_input_hidden = np.random.randn(self.input_size, self.hidden_size) * 0.1
        self.bias_hidden = np.zeros((1, self.hidden_size))
        self.weights_hidden_output = np.random.randn(self.hidden_size, self.output_size) * 0.1
        self.bias_output = np.zeros((1, self.output_size))

    def sigmoid(self, z):
        """Logistic function, evaluated without floating-point overflow."""
        # Beyond +/-500 the sigmoid is already saturated; clipping only keeps
        # np.exp from overflowing on large-magnitude inputs.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, z):
        """Sigmoid derivative, given the sigmoid *output* ``z`` (not the pre-activation)."""
        return z * (1 - z)

    def _forward(self, X):
        """Return ``(hidden_layer_output, output)`` for the input batch ``X``."""
        hidden_layer_output = self.sigmoid(np.dot(X, self.weights_input_hidden) + self.bias_hidden)
        output = self.sigmoid(np.dot(hidden_layer_output, self.weights_hidden_output) + self.bias_output)
        return hidden_layer_output, output

    def fit(self, X, y):
        """Train the network in place.

        Args:
            X: Array-like of shape ``(n_samples, input_size)``.
            y: Array-like of targets; a flat vector is accepted when
                ``output_size`` is 1, otherwise ``(n_samples, output_size)``.

        Raises:
            ValueError: If ``y`` does not provide one row per sample of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Force a column layout: a flat y minus the (n, 1) network output
        # would broadcast to an (n, n) matrix instead of a per-sample error.
        y = np.asarray(y, dtype=float).reshape(-1, self.output_size)
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y provides {y.shape[0]} target rows"
            )

        for _ in range(self.epochs):
            hidden_layer_output, output = self._forward(X)

            error = y - output

            # Back-propagate the error through both sigmoid layers.
            d_output = error * self.sigmoid_derivative(output)
            d_hidden_layer = np.dot(d_output, self.weights_hidden_output.T) * self.sigmoid_derivative(hidden_layer_output)

            # error is (target - output), so adding the term descends the loss.
            self.weights_hidden_output += np.dot(hidden_layer_output.T, d_output) * self.learning_rate
            self.bias_output += np.sum(d_output, axis=0, keepdims=True) * self.learning_rate
            self.weights_input_hidden += np.dot(X.T, d_hidden_layer) * self.learning_rate
            self.bias_hidden += np.sum(d_hidden_layer, axis=0, keepdims=True) * self.learning_rate

    def predict(self, X):
        """Return the rounded sigmoid outputs, shape ``(n_samples, output_size)``."""
        _, output = self._forward(np.asarray(X, dtype=float))
        return np.round(output)

class LogisticRegression:
    """Binary logistic regression trained by full-batch gradient descent.

    Args:
        learning_rate: Step size applied to the mean gradient.
        epochs: Number of gradient steps.
    """

    def __init__(self, learning_rate=0.01, epochs=500):
        self.learning_rate = learning_rate
        self.epochs = epochs

    @staticmethod
    def _sigmoid(z):
        """Logistic function, evaluated without floating-point overflow."""
        # Beyond +/-500 the sigmoid is already saturated; clipping only keeps
        # np.exp from overflowing on large-magnitude inputs.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like of targets; flattened to one value per sample.

        Raises:
            ValueError: If the training set is empty or ``X`` and ``y`` do not
                hold the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so that a column-shaped y cannot broadcast against the flat
        # prediction vector and turn the residual into an (n, n) matrix.
        y = np.asarray(y, dtype=float).ravel()
        m = len(y)
        if m == 0:
            raise ValueError("cannot fit on an empty training set")
        if X.shape[0] != m:
            raise ValueError(f"X has {X.shape[0]} samples but y has {m}")

        self.weights = np.zeros(X.shape[1])
        self.bias = 0.0

        for _ in range(self.epochs):
            predictions = self._sigmoid(np.dot(X, self.weights) + self.bias)
            residual = predictions - y

            # Mean gradient of the log-loss with respect to weights and bias.
            dw = (1 / m) * np.dot(X.T, residual)
            db = (1 / m) * np.sum(residual)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return the rounded predicted probabilities, one per row of ``X``."""
        linear_model = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
        return np.round(self._sigmoid(linear_model))


def train_test_split(X, y, test_size=0.2, random_state=None):
    """Shuffle the samples and split them into a training and a test part.

    Args:
        X: Features, a pandas object or NumPy array with one row per sample.
        y: Targets, a pandas object or NumPy array with one entry per sample.
        test_size: Fraction of the samples assigned to the test part.
        random_state: Seed for the shuffle. When given (``0`` included) it
            reseeds NumPy's global random generator; ``None`` leaves the
            generator state as it is.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """
    # Compare against None: a plain truthiness test would ignore seed 0.
    if random_state is not None:
        np.random.seed(random_state)

    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(f"X has {n_samples} samples but y has {len(y)}")

    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    split_idx = int(n_samples * (1 - test_size))
    train_indices = indices[:split_idx]
    test_indices = indices[split_idx:]

    # Positional indexing below needs plain arrays, whatever pandas type came in.
    pandas_types = (pd.DataFrame, pd.Series)
    X = X.to_numpy() if isinstance(X, pandas_types) else X
    y = y.to_numpy() if isinstance(y, pandas_types) else y

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]


def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened first, so a column of predictions with shape
    ``(n, 1)`` can be scored against a flat label vector of shape ``(n,)``.
    Without flattening, NumPy would broadcast the comparison to an ``(n, n)``
    matrix and the mean would no longer be an accuracy.

    Args:
        y_true: Array-like of true labels.
        y_pred: Array-like of predicted labels, same number of elements.

    Returns:
        Mean of the element-wise equality, between 0 and 1.

    Raises:
        ValueError: If the two inputs do not hold the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
        )
    return np.mean(y_true == y_pred)

def _build_model(model_name, params, input_size):
    """Instantiate the model called ``model_name`` from a hyperparameter dict."""
    if model_name == "NeuralNetwork":
        return NeuralNetwork(
            input_size=input_size,
            hidden_size=params["hidden_size"],
            output_size=1,
            learning_rate=params["learning_rate"],
            epochs=params["epochs"],
        )
    if model_name == "LogisticRegression":
        return LogisticRegression(
            learning_rate=params["learning_rate"],
            epochs=params["epochs"],
        )
    raise ValueError(f"Unknown model name: {model_name!r}")


def optimize_model(trial, model_name, X_train, y_train, X_val, y_val):
    """Optuna objective: sample hyperparameters, train, return validation accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        model_name: ``"NeuralNetwork"`` or ``"LogisticRegression"``.
        X_train, y_train: Training data.
        X_val, y_val: Validation data used for scoring.

    Returns:
        Accuracy of the trained model on the validation data.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    if model_name == "NeuralNetwork":
        params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "hidden_size": trial.suggest_int("hidden_size", 5, 50),
            "epochs": trial.suggest_int("epochs", 100, 1000),
        }
    elif model_name == "LogisticRegression":
        params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "epochs": trial.suggest_int("epochs", 100, 1000),
        }
    else:
        raise ValueError(f"Unknown model name: {model_name!r}")

    model = _build_model(model_name, params, X_train.shape[1])
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return accuracy_score(y_val, predictions)


def auto_ml(X, y, models, n_trials=30):
    """Tune every candidate model with Optuna and return the best one, fitted.

    The data is split 80/20 with a fixed seed. One study of ``n_trials`` trials
    is run per model name, and the name and hyperparameters with the highest
    validation accuracy are kept. A fresh model is then built from those
    hyperparameters and fitted on the training split.

    Args:
        X: Feature matrix.
        y: Target vector.
        models: Iterable of model names understood by ``optimize_model``.
        n_trials: Number of Optuna trials per model.

    Returns:
        Tuple ``(final_model, best_model_name, best_params, best_score)``.
        ``final_model`` is ``None`` when ``models`` is empty. ``best_score`` is
        the validation accuracy of the best trial; because the final model is
        retrained, a randomly initialised model may score slightly differently.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    best_model_name = None
    best_params = None
    best_score = 0

    for model_name in models:
        # Bind the current name as a default so the closure cannot pick up a
        # later loop value.
        def objective(trial, model_name=model_name):
            return optimize_model(trial, model_name, X_train, y_train, X_val, y_val)

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)

        # Always accept the first candidate, even when its accuracy is 0.
        if best_model_name is None or study.best_value > best_score:
            best_model_name = model_name
            best_params = study.best_params
            best_score = study.best_value

    final_model = None
    if best_model_name is not None:
        # Rebuild from the winning hyperparameters; the study only keeps
        # parameters and scores, not the trained model objects.
        final_model = _build_model(best_model_name, best_params, X_train.shape[1])
        final_model.fit(X_train, y_train)

    return final_model, best_model_name, best_params, best_score

if __name__ == "__main__":
    # Prepare the features and target from the four 2023 CSV files, joined on
    # "Num_Acc" with "grav" as the column to predict.
    X, y = preprocess_data(
        "data/caract-2023.csv",
        "data/lieux-2023.csv",
        "data/usagers-2023.csv",
        "data/vehicules-2023.csv",
        "Num_Acc",
        "grav",
    )

    # Candidate models handed to the hyperparameter search.
    models = ["NeuralNetwork", "LogisticRegression"]

    final_model, best_model_name, best_params, best_score = auto_ml(X, y, models)

    # Report the outcome of the search.
    print("Meilleur modèle :", best_model_name)
    print("Meilleurs hyperparamètres :", best_params)
    print("Meilleure précision :", best_score)

  from .autonotebook import tqdm as notebook_tqdm


Chargement et fusion des données...
Colonnes disponibles après fusion : ['Num_Acc', 'jour', 'mois', 'an', 'hrmn', 'lum', 'dep', 'com', 'agg', 'int', 'atm', 'col', 'adr', 'lat', 'long', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ', 'vma', 'id_usager', 'id_vehicule_x', 'num_veh_x', 'place', 'catu', 'grav', 'sexe', 'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp', 'id_vehicule_y', 'num_veh_y', 'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'occutc']

Suppression des doublons...
Nombre de doublons avant suppression : 0
Nombre de doublons après suppression : 0

Traitement des valeurs NaN...
Analyse des NaN dans le dataset...

Pourcentage de valeurs manquantes par colonne :
jour             0.000000
mois             0.000000
hrmn             0.000000
lum              0.000000
dep              0.000000
com              0.000000
agg              0.000000
int              0.000000
atm

[I 2024-12-18 11:37:15,275] A new study created in memory with name: no-name-62c0527c-ee10-49f0-8873-1ad4ae631c97
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-1)
