In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from tqdm import tqdm

class WaterQualityOptimizer:
    def __init__(self, filepath):
        self.filepath = filepath
        self.X, self.y = self.load_data()
        self.attributes = self.X.columns.tolist()
        self.best_solution = None
        self.best_score = -np.inf
    
    def load_data(self):
        """Charge et prépare les données brutes (avec valeurs manquantes)"""
        data = pd.read_csv(self.filepath)
        X = data.drop('Potability', axis=1)
        y = data['Potability']
        return X, y
    
    def impute_data(self, strategy_params):
        """Impute les données selon la stratégie spécifiée"""
        X_imputed = self.X.copy()
        
        strategy = strategy_params['strategy']
        fill_value = strategy_params.get('fill_value')
        
        if strategy == 'mean':
            imputer = SimpleImputer(strategy='mean')
        elif strategy == 'median':
            imputer = SimpleImputer(strategy='median')
        elif strategy == 'constant':
            imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
        else:
            imputer = SimpleImputer(strategy='most_frequent')
        
        X_imputed = pd.DataFrame(imputer.fit_transform(X_imputed), 
                               columns=X_imputed.columns)
        
        scaler = StandardScaler()
        X_imputed = pd.DataFrame(scaler.fit_transform(X_imputed), 
                               columns=X_imputed.columns)
        
        return X_imputed
    
    def evaluate_solution(self, imputation_params, feature_subset):
        """Évalue une solution complète"""
        try:
            X_imputed = self.impute_data(imputation_params)
            
            if len(feature_subset) == 0:
                return 0.0
                
            X_subset = X_imputed.iloc[:, feature_subset]
            
            X_train, X_test, y_train, y_test = train_test_split(
                X_subset, self.y, test_size=0.3, random_state=42)
            
            knn = KNeighborsClassifier(n_neighbors=5)
            knn.fit(X_train, y_train)
            return knn.score(X_test, y_test)
        except:
            return 0.0
    
    def SCA_generate_solutions(self, num_solutions=5):
        """Génère des solutions d'imputation avec SCA"""
        strategies = ['mean', 'median', 'most_frequent', 'constant']
        solutions = []
        
        for _ in range(num_solutions):
            strategy = np.random.choice(strategies)
            fill_value = np.random.uniform(self.X.min().min(), self.X.max().max()) if strategy == 'constant' else None
            
            solutions.append({
                'strategy': strategy,
                'fill_value': fill_value
            })
        
        return solutions
    
    def GWO_feature_selection(self, X_imputed, num_wolves=10, max_iter=30):
        """Effectue la sélection d'attributs avec GWO (version corrigée)"""
        num_features = X_imputed.shape[1]
        wolves = np.random.rand(num_wolves, num_features) > 0.5
        
        alpha = wolves[0].copy()
        alpha_fitness = -np.inf
        beta = wolves[1].copy()
        beta_fitness = -np.inf
        delta = wolves[2].copy()
        delta_fitness = -np.inf
        
        for _ in range(max_iter):
            a = 2 - _ * (2 / max_iter)
            
            for i in range(num_wolves):
                selected = np.where(wolves[i])[0]
                fitness = self.evaluate_solution({'strategy': 'mean'}, selected)
                
                if fitness > alpha_fitness:
                    delta = beta.copy()
                    delta_fitness = beta_fitness
                    beta = alpha.copy()
                    beta_fitness = alpha_fitness
                    alpha = wolves[i].copy()
                    alpha_fitness = fitness
                elif fitness > beta_fitness:
                    delta = beta.copy()
                    delta_fitness = beta_fitness
                    beta = wolves[i].copy()
                    beta_fitness = fitness
                elif fitness > delta_fitness:
                    delta = wolves[i].copy()
                    delta_fitness = fitness
            
            for i in range(num_wolves):
                for j in range(num_features):
                    # Mise à jour des positions
                    A1 = 2 * a * np.random.rand() - a
                    C1 = 2 * np.random.rand()
                    D_alpha = abs(C1 * alpha[j] - wolves[i,j])
                    X1 = alpha[j] - A1 * D_alpha
                    
                    A2 = 2 * a * np.random.rand() - a
                    C2 = 2 * np.random.rand()
                    D_beta = abs(C2 * beta[j] - wolves[i,j])
                    X2 = beta[j] - A2 * D_beta
                    
                    A3 = 2 * a * np.random.rand() - a
                    C3 = 2 * np.random.rand()
                    D_delta = abs(C3 * delta[j] - wolves[i,j])
                    X3 = delta[j] - A3 * D_delta
                    
                    wolves[i,j] = (X1 + X2 + X3) / 3
                
                # Conversion en binaire (version corrigée)
                wolves[i] = (1 / (1 + np.exp(-wolves[i].astype(float)))) > 0.5
        
        best_subset = np.where(alpha)[0]
        return best_subset, alpha_fitness
    
    def optimize(self, num_iterations=10, num_sca_solutions=5, num_gwo_wolves=10, gwo_iterations=20):
        """Processus d'optimisation complet"""
        for iteration in tqdm(range(num_iterations), desc="Optimisation"):
            imputation_solutions = self.SCA_generate_solutions(num_sca_solutions)
            
            for imp_sol in imputation_solutions:
                X_imputed = self.impute_data(imp_sol)
                feature_subset, score = self.GWO_feature_selection(
                    X_imputed, num_gwo_wolves, gwo_iterations)
                
                final_score = self.evaluate_solution(imp_sol, feature_subset)
                
                if final_score > self.best_score:
                    self.best_score = final_score
                    self.best_solution = {
                        'imputation': imp_sol,
                        'features': feature_subset,
                        'score': final_score
                    }
        
        print("\nOptimisation terminée!")
        print(f"Meilleur score: {self.best_score:.4f}")
        print(f"Imputation: {self.best_solution['imputation']}")
        print(f"Attributs: {[self.attributes[i] for i in self.best_solution['features']]}")
        
        return self.best_solution
    
    def evaluate_final_model(self):
        """Évalue le modèle final"""
        if not self.best_solution:
            print("Exécutez d'abord optimize()")
            return None
        
        X_imputed = self.impute_data(self.best_solution['imputation'])
        X_best = X_imputed.iloc[:, self.best_solution['features']]
        
        X_train, X_test, y_train, y_test = train_test_split(
            X_best, self.y, test_size=0.3, random_state=42)
        
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(X_train, y_train)
        
        print("\nPerformance finale:")
        print(f"Train accuracy: {knn.score(X_train, y_train):.4f}")
        print(f"Test accuracy: {knn.score(X_test, y_test):.4f}")
        
        return knn

# Exemple d'utilisation
if __name__ == "__main__":
    optimizer = WaterQualityOptimizer('water_potability.csv')
    best_solution = optimizer.optimize(
        num_iterations=9,
        num_sca_solutions=7,
        num_gwo_wolves=8,
        gwo_iterations=15
    )
    final_model = optimizer.evaluate_final_model()

Optimisation: 100%|██████████████████████████████████████████████████████████████████████| 9/9 [11:39<00:00, 77.73s/it]



Optimisation terminée!
Meilleur score: 0.6765
Imputation: {'strategy': 'mean', 'fill_value': None}
Attributs: ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Trihalomethanes', 'Turbidity']

Performance finale:
Train accuracy: 0.7532
Test accuracy: 0.6765
