In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [2]:
# Load the breast cancer dataset
data = load_breast_cancer()
X, y = data.data, data.target

In [3]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Define a function to evaluate the fitness of a feature subset
def fitness(features):
    X_train_subset = X_train[:, features]
    X_test_subset = X_test[:, features]
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train_subset, y_train)
    return clf.score(X_test_subset, y_test)

In [5]:
# Define the genetic algorithm parameters
population_size = 100
num_generations = 20
mutation_rate = 0.05

In [6]:
# Initialize the population randomly
population = np.random.randint(2, size=(population_size, X.shape[1]))

In [7]:
# Evaluate the fitness of the initial population
fitness_scores = [fitness(features) for features in population]

In [8]:
# Iterate through the generations
for generation in range(num_generations):
    # Select the parents for crossover
    parent_indices = np.random.choice(population_size, size=population_size, replace=True, p=fitness_scores/np.sum(fitness_scores))
    parents = population[parent_indices]

    # Perform crossover to create the children
    children = np.empty_like(parents)
    for i in range(population_size):
        parent1 = parents[i]
        parent2 = parents[np.random.randint(population_size)]
        crossover_point = np.random.randint(1, X.shape[1]-1)
        child = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
        children[i] = child

    # Perform mutation on some of the children
    for i in range(population_size):
        if np.random.rand() < mutation_rate:
            mutation_point = np.random.randint(X.shape[1])
            children[i, mutation_point] = 1 - children[i, mutation_point]

    # Evaluate the fitness of the children
    children_fitness_scores = [fitness(features) for features in children]

    # Replace the old population with the new population of children
    population = children
    fitness_scores = children_fitness_scores

In [10]:
# Select the best feature subset from the final population
best_features = population[np.argmax(fitness_scores)]

In [11]:
# Train and evaluate a classifier using the selected features
X_train_subset = X_train[:, best_features.astype(bool)]
X_test_subset = X_test[:, best_features.astype(bool)]
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_subset, y_train)
accuracy = clf.score(X_test_subset, y_test)

In [12]:
print("Selected features: ", best_features)
print("Accuracy: ", accuracy)

Selected features:  [0 0 0 0 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 1]
Accuracy:  0.9649122807017544
