# In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the ovarian cancer dataset from disk (replace 'OC.csv' with your file path)
dataset = pd.read_csv('OC.csv')

# The target is taken from the last column; the features are assumed to be
# the 'M/Z' and 'Intensity' columns.
y = dataset.iloc[:, -1].values
X = dataset.loc[:, ['M/Z', 'Intensity']].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Define the genetic algorithm functions
def initialize_population(population_size, n_features):
    """Draw a random binary population for the genetic algorithm.

    Each row is one chromosome and each column is one feature flag, sampled
    uniformly from {0, 1} using NumPy's global random state.

    Args:
        population_size: Number of chromosomes (rows) to generate.
        n_features: Number of feature flags (columns) per chromosome.

    Returns:
        Integer array of shape ``(population_size, n_features)`` holding
        only zeros and ones.
    """
    shape = (population_size, n_features)
    # The upper bound is exclusive, so [0, 2) yields just 0 and 1.
    return np.random.randint(0, 2, size=shape)

def fitness(population, X_train, X_test, y_train, y_test):
    """Score every chromosome by the error of a model trained on its features.

    For each chromosome, the columns whose bit is truthy are selected from
    ``X_train`` / ``X_test``, a ``RandomForestRegressor`` (100 trees,
    ``random_state=42``) is fitted on the training columns, and the mean
    squared error of its predictions on the evaluation columns is recorded.

    A chromosome that selects no feature is repaired by switching on one
    randomly chosen bit (drawn from NumPy's global random state). The repair
    is written back into the chromosome when it is mutable, so the returned
    score always describes the feature set the chromosome actually encodes
    and callers never end up holding an all-zero "best" individual.

    Args:
        population: Iterable of binary chromosomes, each of length
            ``X_train.shape[1]``. Rows of a 2-D NumPy array are repaired
            in place when they are all zero.
        X_train: 2-D feature array used for fitting.
        X_test: 2-D feature array used for scoring. Pass a validation split
            here if the real test set must stay unseen during the search.
        y_train: Targets matching ``X_train``.
        y_test: Targets matching ``X_test``.

    Returns:
        1-D ``np.ndarray`` with one mean squared error per chromosome, in
        population order (lower is better).
    """
    errors = []
    # Identical mask + identical data + fixed seed give an identical forest,
    # so each distinct mask only needs to be fitted once per call.
    error_by_mask = {}

    for chromosome in population:
        # Always a fresh boolean copy, so editing the mask never aliases the input.
        mask = np.array(chromosome, dtype=bool)

        # Ensure at least one feature is selected
        if not mask.any():
            repair_idx = np.random.randint(mask.size)
            mask[repair_idx] = True
            try:
                # Persist the repair so the chromosome matches what was scored.
                chromosome[repair_idx] = 1
            except (TypeError, ValueError):
                # Immutable or read-only chromosome: score the repaired mask
                # anyway, which is what the caller got before this fix.
                pass

        key = tuple(mask.tolist())
        if key not in error_by_mask:
            X_train_selected = X_train[:, mask]
            X_test_selected = X_test[:, mask]

            # Train a regressor (Random Forest Regressor in this case)
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train_selected, y_train)

            # Evaluate the model
            y_pred = model.predict(X_test_selected)
            error_by_mask[key] = mean_squared_error(y_test, y_pred)

        errors.append(error_by_mask[key])

    return np.array(errors)

# Genetic Algorithm Parameters
population_size = 10
n_features = X_train.shape[1]
n_generations = 20
mutation_rate = 0.1

# Carve a validation split out of the training data for the GA's fitness
# evaluations. Scoring candidates on X_test and then reporting the final error
# on that same X_test would make the reported error optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

# Initialize population
population = initialize_population(population_size, n_features)

# Evolutionary loop
# NOTE(review): select_parents, crossover and mutate are not defined in this
# part of the file; they are presumably defined earlier -- confirm.
for generation in range(n_generations):
    # Evaluate fitness on the validation split (lower error is better)
    fitness_scores = fitness(population, X_fit, X_val, y_fit, y_val)

    # Select parents
    parents = select_parents(population, fitness_scores)

    # Copy the two parents before `population` is overwritten, in case they
    # are views into it. Assumes each parent is a chromosome of length
    # n_features -- TODO confirm against select_parents.
    elite = np.array([parents[0], parents[1]])

    # Create offspring through crossover
    offspring = [crossover(parents[0], parents[1]) for _ in range(population_size - 2)]

    # Mutate offspring
    offspring = [mutate(child, mutation_rate) for child in offspring]

    # Create next generation: the two parents survive unchanged (elitism) and
    # the offspring fill the remaining slots. Previously rows 0 and 1 were
    # never written, so the first two initial individuals persisted forever.
    population[:2] = elite
    population[2:] = offspring

# Select the best individual from the final population
final_scores = fitness(population, X_fit, X_val, y_fit, y_val)
best_individual = population[np.argmin(final_scores)]

# Train the final model using the best features
selected_features = [bool(bit) for bit in best_individual]
if not any(selected_features):
    # An empty mask would select zero columns and make the fit fail;
    # fall back to using every feature.
    selected_features = [True] * len(selected_features)
X_train_final = X_train[:, selected_features]
X_test_final = X_test[:, selected_features]

final_clf = RandomForestRegressor(n_estimators=100, random_state=42)
final_clf.fit(X_train_final, y_train)

# Evaluate the final model on the held-out test set, which the search never saw
y_pred_final = final_clf.predict(X_test_final)
final_error = mean_squared_error(y_test, y_pred_final)

print(f"Final Model Mean Squared Error: {final_error}")



# Output: Final Model Mean Squared Error: 0.000839805109560141
