# In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Load the raw table from the CSV file in the current working directory.
dataset = pd.read_csv('Inputdata.csv')

# Features: the twelve columns at positions 3 through 14 (inclusive).
X = dataset.iloc[:, list(range(3, 15))].values

# Target: the column at position 2, taken as a 1-D vector. Indexing with a
# scalar (rather than a one-element list) avoids an (n, 1) column array,
# which scikit-learn estimators warn about when it is passed as `y`.
y = dataset.iloc[:, 2].values
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows, then cut that hold-out in half, which gives a
# 70% / 15% / 15% train / validation / test partition.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Apply standard scaling. The scaler is fitted on the training split only and
# then reused, so validation/test statistics never influence the transform.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Fitness function for GA
def fitness_function(params, X_train, y_train, X_val, y_val):
    """Score one GA individual as the validation MSE of a random forest.

    Args:
        params: Sequence of at least three numbers, read in order as
            ``n_estimators``, ``max_depth`` and ``max_features``. Each value
            is truncated to an integer with ``int()``.
        X_train: Feature matrix used to fit the forest.
        y_train: Targets used to fit the forest. A 2-D array with a single
            column is flattened to 1-D before fitting.
        X_val: Feature matrix used for scoring.
        y_val: Targets used for scoring.

    Returns:
        Mean squared error of the fitted forest's predictions on
        ``X_val`` against ``y_val`` (lower is better).
    """
    n_estimators = int(params[0])
    max_depth = int(params[1])
    max_features = int(params[2])

    # An (n, 1) target makes scikit-learn emit a DataConversionWarning on
    # each fit. This function runs once per individual per generation, so
    # flatten a single-column target up front instead.
    y_fit = np.asarray(y_train)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    # Fixed random_state: the same hyperparameters always give the same score.
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  random_state=42)
    model.fit(X_train, y_fit)
    predictions = model.predict(X_val)

    return mean_squared_error(y_val, predictions)

# Initialize population
def initialize_population(pop_size, dim, lb, ub):
    """Draw a random starting population between the given bounds.

    Args:
        pop_size: Number of individuals (rows of the result).
        dim: Number of genes per individual (columns of the result).
        lb: Per-gene lower bounds (array-like).
        ub: Per-gene upper bounds (array-like).

    Returns:
        Array of shape ``(pop_size, dim)`` in which every gene is a uniform
        random sample scaled from ``[0, 1)`` onto its ``lb``..``ub`` range.
    """
    lower = np.array(lb)
    upper = np.array(ub)
    unit_samples = np.random.rand(pop_size, dim)
    # Stretch the unit samples to each gene's width, then shift to its floor.
    return lower + (upper - lower) * unit_samples

# Select parents based on fitness
def select_parents(population, fitness, num_parents):
    """Pick the ``num_parents`` individuals with the smallest fitness.

    Args:
        population: 2-D array with one individual per row.
        fitness: One fitness value per row of ``population``; smaller
            values are ranked first.
        num_parents: Number of rows to keep.

    Returns:
        A new array containing the selected rows, ordered from lowest to
        highest fitness.
    """
    ranking = np.argsort(fitness)
    chosen = ranking[:num_parents]
    return population[chosen, :]

# Perform crossover between parents
def crossover(parents, offspring_size):
    """Create offspring by single-point crossover of consecutive parents.

    Child ``k`` takes its leading genes from parent ``k % num_parents`` and
    its trailing genes from parent ``(k + 1) % num_parents``. The cut
    position is drawn at random for every child.

    Args:
        parents: 2-D array with one parent per row.
        offspring_size: ``(num_offspring, num_genes)`` shape of the result.

    Returns:
        Float array of shape ``offspring_size`` holding the children. With
        fewer than two genes there is no interior cut point, so each child
        is a copy of its first parent.
    """
    num_offspring, num_genes = offspring_size[0], offspring_size[1]
    offspring = np.zeros(offspring_size)
    num_parents = parents.shape[0]
    for k in range(num_offspring):
        first_parent = parents[k % num_parents]
        second_parent = parents[(k + 1) % num_parents]
        if num_genes < 2:
            # randint(1, num_genes) needs low < high; with a single gene
            # there is nothing to recombine.
            offspring[k, :] = first_parent[:num_genes]
            continue
        # Cut strictly inside the chromosome so both parents contribute.
        cut = np.random.randint(1, num_genes)
        offspring[k, :cut] = first_parent[:cut]
        offspring[k, cut:] = second_parent[cut:]
    return offspring

# Perform mutation
def mutation(offspring, lb, ub, mutation_rate=0.2):
    """Randomly reset at most one gene in each individual, in place.

    For every row, a uniform draw below ``mutation_rate`` triggers a
    mutation: one randomly chosen gene is replaced by a fresh uniform sample
    scaled onto that gene's ``lb``..``ub`` range.

    Args:
        offspring: 2-D array of individuals; modified in place.
        lb: Per-gene lower bounds, indexable by gene position.
        ub: Per-gene upper bounds, indexable by gene position.
        mutation_rate: Probability that a given individual is mutated.

    Returns:
        The same ``offspring`` array that was passed in.
    """
    # Iterating a 2-D array yields row views, so writes land in `offspring`.
    for individual in offspring:
        if np.random.rand() < mutation_rate:
            gene = np.random.randint(offspring.shape[1])
            low, high = lb[gene], ub[gene]
            individual[gene] = low + (high - low) * np.random.rand()
    return offspring

# Genetic Algorithm for optimization
def genetic_algorithm(fobj, X_train, y_train, X_val, y_val, lb, ub, dim, pop_size, num_generations, num_parents_mating):
    """Minimise ``fobj`` over a bounded search space with a genetic algorithm.

    Each generation scores every individual, keeps the ``num_parents_mating``
    best ones unchanged, and refills the rest of the population with mutated
    crossover offspring of those parents.

    Args:
        fobj: Callable ``fobj(individual, X_train, y_train, X_val, y_val)``
            returning a fitness value; lower is better.
        X_train, y_train, X_val, y_val: Passed through to ``fobj`` untouched.
        lb: Per-gene lower bounds.
        ub: Per-gene upper bounds.
        dim: Number of genes per individual.
        pop_size: Number of individuals in the population.
        num_generations: Number of evaluate/breed iterations.
        num_parents_mating: Number of top individuals kept as parents.

    Returns:
        Tuple ``(best_solution, best_fitness, convergence_curve)``:
        the best individual seen (``None`` if no individual ever scored below
        infinity), its fitness, and a list with the best fitness so far
        after each generation.
    """
    population = initialize_population(pop_size, dim, lb, ub)
    best_solution = None
    best_fitness = float('inf')
    convergence_curve = []

    for _generation in range(num_generations):
        fitness = np.array([fobj(ind, X_train, y_train, X_val, y_val) for ind in population])
        best_idx = np.argmin(fitness)

        if fitness[best_idx] < best_fitness:
            best_fitness = fitness[best_idx]
            # Copy the row: selecting a single row gives a view, and the
            # population array is overwritten in place further down, which
            # would otherwise silently change the stored best solution.
            best_solution = population[best_idx, :].copy()

        convergence_curve.append(best_fitness)
        parents = select_parents(population, fitness, num_parents_mating)
        offspring_crossover = crossover(parents, (pop_size - parents.shape[0], dim))
        offspring_mutation = mutation(offspring_crossover, lb, ub)
        # Parents survive unchanged in the leading rows; offspring fill the rest.
        population[:parents.shape[0], :] = parents
        population[parents.shape[0]:, :] = offspring_mutation

    return best_solution, best_fitness, convergence_curve

# GA run settings
num_generations = 100
pop_size = 40
num_parents_mating = 20  # top individuals kept and bred each generation

# Hyperparameter search bounds. Positions correspond, in order, to
# n_estimators, max_depth, max_features.
lb = [100, 1, 1]    # lower bounds
ub = [200, 20, 10]  # upper bounds
dim = len(lb)       # number of hyperparameters (3)

# Search for the hyperparameters with the lowest validation error.
best_solution, best_fitness, convergence_curve = genetic_algorithm(
    fobj=fitness_function,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    lb=lb,
    ub=ub,
    dim=dim,
    pop_size=pop_size,
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
)

# Genes are continuous, so they are truncated to integers for reporting.
print(f"Optimized hyperparameters: n_estimators={int(best_solution[0])}, max_depth={int(best_solution[1])}, max_features={int(best_solution[2])}")
print(f"Minimum validation error: {best_fitness}")

# Refit a forest with the selected hyperparameters on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
best_regressor = RandomForestRegressor(
    random_state=42,
    max_features=int(best_solution[2]),
    max_depth=int(best_solution[1]),
    n_estimators=int(best_solution[0]),
).fit(X_train, y_train)

# Score the refitted model on the held-out test split.
test_score = best_regressor.score(X_test, y_test)
print(f"R-squared on test set with best parameters: {test_score}")