# Challenge 2

## Imports

In [436]:
# Data Manipualtion
import numpy as np
import pandas as pd
pd.options.display.float_format = "{:,.3f}".format
from collections import defaultdict
from itertools import combinations

# Auxilary
from tqdm import tqdm

# Machine Learning
import sklearn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Genetic Algorithm
import pygad

## Load Data

In [437]:
# Load Data
df = pd.read_csv('dataset_B.csv')

# Split
X = df.drop(columns=['fermentation_time', 'coagulation_quality'])
fermentation_time = df['fermentation_time']
coagulation_quality = df['coagulation_quality']

# Scale data
mms = defaultdict(lambda: MinMaxScaler())
X = pd.DataFrame(mms['X'].fit_transform(X), columns=X.columns)
fermentation_time = mms['ft'].fit_transform(fermentation_time.to_numpy().reshape(-1, 1))
coagulation_quality = mms['cq'].fit_transform(coagulation_quality.to_numpy().reshape(-1, 1))

## Machine Learning

**Fermentation Time**

In [438]:
%%time
# Pipeline to standardize then run the classifier
R_ft =  Pipeline([("scaler", MinMaxScaler()),
                  ("rf", GradientBoostingRegressor())])

# Grid with parameters to be tested via CV
R_ft_param_grid_ = {'rf__max_depth': [3, 4, 5, 6],
                    'rf__min_samples_leaf': [1, 2, 3, 4],
                    'rf__ccp_alpha': np.logspace(-3, 0, 4)}

# Instantiate GridSearchCV using accuracy as the scorer
R_ft_gridCV = GridSearchCV(R_ft, R_ft_param_grid_, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')

# Run GridSearchCV
R_ft_gridCV = R_ft_gridCV.fit(X, fermentation_time.ravel())

Wall time: 10.8 s


**Coagulation Quality**

In [439]:
%%time
# Pipeline to standardize then run the classifier
R_cq =  Pipeline([("scaler", MinMaxScaler()),
                  ("rf", GradientBoostingRegressor())])

# Grid with parameters to be tested via CV
R_cq_param_grid_ = {'rf__max_depth': [3, 4, 5, 6],
                    'rf__min_samples_leaf': [1, 2, 3, 4],
                    'rf__ccp_alpha': np.logspace(-3, 0, 4)}

# Instantiate GridSearchCV using accuracy as the scorer
R_cq_gridCV = GridSearchCV(R_cq, R_cq_param_grid_, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')

# Run GridSearchCV
R_cq_gridCV = R_cq_gridCV.fit(X, coagulation_quality.ravel())

Wall time: 4.26 s


## Optimize Genetic Algorithm Hyperparameters

In [440]:
_parent_selection_type = ['sss', 'sus', 'tournament']
_crossover_type = ['single_point', 'scattered']
_crossover_probability = [0.33, 0.66]
_mutation_type = ['random', 'swap']
_mutation_probability = [0.2, 0.5, 0.8]

In [None]:
best_params = defaultdict(lambda: None)
min_error = float('inf')

for _ct in _crossover_type:
    for _cp in _crossover_probability:

        error = 0

        # Test parameter combination on all test data
        for idx, row in df.iterrows():
            # Targets
            target_ft = row['fermentation_time']
            target_cq = row['coagulation_quality']

            # Scale
            scaled_target_ft = mms['ft'].transform([[target_ft]])[0][0]
            scaled_target_cq = mms['cq'].transform([[target_cq]])[0][0]

            def fitness_func(solution, solution_idx):
                # Fermentation time
                pred_ft = R_ft_gridCV.predict([solution])[0]
                ft_error = -(scaled_target_ft - pred_ft)**2 # negative squared error for maximization problem
                # Coagulation Quality
                pred_cq = R_cq_gridCV.predict([solution])[0]
                cq_error = -(scaled_target_cq - pred_cq)**2 # negative squared error for maximization problem
                return ft_error + cq_error

            # Genetic Algorithm
            ga_instance = pygad.GA(num_generations = 200,
                                   num_parents_mating = 5,
                                   fitness_func = fitness_func,
                                   sol_per_pop = 10,
                                   num_genes = X.shape[1],
                                   init_range_low = 0.0,
                                   init_range_high = 1.0,
                                   gene_space = {'low': 0, 'high': 1},
                                   parent_selection_type = 'sss',
                                   keep_parents = 1,
                                   crossover_type = _ct,
                                   crossover_probability = _cp,
                                   mutation_type = 'adaptive',
                                   mutation_probability = (0.8, 0.2),
                                   save_best_solutions = True,
                                   suppress_warnings = True)
            ga_instance.run()

            # Get solution and predicted values
            last_generation = ga_instance.last_generation_offspring_mutation
            last_generation = np.concatenate([ga_instance.best_solution()[0].reshape(1, -1), last_generation])
            pred_fts = R_ft_gridCV.predict(last_generation)
            pred_cqs = R_ft_gridCV.predict(last_generation)

            # Error
            error += sum((pred_fts - scaled_target_ft)**2 + (pred_cqs - scaled_target_cq)**2)

        if error < min_error:
            best_params['_ct'] = _ct
            best_params['_cp'] = _cp
            best_params['error'] = error
            min_error = error

        print(f'ct: {_ct} | cp: {_cp} | error: {error}')

ct: single_point | cp: 0.33 | error: 68.3944031062464
ct: single_point | cp: 0.66 | error: 61.591207011819314


## Genetic Algorithm

In [None]:
def recommend(target_ft, target_cq):
    # Transform inputs
    scaled_target_ft = mms['ft'].transform([[target_ft]])[0][0]
    scaled_target_cq = mms['cq'].transform([[target_cq]])[0][0]

    # Define fitness
    def fitness_func(solution, solution_idx):
        # Fermentation time
        pred_ft = R_ft_gridCV.predict([solution])[0]
        ft_error = -(scaled_target_ft - pred_ft)**2 # negative squared error for maximization problem
        # Coagulation Quality
        pred_cq = R_cq_gridCV.predict([solution])[0]
        cq_error = -(scaled_target_cq - pred_cq)**2 # negative squared error for maximization problem
        return ft_error + cq_error

    # Instantiate and run Genetic Algorithm
    ga_instance = pygad.GA(num_generations = 200,
                   num_parents_mating = 5,
                   fitness_func = fitness_func,
                   sol_per_pop = 10,
                   num_genes = X.shape[1],
                   init_range_low = 0.0,
                   init_range_high = 1.0,
                   gene_space = {'low':0, 'high':1},
                   parent_selection_type = 'sss',
                   keep_parents = 1,
                   crossover_type = best_params['_ct'],
                   crossover_probability = best_params['_cp'],
                   mutation_type = 'adaptive',
                   mutation_probability = (0.8, 0.2),
                   save_best_solutions = True,
                   suppress_warnings = True)
    ga_instance.run()

    # Get solution and predicted values
    last_generation = ga_instance.last_generation_offspring_mutation
    last_generation = np.concatenate([ga_instance.best_solution()[0].reshape(1, -1), last_generation])
    pred_fts = R_ft_gridCV.predict(last_generation)
    pred_cqs = R_ft_gridCV.predict(last_generation)

    # Convert back to original scale
    denormalized_Xs = mms['X'].inverse_transform(last_generation)
    denormalized_fts = mms['ft'].inverse_transform([pred_fts])
    denormalized_cqs = mms['cq'].inverse_transform([pred_cqs])

    # Merge 
    df_X = pd.DataFrame(denormalized_Xs)
    df_ft = pd.DataFrame(denormalized_fts).T
    df_cq = pd.DataFrame(denormalized_cqs).T
    output = pd.concat([df_X, df_ft, df_cq], axis='columns')
    output.columns = df.columns

    return output

## Usage

In [None]:
target_ft = 155
target_cq = 1.9

recommend(target_ft, target_cq)

In [None]:
ga_instance.