In [1]:
import pickle
import os 
from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and newer releases no longer accept the old name; use whichever one this
# matplotlib installation provides so the cell survives Restart & Run All.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from pycgp import single_mutation, point_mutation, probabilistic_mutation
from pycgp.gems import MatchByActiveStrategy, MatchPMStrategy, MatchSMStrategy

from utils import load_data, aggregate_statistics, plot_fitnesses, plot_distributions

This notebook interprets the results of a basic run of symbolic regression: each gem is applied up to 30 times before it expires, the worst gem in the jewellery box is replaced, and the population consists of 5 individuals.

# Aggregate statistics

In [2]:
# (mutation operator, gem match strategy) pairs covered by this analysis;
# the same list is reused later when plotting distributions.
mutations = [
        (single_mutation, MatchSMStrategy),
        (point_mutation, MatchPMStrategy),
        (probabilistic_mutation, MatchSMStrategy),
        (single_mutation, MatchByActiveStrategy),
        (probabilistic_mutation, MatchByActiveStrategy)]

# Directory handed to the utils helpers; presumably holds the raw experiment
# outputs -- TODO confirm against utils.aggregate_statistics.
folder = 'scripts/santa_fe_out'
# Build the summary table for the listed pairs (one row per configuration,
# judging by the displayed output).
data = aggregate_statistics(folder, mutations)
# Preview the first rows only instead of dumping the whole frame.
data.head()

Unnamed: 0,mutation,strategy,gems,columns,best,mean,std,avg_gem_count,gem_better,gem_worse,bf,mf
0,single_mutation,MatchSMStrategy,0,10,-27.8,-17.21,9.078754,0.0,0.0,0.0,"[-1.75, -1.95, -2.15, -2.4, -2.5, -2.9, -2.95,...","[-0.61, -1.21, -1.4999999999999998, -1.5999999..."
1,single_mutation,MatchSMStrategy,0,50,-69.05,-57.6,-29.549113,0.0,0.0,0.0,"[-3.05, -3.6, -4.2, -4.5, -5.25, -5.5, -5.8, -...","[-1.0799999999999998, -2.6899999999999995, -3...."
2,single_mutation,MatchSMStrategy,0,100,-67.45,-57.66,-15.636148,0.0,0.0,0.0,"[-3.4, -3.9, -4.45, -4.9, -5.3, -5.85, -6.0, -...","[-1.21, -3.06, -3.13, -3.8300000000000005, -4...."
3,single_mutation,MatchSMStrategy,5,10,-37.3,-22.73,-21.457154,6.05,93.35,25.0,"[-1.5, -1.75, -2.05, -2.2, -2.5, -2.75, -2.9, ...","[-0.5599999999999999, -1.2400000000000002, -1...."
4,single_mutation,MatchSMStrategy,5,50,-75.2,-60.0,-58.388219,5.3,27.95,19.2,"[-2.85, -3.8, -4.05, -4.15, -4.65, -5.35, -5.7...","[-0.9399999999999998, -2.9699999999999998, -2...."


In [3]:
# Convert the 'best' column to a numeric dtype so it takes part in the
# aggregations below. Bracket access is used because attribute-style column
# assignment is discouraged by pandas and silently creates a plain attribute
# when the column name is misspelled.
data['best'] = pd.to_numeric(data['best'])
# Average every numeric statistic per gem limit. numeric_only=True skips the
# string columns (mutation, strategy, bf, mf); without it pandas >= 2.0
# raises TypeError instead of dropping them.
data.groupby('gems').mean(numeric_only=True)

Unnamed: 0_level_0,best,mean,std,avg_gem_count,gem_better,gem_worse
gems,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-53.606667,-33.517333,-19.70035,0.0,0.0,0.0
5,-56.003333,-34.855333,-33.262479,6.226667,38.193333,67.66
10,-55.61,-34.954667,-32.652012,10.98,60.646667,87.643333


Overall, gems brought a slight deterioration of the results. Although the standard deviation of the generation containing the best solution is slightly better, the mean of that generation is lower.

Let's have a look at performance according to mutation and match strategy used.

In [None]:
# Mean of the numeric statistics for each (mutation, strategy) combination.
# numeric_only=True keeps the remaining string columns (bf, mf) out of the
# aggregation; pandas >= 2.0 raises TypeError on them otherwise.
data.groupby(['mutation', 'strategy']).mean(numeric_only=True)

The best performance was achieved by point mutation with its simple strategy. Right after that is single mutation with the match-by-all strategy. Let's have a look at point mutation.

In [None]:
# Rows for point mutation only; the last two columns (bf, mf) are left out of the display.
data.loc[data['mutation'].eq('point_mutation')].iloc[:, :-2]

With the exception of point mutation with 5 max gems and 10 columns, all runs were able to achieve the optimal solution. In terms of stability of results, the best result was achieved using 10 gems on the largest graph (100 columns), with the highest mean (88.2) and the lowest deviation (0.44).

An interesting observation is the worse mean after using gems on small graphs, indicating that gems are useful only on large graphs (in this case): both the 5-gem limit and the 10-gem limit were more successful only on individuals that are 50 and 100 columns long.

Let's have a look at a worse-performing mutation and the effect of gems in this case:

In [None]:
# Rows for probabilistic mutation combined with MatchByActiveStrategy;
# the last two columns (bf, mf) are left out of the display.
data.loc[
    data['mutation'].eq('probabilistic_mutation')
    & data['strategy'].eq('MatchByActiveStrategy')
].iloc[:, :-2]

Here we can observe generally worse performance when using gems. Overall, it seems that large perturbations are not suitable for this task.

Let's see the average performance by mutation type, without using gems:

In [None]:
# Baseline: runs without gems (gems == 0), averaged per mutation type.
# numeric_only=True skips the string columns (strategy, bf, mf), which make
# pandas >= 2.0 raise TypeError in mean().
data[data['gems'] == 0].groupby('mutation').mean(numeric_only=True)

The best performance was achieved by single mutation. Now let's have a look at the average for the runs that use gems:

In [None]:
# Runs that use gems (gems != 0), averaged per mutation type.
# numeric_only=True skips the string columns (strategy, bf, mf), which make
# pandas >= 2.0 raise TypeError in mean().
data[data['gems'] != 0].groupby('mutation').mean(numeric_only=True)

Here, point mutation achieves the best performance, and the other two types suffer a slight deterioration.

Let's plot the average best fitness of probabilistic and single mutations.

In [None]:
# Fitness plot for the probabilistic mutation runs.
# NOTE(review): (0, -89) is presumably the fitness range shown on the plot --
# confirm against utils.plot_fitnesses.
plot_fitnesses(data, 'probabilistic_mutation', (0, -89))

In [None]:
# Fitness plot for the single mutation runs.
# NOTE(review): (0, -89) is presumably the fitness range shown on the plot --
# confirm against utils.plot_fitnesses.
plot_fitnesses(data, 'single_mutation', (0, -89))

In [None]:
# Fitness plot for the point mutation runs.
# NOTE(review): (0, -89) is presumably the fitness range shown on the plot --
# confirm against utils.plot_fitnesses.
plot_fitnesses(data, 'point_mutation', (0, -89))

In [None]:
# Distribution plots for the listed (mutation, strategy) pairs, using the
# same folder and pair list as the aggregate table.
# NOTE(review): which quantity is plotted is decided inside
# utils.plot_distributions -- confirm there.
plot_distributions(folder, mutations)

# Conclusion

Gems were able to improve performance in almost all instances of experiment. Best performance overall was achieved by 