# Gem stats analysis - symbolic regression

In [1]:
import os
import pickle
from itertools import product

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from pycgp import probabilistic_mutation, point_mutation, single_mutation
from pycgp.gems import MatchByActiveStrategy, MatchSMStrategy, MatchPMStrategy


    

In [29]:
class DataIterator():
    """Load pickled gem statistics for one experiment folder and expose them as frames.

    For every combination of (mutation, strategy), gem count and column count
    a result file is read from ``folder``. The per-run records of each file
    are summarised into one row of ``self.data``; ``iterate_gem_data`` and
    ``stats`` then expand those rows into one-row-per-gem data frames.
    """

    # Column names of ``self.data``. Note that the mean of 'gem_same_after'
    # is stored as 'safe_after' here and surfaces as 'same_after' in the
    # per-gem frames; the name is kept because ``self.data`` is public.
    _DATA_COLUMNS = ['m', 's', 'g', 'c', 'gem_data', 'better_after', 'worse_after', 'safe_after']

    # Column names of the per-gem frames yielded by ``iterate_gem_data``.
    _GEM_COLUMNS = ['gems', 'columns', 'match_checks', 'match_count', 'n_uses', 'value',
                    'match_probability', 'better_after', 'worse_after', 'same_after']

    def __init__(self, folder):
        """Read every configuration's result file from ``folder`` into ``self.data``.

        Args:
            folder: directory containing the result files (see ``__iterate_folder``
                for the expected file names).
        """
        self.mutations = [
          #  (probabilistic_mutation, MatchSMStrategy),
            (point_mutation, MatchPMStrategy),
            (single_mutation, MatchSMStrategy),
            (single_mutation, MatchByActiveStrategy),
            (probabilistic_mutation, MatchByActiveStrategy)
        ]
        self.gems = [5, 10]

        self.cols = [10, 50, 100]

        rows = []
        for m, s, g, c, runs in self.__iterate_folder(folder):
            rows.append([
                m, s, g, c,
                # one gem collection per record in the file
                [run['gem_data'] for run in runs],
                np.mean([run['gem_better_after'] for run in runs]),
                np.mean([run['gem_worse_after'] for run in runs]),
                np.mean([run['gem_same_after'] for run in runs]),
            ])
        # Passing ``columns=`` (instead of assigning afterwards) keeps the
        # frame well-formed even when no rows were collected.
        self.data = pd.DataFrame(rows, columns=self._DATA_COLUMNS)

    def __iterate_folder(self, folder):
        """Yield ``(mutation_name, strategy_name, gem, column, records)`` per configuration.

        The files carry a ``.csv`` suffix but are read with ``pickle``.
        """
        for (mutation, strategy), gem, column in product(self.mutations, self.gems, self.cols):
            file = os.path.join(folder,  f'{mutation.__name__}-{strategy.__name__}-gems{gem}-n_cols{column}.csv')
            # NOTE(security): pickle.load executes arbitrary code from the file;
            # only point this at result files you produced yourself.
            with open(file, 'rb') as fp:
                data = pickle.load(fp)

            # Yield after the file is closed so the handle is not held open
            # while the consumer processes the records.
            yield mutation.__name__, strategy.__name__, gem, column, data

    def iterate_gem_data(self, mutation, strategy, axis=False):
        """Yield one per-gem data frame for each configuration of a mutation/strategy pair.

        Args:
            mutation: mutation function name, matched against the ``m`` column.
            strategy: strategy class name, matched against the ``s`` column.
            axis: when true, a 2-row subplot grid is created and each frame is
                paired with its own matplotlib axes; otherwise ``None`` is paired.

        Yields:
            ``(frame, axes_or_None)`` where ``frame`` has the ``_GEM_COLUMNS``
            columns plus ``success_rate_counts`` and ``success_rate_checks``.
            A configuration without any gems yields an empty frame.
        """
        gdatas = self.data[(self.data.m == mutation) & (self.data.s == strategy) & (self.data.g != 0)]
        n_frames = len(gdatas)

        axes = None
        if axis and n_frames:
            # Two rows, as many columns as needed. ``squeeze=False`` always
            # returns a 2-D array, so flattening gives one axes per frame in
            # row-major order regardless of how many configurations matched.
            n_plot_cols = (n_frames + 1) // 2
            _, grid = plt.subplots(2, n_plot_cols, figsize=(8,6), squeeze=False)
            axes = grid.ravel()

        for i, (_, gdata) in enumerate(gdatas.iterrows()):
            # Flatten the per-record gem collections into one row per gem.
            rows = [
                [gdata['g'], gdata['c'], gem.match_checks, gem.match_count, gem.n_uses, gem.value,
                 gem.match_probability, gdata['better_after'], gdata['worse_after'], gdata['safe_after']]
                for run_gems in gdata['gem_data']
                for gem in run_gems
            ]

            pgdata = pd.DataFrame(rows, columns=self._GEM_COLUMNS)
            # Division by a zero match_count/match_checks yields NaN or inf.
            pgdata['success_rate_counts'] = pgdata['n_uses'] / pgdata['match_count']
            pgdata['success_rate_checks'] = pgdata['n_uses'] / pgdata['match_checks']

            yield pgdata, (axes[i] if axes is not None else None)

    def stats(self):
        """Return all per-gem frames of every mutation/strategy pair concatenated.

        Each frame is tagged with ``m`` (mutation name) and ``s`` (strategy name)
        columns. ``pd.concat`` raises ``ValueError`` if no frame was produced.
        """
        frames = []
        for m, s in self.mutations:
            for pgdata, _ in self.iterate_gem_data(m.__name__, s.__name__):
                pgdata['m'] = m.__name__
                pgdata['s'] = s.__name__
                frames.append(pgdata)
        return pd.concat(frames)

import pdb
            

# Load the gem statistics of each experiment family; every constructor call
# reads all result files of its folder eagerly.
symreg = DataIterator('scripts/symbolic_basic/')  # symbolic regression
bincls = DataIterator('scripts/bin_class_out/')   # binary classification
santaf = DataIterator('scripts/santa_fe_out/')    # Santa Fe

def densities(di, m, s):
    """Plot the ``n_uses`` distribution of every configuration and print a summary.

    Args:
        di: object providing ``iterate_gem_data(m, s, axis)`` (a ``DataIterator``).
        m: mutation name passed through to ``iterate_gem_data``.
        s: strategy name passed through to ``iterate_gem_data``.

    One distribution plot is drawn per yielded frame on the axes paired with
    it; afterwards the total number of gems and the per-``gems`` medians of
    all columns are printed.
    """
    collected = []
    for frame, target_ax in di.iterate_gem_data(m, s, True):
        sns.distplot(frame.n_uses, ax=target_ax)
        collected.append(frame)

    combined = pd.concat(collected)
    print(f'Count of gems: {len(combined)}')
    per_gem_median = combined.groupby('gems').median()
    print(per_gem_median)


In [30]:
# Mean of every per-gem column for each (mutation, strategy) pair of the
# symbolic-regression runs.
symreg.stats().groupby(['m', 's']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,gems,columns,match_checks,match_count,n_uses,value,match_probability,better_after,worse_after,same_after,success_rate_counts,success_rate_checks
m,s,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
point_mutation,MatchPMStrategy,7.995846,55.981308,17033.717549,456.238837,18.500519,0.06337,7.258395e-24,127.97269,30.744496,5.60514,0.714222,0.015125
probabilistic_mutation,MatchByActiveStrategy,7.646771,53.88454,1388.95499,32.749511,25.086106,0.06813,7.700268e-24,33.002104,30.080479,156.474022,0.828059,0.074214
single_mutation,MatchByActiveStrategy,7.826855,58.678445,911.676325,40.881979,27.054417,0.046756,6.183401e-24,57.316537,21.474417,263.779576,0.757424,0.115744
single_mutation,MatchSMStrategy,8.167239,54.879725,3542.781214,7.404353,7.153494,0.108973,7.586752e-24,35.552978,12.945361,11.049427,0.981865,0.01686


# Overall stats

## Symbolic regression

In [31]:
# Per (mutation, strategy, gem count) group: mean and non-NaN count of every
# per-gem column of the symbolic-regression runs.
sr = symreg.stats().groupby(['m','s','gems']).agg(['mean', 'count'])
sr.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,columns,columns,match_checks,match_checks,match_count,match_count,n_uses,n_uses,value,value,...,better_after,better_after,worse_after,worse_after,same_after,same_after,success_rate_counts,success_rate_counts,success_rate_checks,success_rate_checks
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count,mean,count,mean,count,mean,count,mean,count,...,mean,count,mean,count,mean,count,mean,count,mean,count
m,s,gems,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
point_mutation,MatchPMStrategy,5,51.683938,386,16459.059585,386,354.279793,386,19.163212,386,0.066657,386,...,111.564767,386,25.155959,386,3.477591,386,0.764082,382,0.017569,386
point_mutation,MatchPMStrategy,10,58.856153,577,17418.15078,577,524.44714,577,18.057192,577,0.061172,577,...,138.94922,577,34.483102,577,7.028423,577,0.680807,570,0.01349,577
probabilistic_mutation,MatchByActiveStrategy,5,51.954262,481,1363.081081,481,30.665281,481,25.12474,481,0.069503,481,...,32.448545,481,28.679002,481,148.591892,481,0.861073,451,0.080789,481
probabilistic_mutation,MatchByActiveStrategy,10,55.600739,541,1411.959335,541,34.602588,541,25.051756,541,0.066909,541,...,33.49427,541,31.326525,541,163.481978,541,0.798865,510,0.068336,538
single_mutation,MatchByActiveStrategy,5,57.674797,615,839.001626,615,36.822764,615,27.055285,615,0.050043,615,...,50.364878,615,16.659593,615,230.226341,615,0.806062,614,0.117035,615


## Binary classification

In [35]:
# Same aggregation as for `sr`, computed on the binary-classification runs.
bc = bincls.stats().groupby(['m','s','gems']).agg(['mean', 'count'])

## Santa Fe

In [36]:
# Same aggregation as for `sr`, computed on the Santa Fe runs.
sf = santaf.stats().groupby(['m','s','gems']).agg(['mean', 'count'])

In [39]:
# Side-by-side summary of the three problems: one column group per problem
# prefix (SYM / BIN / SAN), rows labelled with the (m, s, gems) index of `sr`.
# The values are taken positionally, so `bc` and `sf` are assumed to have the
# same group order as `sr` -- TODO confirm.
#
# '<prefix>-count' is the number of non-NaN `success_rate_checks` entries per
# group divided by 60 (presumably the number of runs per group -- TODO confirm).
# The mean of `success_rate_checks` ('<prefix>-sucrate') is deliberately not
# included in the table.
stats = pd.DataFrame(
    {
        f'{prep}-{label}': values
        for prep, d in zip(['SYM', 'BIN', 'SAN'], [sr, bc, sf])
        for label, values in (
            ('count', d['success_rate_checks']['count'].values/60),
            ('better', d['better_after']['mean'].values),
            ('worse', d['worse_after']['mean'].values),
            ('same', d['same_after']['mean'].values),
        )
    },
    index=sr.index,
)
#stats.groupby(['m','s']).mean()
stats

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SYM-count,SYM-better,SYM-worse,SYM-same,BIN-count,BIN-better,BIN-worse,BIN-same,SAN-count,SAN-better,SAN-worse,SAN-same
m,s,gems,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
point_mutation,MatchPMStrategy,5,6.433333,111.564767,25.155959,3.477591,5.416667,70.653692,6.966308,2.644769,5.783333,85.839337,10.336455,10.430692
point_mutation,MatchPMStrategy,10,9.616667,138.94922,34.483102,7.028423,8.05,80.647619,10.168944,4.606522,11.3,136.117526,24.823196,14.78299
probabilistic_mutation,MatchByActiveStrategy,5,8.016667,32.448545,28.679002,148.591892,7.15,7.878322,7.28042,183.446154,6.583333,7.260633,6.886709,138.927342
probabilistic_mutation,MatchByActiveStrategy,10,8.966667,33.49427,31.326525,163.481978,7.833333,8.329255,8.446277,207.29734,9.633333,6.858997,9.147578,143.980969
single_mutation,MatchByActiveStrategy,5,10.25,50.364878,16.659593,230.226341,9.6,19.18533,4.373351,266.899913,8.25,60.115524,12.422177,150.577923
single_mutation,MatchByActiveStrategy,10,13.316667,62.660625,25.175812,289.573625,10.916667,22.875496,8.203969,298.210153,13.916667,91.820441,24.878605,190.181764
single_mutation,MatchSMStrategy,5,5.333333,29.180313,8.499375,7.469062,4.883333,11.880205,1.664676,4.795904,5.516667,48.29139,9.236858,9.899698
single_mutation,MatchSMStrategy,10,9.216667,39.240597,15.518083,13.121248,8.75,17.849429,2.053048,7.49,10.65,80.863417,15.366069,15.303354


In [40]:
# Print each row of `stats` as a LaTeX table row: the gem count (third level
# of the row index) followed by every value with two decimals. The cells are
# joined dynamically so the output follows the number of columns in `stats`
# instead of a hard-coded count of placeholders.
for _, r in stats.iterrows():
    cells = [str(r.name[2])] + [f'{value:.2f}' for value in r.values]
    print(' & '.join(cells) + ' \\\\')

5 & 6.43 & 111.56 & 25.16 & 3.48 & 5.42 & 70.65 & 6.97 & 2.64 & 5.78 & 85.84 & 10.34 & 10.43 \\
10 & 9.62 & 138.95 & 34.48 & 7.03 & 8.05 & 80.65 & 10.17 & 4.61 & 11.30 & 136.12 & 24.82 & 14.78 \\
5 & 8.02 & 32.45 & 28.68 & 148.59 & 7.15 & 7.88 & 7.28 & 183.45 & 6.58 & 7.26 & 6.89 & 138.93 \\
10 & 8.97 & 33.49 & 31.33 & 163.48 & 7.83 & 8.33 & 8.45 & 207.30 & 9.63 & 6.86 & 9.15 & 143.98 \\
5 & 10.25 & 50.36 & 16.66 & 230.23 & 9.60 & 19.19 & 4.37 & 266.90 & 8.25 & 60.12 & 12.42 & 150.58 \\
10 & 13.32 & 62.66 & 25.18 & 289.57 & 10.92 & 22.88 & 8.20 & 298.21 & 13.92 & 91.82 & 24.88 & 190.18 \\
5 & 5.33 & 29.18 & 8.50 & 7.47 & 4.88 & 11.88 & 1.66 & 4.80 & 5.52 & 48.29 & 9.24 & 9.90 \\
10 & 9.22 & 39.24 & 15.52 & 13.12 & 8.75 & 17.85 & 2.05 & 7.49 & 10.65 & 80.86 & 15.37 & 15.30 \\


# Symbolic regression

In [None]:
# Exploratory cell: sets up the labelled (mutation, strategy, title) triples
# in `muts` (reused by the following cell) and inspects the combined data of
# all three problems. The standardised-distribution plot itself is disabled.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# (mutation function, matching strategy, subplot title)
muts = [
    (point_mutation, MatchPMStrategy, 'Point mutation\nMatch all'),
    (single_mutation, MatchSMStrategy, 'Single mutation\nMatch all'),
    (single_mutation, MatchByActiveStrategy, 'Single mutation\n Match active'),
    (probabilistic_mutation, MatchByActiveStrategy, 'Prob. mutation\n Match active')
]

 
fig, axs = plt.subplots(1, len(muts), figsize=(16, 3))

data = pd.concat([symreg.stats(), bincls.stats(), santaf.stats()])
#plt.style.use('seaborn')
#  for ax, (m, s, t) in zip(axs, muts):
#      d = data[(data.m == m.__name__) & (data.s ==s.__name__)]['n_uses'].fillna(0)
#      d = sc.fit_transform(d)
#      sns.distplot(d, ax=ax)
#      ax.set_title(t)
print(data[data.m == 'point_mutation'].shape)
# `Series.reshape` is no longer available in pandas; reshape the underlying
# array instead. NOTE(review): the hard-coded shape only works if `data` has
# exactly 2797 * 4 rows -- confirm against the shape printed above.
data['success_rate_checks'].values.reshape(2797,4)


In [None]:
# One `n_uses` distribution per (mutation, strategy) pair, pooled over all
# three problems. `muts` holds (mutation, strategy, title) triples, so all
# three elements must be unpacked; the title labels the subplot.
data = pd.concat([symreg.stats(), bincls.stats(), santaf.stats()])
fig, axs = plt.subplots(1, len(muts), figsize=(16, 3))
for ax, (m, s, title) in zip(axs, muts):
    selected = data[(data.m == m.__name__) & (data.s == s.__name__)]
    sns.distplot(selected['n_uses'], ax=ax)
    ax.set_title(title)
    # restrict the x range to 0..30 uses
    ax.set_xlim(0, 30)

In [None]:
# n_uses distributions per configuration: symbolic regression, single mutation, MatchSMStrategy.
densities(symreg, 'single_mutation', 'MatchSMStrategy')

In [None]:
# n_uses distributions per configuration: symbolic regression, single mutation, MatchByActiveStrategy.
densities(symreg, 'single_mutation', 'MatchByActiveStrategy')

In [None]:
# n_uses distributions per configuration: symbolic regression, probabilistic mutation, MatchByActiveStrategy.
densities(symreg, 'probabilistic_mutation', 'MatchByActiveStrategy')

In [None]:
# n_uses distributions per configuration: symbolic regression, point mutation, MatchPMStrategy.
densities(symreg, 'point_mutation', 'MatchPMStrategy')

# Binary classification

In [None]:
# n_uses distributions per configuration: binary classification, single mutation, MatchSMStrategy.
densities(bincls, 'single_mutation', 'MatchSMStrategy')

In [None]:
# n_uses distributions per configuration: binary classification, single mutation, MatchByActiveStrategy.
densities(bincls, 'single_mutation', 'MatchByActiveStrategy')

In [None]:
# n_uses distributions per configuration: binary classification, probabilistic mutation, MatchByActiveStrategy.
densities(bincls, 'probabilistic_mutation', 'MatchByActiveStrategy')

In [None]:
# n_uses distributions per configuration: binary classification, point mutation, MatchPMStrategy.
# This cell sits in the "Binary classification" section next to three `bincls`
# calls, so it uses `bincls` as well (it previously repeated the `symreg` plot
# already produced in the symbolic-regression section).
densities(bincls, 'point_mutation', 'MatchPMStrategy')