In [1]:
import numpy as np
from utils_predictions_manipulation import*
from utils_nans_manipulation import*
from cross_validation import*
from utils_data_loading import*
from utils_features_manipulation import*
import random

## Load data

In [2]:
# Load the training set; the second value returned by load_data is not needed here.
traindata, _ = load_data('Data/train.csv')
X_total, Y_total = structure_data(traindata)

# Replacing undefined data (encoded as -999) with NaNs
X_nans = replace_bad_data_with_nans(X_total, -999)

# Fill the NaNs with medians.
# NOTE(review): `threshold=0.5` and the returned `col` are defined by the helper
# (presumably a NaN-ratio cut-off and the affected column indices) - confirm.
X_nans, col = replace_nans_with_median(X_nans, threshold=0.5)

# Add a tiny uniform jitter in [0, 1e-4) to every entry.
# NOTE(review): presumably this breaks exact ties / collinear columns before the
# least-squares fits - confirm.
# The jitter comes from a dedicated, seeded generator so that re-running the
# notebook reproduces exactly the same feature matrix (and therefore the same
# scores); the previous unseeded np.random.rand call made every run differ.
jitter_rng = np.random.default_rng(1)
X_nans = X_nans + jitter_rng.random((X_nans.shape[0], X_nans.shape[1])) / 10000

In [7]:
# Baseline: cross-validated least squares on the cleaned features as they are,
# without any polynomial expansion.
dtmp_tr, dtmp_te = cross_validation(
    Y_total,
    X_nans,
    k_fold=4,
    seed=1,
    function_name='least_squares',
)

# Show the first of the two returned scores as the cell output.
dtmp_tr

0.733888

## Setup genetic pool

In [3]:
# Candidate exponents: the reciprocals 1/30 ... 1/2 followed by the integers 1 ... 30.
reciprocal_degrees = [1 / d for d in (30, 20, 10, 5, 4, 3, 2)]
integer_degrees = [1, 2, 3, 4, 5, 10, 20, 30]
degree_test = reciprocal_degrees + integer_degrees

# Every combination of candidate exponents, as produced by the project helper.
deg_all = all_combinations_list(degree_test)


In [4]:
# One index per column of the cleaned feature matrix.
n_columns = X_nans.shape[1]
index_test = [*range(n_columns)]

# Every combination of column indices, as produced by the project helper.
ind_all = all_combinations_list(index_test)


In [75]:

# Genetic search over feature expansions.
# An individual is a pair [expression, genes]; both are handed to build_poly_index
# together with X_nans, and the resulting matrix is scored with 4-fold
# cross-validated least squares. The best individuals of each day seed the pools
# that the next day's population is drawn from.


def _top_indices(scores, count):
    """Return the indices of the `count` highest scores, lowest of them first.

    Unlike the bare slice `np.argsort(scores)[-count:]`, this returns an empty
    selection for `count <= 0` (the slice `[-0:]` would return *every* index).
    If `count` exceeds the number of scores, all indices are returned.
    """
    order = np.argsort(scores)
    return order[max(len(order) - max(count, 0), 0):]


# Seed the stdlib generator so that re-running this cell reproduces the same search.
random.seed(1)

days = 20  # number of generations

size = 30  # individuals per generation
champ_surv = 10 #Can be greater than size
# Always keep at least one survivor: size // 10 is 0 for size < 10, which used to
# let the whole population through the selection step.
pop_surv = max(1, size // 10)
mutations = size // 10

complexity_deg = 10000
complexity_ind = 10000

# Fresh genetic material; sliced once here instead of on every mutation.
fresh_deg = deg_all[0:complexity_deg]
fresh_ind = ind_all[0:complexity_ind]

# The pools start as the fresh material. They are rebound to new lists below
# before anything is appended, so fresh_deg / fresh_ind are never modified.
deg_pool = fresh_deg
ind_pool = fresh_ind
champions = []

for day in range(days):
    print("\n Day:", day)

    #Generate population
    population = []
    for _ in range(size):
        expression = random.choice(ind_pool)
        genes = [random.choice(deg_pool) for _ in index_test]
        population.append([expression, genes])

    #Test reproductive success
    # NOTE(review): fitness is the first value returned by cross_validation
    # (dtmp_tr). If that is the training score, selecting on dtmp_te may
    # generalise better - confirm against cross_validation.
    success = []
    for individual in population:
        X_poly = build_poly_index(X_nans, individual[0], individual[1])
        dtmp_tr,dtmp_te = cross_validation(Y_total, X_poly, k_fold=4, seed=1, function_name='least_squares')
        success.append(dtmp_tr)
        print(len(individual[0]), len(individual[1]), dtmp_tr)
    print("Average", np.mean(success))

    #Select best performing member for posterity
    best_score = max(success)
    champions.append([best_score, population[success.index(best_score)]])

    #Select top 10% individuals
    survive = _top_indices(success, pop_surv)
    survivors = [population[live] for live in survive]

    #Extract their features
    ind_pool = [individual[0] for individual in survivors]
    deg_pool = [gene for individual in survivors for gene in individual[1]]

    #Add fresh material to the gene pool
    for _ in range(mutations):
        deg_pool.append(random.choice(fresh_deg)) #Mutation number much smaller than deg_pool size
        ind_pool.append(random.choice(fresh_ind))

    #Select best champions:
    if len(champions)>champ_surv:
        best_champs = _top_indices([ind[0] for ind in champions], champ_surv)
        champions = [champions[i] for i in best_champs]
    print(len(champions))

    #Add champion values to pool
    ind_pool.extend([ind[1][0] for ind in champions])
    deg_pool.extend([gene for ind in champions for gene in ind[1][1]])


 Day: 0
4 24 0.657332
4 24 0.657332
3 24 0.7390853333333334
4 24 0.7511506666666667
4 24 0.657332
Average 0.6924464
1

 Day: 1
4 24 0.7511026666666667
4 24 0.7489466666666666
4 24 0.657332
3 24 0.7380733333333334
4 24 0.7518813333333334
Average 0.7294672
2

 Day: 2
4 24 0.7525133333333334
4 24 0.7520146666666666
4 24 0.7528466666666668
4 24 0.7520399999999999
4 24 0.749852
Average 0.7518533333333334
3

 Day: 3
4 24 0.7502759999999999
4 24 0.7473599999999999
4 24 0.7505626666666667
4 24 0.7510706666666667
4 24 0.7527946666666667
Average 0.7504128
4

 Day: 4
4 24 0.7521493333333333
4 24 0.751264
4 24 0.7526826666666666
4 24 0.750272
4 24 0.7489239999999999
Average 0.7510584
5

 Day: 5
4 24 0.7495799999999999
4 24 0.752888
4 24 0.750812
4 24 0.7515306666666666
4 24 0.7521146666666667
Average 0.7513850666666666
6

 Day: 6
4 24 0.751724
4 24 0.7472346666666667
4 24 0.7514746666666667
4 24 0.7521306666666666
4 24 0.7514066666666667
Average 0.7507941333333333
7

 Day: 7
4 24 0.75231333333333

In [77]:
# Each champion is a [score, individual] pair; list only the scores, one per line.
for score, _individual in champions:
    print(score)

0.7526506666666668
0.7526826666666666
0.7527186666666666
0.7527280000000001
0.7527946666666667
0.7528466666666668
0.752888
0.753
0.753056
0.753116
