In [13]:
import json
import numpy as np
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
import time


from src.experiments.aux_code import *
from datasets.get_datasets import *
from src.brkga.genetic import genetic as brkga
from src.brkga_variation.genetic import genetic as brkga_var
from src.s_genetic.genetic import *
from src.predicate import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Number of independent repetitions ("rounds") executed by the final experiment loops below.
n_rounds = 5

In [15]:
def get_k_best_individuals(population, k):
    """Return the ``k`` individuals of ``population`` with the lowest fitness.

    Individuals are ranked in ascending order of ``fitness.values[0]``. The
    sort is stable, so individuals with equal fitness keep their original
    population order.

    Args:
        population: Indexable, sized collection of individuals, each exposing
            ``fitness.values[0]``.
        k: Number of individuals wanted. A non-positive value yields an empty
            list; a value larger than the population yields every individual
            instead of raising ``IndexError``.

    Returns:
        A new list holding at most ``k`` individuals, lowest fitness first.
    """
    # Rank positions rather than the individuals themselves so the population
    # only needs to support len() and integer indexing.
    ranked_positions = sorted(
        range(len(population)),
        key=lambda position: population[position].fitness.values[0],
    )
    # max() guards against a negative k turning the slice into "all but the last |k|".
    return [population[position] for position in ranked_positions[:max(k, 0)]]

In [16]:
# Predicates learned in each domain: `source` is passed as the target predicate
# when loading imdb, `target` when loading uwcse.
source = 'workedunder'
target = 'advisedby'

# The knowledge-base file stores one JSON document on its first line.
# A context manager makes sure the handle is closed after reading.
with open('src/experiments/kb.txt') as kb_file:
    bk = json.loads(kb_file.readline())
kb_source = bk['imdb']
kb_target = bk['uwcse']

pred_target = create_pred_target(kb_target)

In [17]:
def get_train_neg_pos(source, target, source_pred, target_pred):
    """Read the stored positive and negative examples of one experiment.

    Both files are looked up in
    ``groot_experiments/{source}_{target}_{source_pred}_{target_pred}/`` and
    only their first line, a JSON document, is parsed.

    Returns:
        ``(train_pos, train_neg)`` -- positives come first, despite the
        order suggested by the function name.
    """
    experiment_dir = f'groot_experiments/{source}_{target}_{source_pred}_{target_pred}'

    def _first_line_as_json(file_name):
        # Parse the JSON document stored on the first line of the given file.
        with open(f'{experiment_dir}/{file_name}', 'r') as handle:
            return json.loads(handle.readline())

    negatives = _first_line_as_json('neg.txt')
    positives = _first_line_as_json('pos.txt')
    return positives, negatives

# Load both domains with the same seed; each load yields (facts, pos, neg).
source_dataset = datasets.load(
    'imdb', kb_source, target=source, seed=441773, balanced=0)
target_dataset = datasets.load(
    'uwcse', kb_target, target=target, seed=441773, balanced=0)

# Pre-computed examples stored under groot_experiments/.
train_pos, train_neg = get_train_neg_pos('imdb', 'uwcse', source, target)

# Flatten the target facts into one list and reuse that same list object
# once per entry of train_pos.
facts = [fact for fact_group in target_dataset[0] for fact in fact_group]
train_facts = [facts for _ in range(len(train_pos))]

In [20]:
fold = 0

# Fold `fold` is kept apart as `ttrain`; the examples of every other fold are
# pooled into `test`. Facts of *all* folds (the held-out one included) are
# accumulated in test_facts.
ttrain = []
test_pos, test_neg, test_facts = [], [], []
for index in range(len(train_pos)):
    if index != fold:
        test_pos.extend(train_pos[index])
        test_neg.extend(train_neg[index])
    else:
        ttrain = [train_pos[index], train_neg[index]]
    test_facts.extend(train_facts[index])

test = [test_pos, test_neg, test_facts]
# The held-out fold shares the pooled facts list with `test`.
ttrain.append(test_facts)

train_pos_gen = [ttrain[0], test_pos]
train_neg_gen = [ttrain[1], test_neg]
train_facts_gen = test_facts

# Sanity check: the pooled facts should match the sum over the three folds.
(len(train_facts_gen),
 len(train_facts[0]) + len(train_facts[1]) + len(train_facts[2]))

(7161, 7161)

In [21]:
# ttrain was built as [positives, negatives, facts], so this should display 3.
len(ttrain)

3

In [5]:
# Load both domains again; each load yields (facts, pos, neg).
imdb_dataset = datasets.load(
    'imdb', kb_source, target=source, seed=441773, balanced=0)
uwcse_dataset = datasets.load(
    'uwcse', kb_target, target=target, seed=441773, balanced=0)

# Learn the source-domain model (10 boosted trees) on the imdb split.
train_facts_source, train_pos_source, train_neg_source = get_train_division(imdb_dataset)
background_knowledge_src = boostsrl.modes(
    kb_source, [source],
    useStdLogicVariables=False, maxTreeDepth=3, nodeSize=2, numOfClauses=8,
)
model_src = boostsrl.train(
    background_knowledge_src,
    train_pos_source, train_neg_source, train_facts_source,
    trees=10,
)

structured_src, new_src_struct = create_structured_trees(model_src)

# From here on train_pos / train_neg / train_facts refer to the uwcse data,
# replacing whatever these names held before this cell ran.
train_facts, train_pos, train_neg = uwcse_dataset[0], uwcse_dataset[1], uwcse_dataset[2]

# Testing s_genetic

In [6]:
# Candidate grids for the hyper-parameter search.
# np.linspace is used instead of np.arange + a manually appended endpoint:
# with a float step the length of np.arange depends on rounding, so the
# endpoint could end up duplicated. linspace always includes it exactly once.
mutation_rate_list = [round(x, 3) for x in np.linspace(0.001, 0.01, 10)]   # 0.001 .. 0.010
crossover_rate_list = [round(x, 2) for x in np.linspace(0.6, 0.95, 8)]     # 0.60 .. 0.95

# Dimension names must match the keys read by the objective function.
space  = [Categorical([10, 30, 50], name='num_individuals'),
          Categorical(mutation_rate_list, name='mutation_rate'),
          Categorical(crossover_rate_list, name='crossover_rate')]

In [7]:
@use_named_args(space)
def objective(**params):
    """Objective for gp_minimize: run s_genetic once with sampled hyper-parameters.

    ``params`` is read for the keys ``num_individuals``, ``mutation_rate`` and
    ``crossover_rate``; the remaining inputs are notebook-level globals.

    Returns the last entry of the second element returned by ``genetic``
    (presumably the best fitness of the final generation -- TODO confirm).

    NOTE(review): a later cell defines another function with this same name
    for BRKGA, which replaces this one once that cell runs.
    """
    outcome = genetic(
        new_src_struct, target, source,
        train_pos, train_neg, train_facts,
        kb_source, kb_target, pred_target,
        NUM_GEN=14,
        pop_size=params['num_individuals'],
        crossover=params['crossover_rate'],
        mutation=params['mutation_rate'],
        crossover_type='tree_ind',
        revision='guided',
    )
    per_generation = outcome[1]
    return per_generation[-1]


In [8]:
# Search over `space` with 10 evaluations of `objective` and a fixed seed.
res_gp = gp_minimize(objective, space, n_calls=10, random_state=0)

f"Best score={res_gp.fun:.4f}"



GENERATION:  0
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.28862920000000003, 'm_rec': 0.6155194, 'm_pred': 0.45293479999999997, 'm_f1': 0.3523716, 's_auc_pr': 0.02648161896561462, 's_auc_roc': 0.0017075744200473397, 's_cll': 0.022662028685887757, 's_rec': 0.31416578269544254, 's_prec': 0.1749249795882795, 's_f1': 0.1167764341305214}
GENERATION:  1
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.296183, 'm_rec': 1.0, 'm_pred': 0.31560689999999997, 'm_f1': 0.2316134, 's_auc_pr': 0.02648161896561462, 's_auc_roc': 0.0017075744200473397, 's_cll': 0.06006503444434207, 's_rec': 0.0, 's_prec': 0.18476269308356058, 's_f1': 0.025606993791540626}
GENERATION:  2
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.3198358, 'm_rec': 0.8762886, 'm_pred': 0.3578937, 'm_f1': 0.2780552, 's_auc_pr': 0.02648161896561462, 's_auc_roc': 0.0017075744200473397, 's_cll': 

'Best score=-0.3359'

In [15]:
# Show the best hyper-parameter values found by the search stored in res_gp.
print(f"BEST RESULT: {res_gp.x}")

BEST RESULT: [30, 0.009, 0.9]


In [10]:
# Hyper-parameters for the final s_genetic runs.
crossover_rate = 0.9
mutation_rate = 0.009
num_ind = 30
num_gen = 14  # single source for both NUM_GEN and the results directory name

# NOTE(review): `os` is not imported explicitly in this notebook; presumably it
# arrives through one of the star imports -- confirm.
for _round in range(n_rounds):
    print(f"ROUND {_round + 1}")
    res_s_genetic = genetic(new_src_struct, target, source,
                            train_pos, train_neg, train_facts,
                            kb_source, kb_target, pred_target,
                            NUM_GEN=num_gen, pop_size=num_ind,
                            mutation=mutation_rate, crossover=crossover_rate,
                            crossover_type='tree_ind', revision='guided')

    # One result dictionary per round; the per-individual keys below are
    # overwritten for each individual before being saved.
    final_results = {f'{source}->{target}': res_s_genetic}

    individuals = get_k_best_individuals(res_s_genetic[0].population, 3)

    for n_ind, individual in enumerate(individuals, start=1):
        print("INDIVIDUO ", n_ind)
        refine, transfer = get_refine_transfer(individual, source, target, 'imdb', 'uwcse')
        rrefine = [refine]
        rtransfer = [transfer]
        res = []
        inf = []
        for fold_idx in range(len(train_pos)):
            # Train on fold `fold_idx` (negatives capped at twice the number of
            # positives) and test on the union of all remaining folds.
            ttrain = []
            test_pos, test_neg, test_facts = [], [], []
            for index in range(len(train_pos)):
                if index == fold_idx:
                    ttrain = [train_pos[index],
                              train_neg[index][:2 * len(train_pos[index])],
                              train_facts[index]]
                else:
                    test_pos.extend(train_pos[index])
                    test_neg.extend(train_neg[index])
                    test_facts.extend(train_facts[index])
            test = [test_pos, test_neg, test_facts]
            res.append(test_refine_transfer(kb_target, target, refine, transfer, ttrain, test))

            # Rename the results .db file to .txt and keep its lines (without
            # newline characters). The file is presumably written by
            # test_refine_transfer -- TODO confirm.
            results_db = f'boostsrl/test/results_{target}.db'
            results_txt = os.path.splitext(results_db)[0] + ".txt"
            os.rename(results_db, results_txt)
            # Context manager so the handle is closed on every fold.
            with open(results_txt, 'r') as results_file:
                inf.append([line.replace('\n', '') for line in results_file])

        final_results[f'test:{source}->{target}'] = res
        final_results[f'refine:{source}->{target}'] = rrefine
        final_results[f'transfer:{source}->{target}'] = rtransfer
        final_results[f'inf:{source}->{target}'] = inf
        save_groot_results(f'groot_experiments/s_genetic/imdb_uwcse_{source}_{target}_{str(crossover_rate)}_{str(mutation_rate)}_{str(num_ind)}_{num_gen}', n_ind, final_results, source, target)




ROUND 1
GENERATION:  0
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.2854716, 'm_rec': 0.6155194, 'm_pred': 0.45293479999999997, 'm_f1': 0.3523716, 's_auc_pr': 0.02648161896561462, 's_auc_roc': 0.0017075744200473397, 's_cll': 0.01874678735783816, 's_rec': 0.31416578269544254, 's_prec': 0.1749249795882795, 's_f1': 0.1167764341305214}
GENERATION:  1
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.28665100000000004, 'm_rec': 0.7455194, 'm_pred': 0.40210419999999997, 'm_f1': 0.3113146, 's_auc_pr': 0.02648161896561462, 's_auc_roc': 0.0017075744200473397, 's_cll': 0.038291028280786614, 's_rec': 0.31187352407128116, 's_prec': 0.18357663485683573, 's_f1': 0.10921910838603291}
GENERATION:  2
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.29161319999999996, 'm_rec': 0.7455194, 'm_pred': 0.40210419999999997, 'm_f1': 0.3113146, 's_auc_pr': 0.02648161896561

# BRKGA

In [6]:
# Candidate grids for the BRKGA hyper-parameter search.
# np.linspace is used instead of np.arange + a manually appended endpoint:
# with a float step the length of np.arange depends on rounding, so the
# endpoint could end up duplicated. linspace always includes it exactly once.
num_elite_list = [np.around(x, 2) for x in np.linspace(0.1, 0.25, 4)]          # 0.10 .. 0.25
num_mutation_list = [np.around(x, 2) for x in np.linspace(0.1, 0.3, 5)]        # 0.10 .. 0.30
mutation_rate_list = [np.around(x, 3) for x in np.linspace(0.001, 0.01, 10)]   # 0.001 .. 0.010
crossover_rate_list = [np.around(x, 2) for x in np.linspace(0.6, 0.95, 8)]     # 0.60 .. 0.95

# Dimension names must match the keys read by the objective function; the
# order here is the order of the values in res_gp.x.
space  = [Categorical([10, 30, 50], name='num_individuals'),
          Categorical(mutation_rate_list, name='mutation_rate'),
          Categorical(crossover_rate_list, name='crossover_rate'),
          Categorical(num_elite_list, name='num_elite'),
          Categorical(num_mutation_list, name='num_mutation')]

In [7]:
@use_named_args(space)
def objective(**params):
    """Objective for gp_minimize: run BRKGA once with sampled hyper-parameters.

    ``params`` is read for the keys ``num_individuals``, ``mutation_rate``,
    ``crossover_rate``, ``num_elite`` and ``num_mutation``; the remaining
    inputs are notebook-level globals.

    Returns the last entry of the second element returned by ``brkga``
    (presumably the best fitness of the final generation -- TODO confirm).

    NOTE(review): this reuses the name of the s_genetic objective defined in
    an earlier cell and replaces it once this cell runs.
    """
    outcome = brkga(
        new_src_struct, target, source,
        train_pos, train_neg, train_facts,
        kb_source, kb_target, pred_target,
        NUM_GEN=14,
        pop_size=params['num_individuals'],
        crossover=params['crossover_rate'],
        mutation=params['mutation_rate'],
        num_elite=params['num_elite'],
        num_mutation=params['num_mutation'],
    )
    per_generation = outcome[1]
    return per_generation[-1]

In [8]:
# Search over `space` with 10 evaluations of `objective` and a fixed seed.
res_gp = gp_minimize(objective, space, n_calls=10, random_state=0)

f"Best score={res_gp.fun:.4f}"



GENERATION:  0
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.31754479999999996, 'm_rec': 0.6155194, 'm_pred': 0.45293479999999997, 'm_f1': 0.3523716, 's_auc_pr': 0.02648161896561462, 's_auc_roc': 0.0017075744200473397, 's_cll': 0.03564042269895239, 's_rec': 0.31416578269544254, 's_prec': 0.1749249795882795, 's_f1': 0.1167764341305214}
GENERATION:  1
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.3134792, 'm_rec': 0.6155194, 'm_pred': 0.45293479999999997, 'm_f1': 0.3523716, 's_auc_pr': 0.02648161896561462, 's_auc_roc': 0.0017075744200473397, 's_cll': 0.03103622969627592, 's_rec': 0.31416578269544254, 's_prec': 0.1749249795882795, 's_f1': 0.1167764341305214}
GENERATION:  2
MELHOR RESULTADO:  -0.3358574
BEST: {'m_auc_pr': 0.3358574, 'm_auc_roc': 0.936106, 'm_cll': -0.306419, 'm_rec': 0.7392308000000001, 'm_pred': 0.41064799999999996, 'm_f1': 0.3059298, 's_auc_pr': 0.02648161896561462, 's_auc_r

KeyboardInterrupt: 

In [None]:
# Show the best hyper-parameter values found by the search stored in res_gp.
print(f"BEST RESULT: {res_gp.x}")

In [18]:
# Best configuration taken from res_gp.x, read in the order used by the
# five-dimensional BRKGA `space`.
# NOTE(review): the saved output of the BRKGA search cell ends in
# KeyboardInterrupt -- confirm res_gp really holds a five-dimensional BRKGA
# result before running this cell.
best_pop_size = res_gp.x[0]
best_mutation = res_gp.x[1]
best_crossover = res_gp.x[2]
best_num_elite = res_gp.x[3]
best_num_mutation = res_gp.x[4]
num_gen = 14  # single source for both NUM_GEN and the results directory name

# NOTE(review): `os` is not imported explicitly in this notebook; presumably it
# arrives through one of the star imports -- confirm.
for _round in range(n_rounds):
    print(f"ROUND {_round + 1}")
    res_brkga = brkga(new_src_struct, target, source,
                      train_pos, train_neg, train_facts,
                      kb_source, kb_target, pred_target,
                      NUM_GEN=num_gen, pop_size=best_pop_size,
                      mutation=best_mutation, crossover=best_crossover,
                      num_elite=best_num_elite,
                      num_mutation=best_num_mutation)

    # One result dictionary per round; the per-individual keys below are
    # overwritten for each individual before being saved.
    final_results = {f'{source}->{target}': res_brkga}

    individuals = get_k_best_individuals(res_brkga[0].population, 3)

    for n_ind, individual in enumerate(individuals, start=1):
        print("INDIVIDUO ", n_ind)
        refine, transfer = get_refine_transfer(individual, source, target, 'imdb', 'uwcse')
        rrefine = [refine]
        rtransfer = [transfer]
        res = []
        inf = []
        for fold_idx in range(len(train_pos)):
            # Train on fold `fold_idx` (negatives capped at twice the number of
            # positives) and test on the union of all remaining folds.
            ttrain = []
            test_pos, test_neg, test_facts = [], [], []
            for index in range(len(train_pos)):
                if index == fold_idx:
                    ttrain = [train_pos[index],
                              train_neg[index][:2 * len(train_pos[index])],
                              train_facts[index]]
                else:
                    test_pos.extend(train_pos[index])
                    test_neg.extend(train_neg[index])
                    test_facts.extend(train_facts[index])
            test = [test_pos, test_neg, test_facts]
            res.append(test_refine_transfer(kb_target, target, refine, transfer, ttrain, test))

            # Rename the results .db file to .txt and keep its lines (without
            # newline characters). The file is presumably written by
            # test_refine_transfer -- TODO confirm.
            results_db = f'boostsrl/test/results_{target}.db'
            results_txt = os.path.splitext(results_db)[0] + ".txt"
            os.rename(results_db, results_txt)
            # Context manager so the handle is closed on every fold.
            with open(results_txt, 'r') as results_file:
                inf.append([line.replace('\n', '') for line in results_file])

        final_results[f'test:{source}->{target}'] = res
        final_results[f'refine:{source}->{target}'] = rrefine
        final_results[f'transfer:{source}->{target}'] = rtransfer
        final_results[f'inf:{source}->{target}'] = inf
        save_groot_results(f'groot_experiments/brkga/imdb_uwcse_{source}_{target}_{str(best_crossover)}_{str(best_mutation)}_{str(best_pop_size)}_{num_gen}_{str(best_num_elite)}_{str(best_num_mutation)}', n_ind, final_results, source, target)




ROUND 1




GENERATION:  0
MELHOR RESULTADO:  -0.1312138
GENERATION:  1
MELHOR RESULTADO:  -0.1312138
GENERATION:  2
MELHOR RESULTADO:  -0.1312138
GENERATION:  3
MELHOR RESULTADO:  -0.06560679999999999
GENERATION:  4
MELHOR RESULTADO:  -0.1312138
GENERATION:  5
MELHOR RESULTADO:  -0.1312138
GENERATION:  6
MELHOR RESULTADO:  -0.1312138
GENERATION:  7
MELHOR RESULTADO:  -0.1312138
GENERATION:  8
MELHOR RESULTADO:  -0.1312138
GENERATION:  9
MELHOR RESULTADO:  -0.1312138
GENERATION:  10
MELHOR RESULTADO:  -0.1312138
GENERATION:  11
MELHOR RESULTADO:  -0.1312138
GENERATION:  12
MELHOR RESULTADO:  -0.1312138
INDIVIDUO  1
INDIVIDUO  2
INDIVIDUO  3
ROUND 2
GENERATION:  0
MELHOR RESULTADO:  -0.1312138
GENERATION:  1
MELHOR RESULTADO:  -0.1312138
GENERATION:  2
MELHOR RESULTADO:  -0.1312138
GENERATION:  3
MELHOR RESULTADO:  -0.1312138
GENERATION:  4
MELHOR RESULTADO:  -0.1312138
GENERATION:  5
MELHOR RESULTADO:  -0.1312138
GENERATION:  6
MELHOR RESULTADO:  -0.1312138
GENERATION:  7
MELHOR RESULTADO:  -0.131

In [19]:
# Inspect the raw return value of the last BRKGA run: per the saved output, a
# tuple of (Population object, list of floats, list of metric dictionaries).
res_brkga

(<src.brkga.population.Population at 0x7fb051a99a58>,
 [-0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.06560679999999999,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138,
  -0.1312138],
 [{'m_auc_pr': 0.3358574,
   'm_auc_roc': 0.936106,
   'm_cll': -0.3296152,
   'm_rec': 0.29125959999999995,
   'm_pred': nan,
   'm_f1': nan,
   's_auc_pr': 0.02648161896561462,
   's_auc_roc': 0.0017075744200473397,
   's_cll': 0.016243050531227182,
   's_rec': 0.1466181807452268,
   's_prec': nan,
   's_f1': nan},
  {'m_auc_pr': 0.3358574,
   'm_auc_roc': 0.936106,
   'm_cll': -0.3296152,
   'm_rec': 0.29125959999999995,
   'm_pred': nan,
   'm_f1': nan,
   's_auc_pr': 0.02648161896561462,
   's_auc_roc': 0.0017075744200473397,
   's_cll': 0.016243050531227182,
   's_rec': 0.1466181807452268,
   's_prec': nan,
   's_f1': nan},
  {'m_auc_pr': 0.3358574,
   'm_auc_roc': 0.936106,
   'm_cll': -0.3114298,
   'm_rec':