# Read The problem data file:

In [52]:
import numpy as np
from numba import cuda, jit, int32, float32, int64
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
from math import pow, hypot, ceil
import random
import sys

In [53]:
class vrp():
    def __init__(self, capacity=None):
        self.capacity = capacity
        self.nodes = np.zeros((1,4), dtype=np.float32)
    def addNode(self, label, demand, posX, posY):
        newrow = np.array([label, demand, posX, posY], dtype=np.float32)
        self.nodes = np.vstack((self.nodes, newrow))

# Read the problem data file
def readInput():
	# Create VRP object:
    vrpManager = vrp()
	## First reading the VRP from the input ##
    print('Reading data file...', end=' ')
    fo = open('/home/conda_user/GA_VRP/test_set/P/P-n16-k8.vrp',"r")
    lines = fo.readlines()
    for i, line in enumerate(lines):
        while line.upper().startswith('CAPACITY'):
            inputs = line.split()
            vrpManager.capacity = np.float32(inputs[2])
			# Validating positive non-zero capacity
            if vrpManager.capacity <= 0:
                print(sys.stderr, 'Invalid input: capacity must be neither negative nor zero!')
                exit(1)
            break       
        while line.upper().startswith('NODE_COORD_SECTION'):
            i += 1
            line = lines[i]
            while not (line.upper().startswith('DEMAND_SECTION') or line=='\n'):
                inputs = line.split()
                vrpManager.addNode(np.int16(inputs[0]), 0.0, np.float32(inputs[1]), np.float32((inputs[2])))
                # print(vrpManager.nodes)
                i += 1
                line = lines[i]
                while (line=='\n'):
                    i += 1
                    line = lines[i]
                    if line.upper().startswith('DEMAND_SECTION'): break 
                if line.upper().startswith('DEMAND_SECTION'):
                    i += 1
                    line = lines[i] 
                    while not (line.upper().startswith('DEPOT_SECTION')):                  
                        inputs = line.split()
						# Validating demand not greater than capacity
                        if float(inputs[1]) > vrpManager.capacity:
                            print(sys.stderr,
							'Invalid input: the demand of the node %s is greater than the vehicle capacity!' % vrpManager.nodes[0])
                            exit(1)
                        if float(inputs[1]) < 0:
                            print(sys.stderr,
                            'Invalid input: the demand of the node %s cannot be negative!' % vrpManager.nodes[0])
                            exit(1)                            
                        vrpManager.nodes[int(inputs[0])][1] =  float(inputs[1])
                        i += 1
                        line = lines[i]
                        while (line=='\n'):
                            i += 1
                            line = lines[i]
                            if line.upper().startswith('DEPOT_SECTION'): break
                        if line.upper().startswith('DEPOT_SECTION'):
                            vrpManager.nodes = np.delete(vrpManager.nodes, 0, 0)                          
                            print('Done.')
                            return(vrpManager.capacity, vrpManager.nodes)

# Calculate cost table

In [54]:
## Calculate cost table:
@cuda.jit
def calc_cost_gpu(data_d, popsize, vrp_capacity, cost_table_d):
    threadId_row, threadId_col = cuda.grid(2)
    
#     data_d[threadId_row,0] = data_d[threadId_row,0] - 1
    
####ceil() is used instead of round() as the latter crashes the kernel.
####This causes +1 values in some cost distances

    if (threadId_row <= data_d.shape[0]-1) and (threadId_col <= data_d.shape[0]-1):
        cost_table_d[threadId_row, threadId_col] = ceil(hypot(data_d[threadId_row,2] - data_d[threadId_col,2],\
                                                              data_d[threadId_row,3] - data_d[threadId_col,3]))
#     popArr = initializePop(data, popsize, vrp_capacity, cost_table)

# Calculate fitness

In [55]:
@cuda.jit
def fitness_gpu(cost_table_d, pop, fitness_val_d):
    threadId_row, threadId_col = cuda.grid(2)
    fitness_val_d[threadId_row, 0] = 0
    pop[threadId_row, pop.shape[1]-1] = 1
    
    if threadId_col == ((pop.shape[1]-2)/2)-1:
        for i in range(pop.shape[1]):
            fitness_val_d[threadId_row, 0] += \
            cost_table_d[pop[threadId_row, i]-1, pop[threadId_row, i+1]-1]

        pop[threadId_row, pop.shape[1]-1] = fitness_val_d[threadId_row,0]
    
    cuda.syncthreads()

# Adjust individuals:

In [56]:
@cuda.jit(device=True)
def add_missing_nodes(r_flag, missing_d, missing_elements, pop):
    # Add missing nodes to every single individual:
    threadId_row, threadId_col = cuda.grid(2)    

    if threadId_col == ((pop.shape[1]-2)/2)-1:
        missing_elements[threadId_row,0] = True
        
        for i in range(missing_d.shape[1]):
            if missing_d[threadId_row, i] != 0:
                missing_elements[threadId_row, 0] = True
                for j in range(2, pop.shape[1]):
                    if pop[threadId_row, j] == r_flag:
                        pop[threadId_row, j] = missing_d[threadId_row, i]
                        missing_d[threadId_row, i] = 0
                        break
                    else:
                        missing_elements[threadId_row, i] = False

        if not missing_elements[threadId_row, 0]:
        # shift individual's elements to the left for every removed '1':
            for i in range(pop.shape[1], 0, -1):
                if pop[threadId_row, i] == r_flag:
                    for j in range(i, pop.shape[1]-1):
                        new_val = pop[threadId_row, j]
                        pop[threadId_row, j] = new_val
    cuda.syncthreads()

In [57]:
@cuda.jit(device=True)
def cap_adjust(vrp_capacity, data_d, pop):
    
    threadId_row, threadId_col = cuda.grid(2)
    
    if threadId_row <= pop.shape[0]-1 and threadId_col == ((pop.shape[1]-2)/2)-1:
#     if threadId_row <= pop.shape[0]-1 and threadId_col <= pop.shape[1]-1:
        reqcap = 0.0        # required capacity
        for i in range(2, pop.shape[1]-1):
            if pop[threadId_row, i] > 1:
                reqcap += data_d[pop[threadId_row, i]-1, 1] # index starts from 0 while individuals start from 1
                if reqcap > vrp_capacity:
                    # Shift individual's elements to the right for every inserted '1':
                    new_val = 1
                    rep_val = pop[threadId_row, i]

                    for j in range(i, pop.shape[1]-1):
                        pop[threadId_row, j] = new_val
                        new_val = rep_val
                        rep_val = pop[threadId_row, j+1]
                    reqcap = 0.0                    
            else:
                reqcap = 0.0
    cuda.syncthreads()

In [58]:
@cuda.jit(device=True)
def adjust_gpu(data_d, vrp_capacity, cost_table_d, missing_d, missing_elements,\
               pop, fitness_val_d):
    
    threadId_row, threadId_col = cuda.grid(2)
    # Remove duplicated elements from every single individual/row in population array:
    r_flag = 1 # A flag for removal/replacement

#    if threadId_row <= pop.shape[0]-1 and threadId_col <= pop.shape[1]-1\
#    and threadId_col > 1:
                    
#        for i in range(threadId_col-1, 1, -1):
#             if pop[threadId_row, threadId_col] != 0\
#             and pop[threadId_row, threadId_col] == pop[threadId_row, i]:
                
#                 pop[threadId_row, threadId_col] = r_flag 

    if threadId_row <= pop.shape[0]-1 and threadId_col == ((pop.shape[1]-2)/2)-1:
        for i in range(2, pop.shape[1]):
            for j in range(2, pop.shape[1]):
        
                if pop[threadId_row, i] != 0 \
                and pop[threadId_row, i] == pop[threadId_row, j] and i !=j:

                    pop[threadId_row, i] = r_flag 
                    break

        for i in range(1, data_d.shape[0]):
            for j in range(1, pop.shape[1]-1):
                if data_d[i,0] == pop[threadId_row,j]:
                    missing_d[threadId_row, i] = 0
                    break
                else:
                    missing_d[threadId_row, i] = data_d[i,0]
        
    add_missing_nodes(r_flag, missing_d, missing_elements, pop)   
    cap_adjust(vrp_capacity, data_d, pop)
    cuda.syncthreads()

# Initialize population:

In [59]:
@cuda.jit
def initializePop_gpu(rng_states, data_d, vrp_capacity, cost_table_d, missing_elements,\
                      missing_d, pop_d, fitness_val_d):
    
    threadId_row, threadId_col = cuda.grid(2)

    # Generate the individuals from the nodes in data_d:
    if threadId_col > 1 and threadId_col <= data_d.shape[0]:
        pop_d[threadId_row, threadId_col] = data_d[threadId_col-1, 0]
        
    pop_d[threadId_row, 0], pop_d[threadId_row, 1] = 1, 1
        
    # Randmly shuffle each individual on a separate thread:   
    col = 0
    if threadId_row <= pop_d.shape[0]-1 and threadId_col > 1 and threadId_col <= data_d.shape[0]:
        while col == 0:
            rnd = (xoroshiro128p_uniform_float32(rng_states, threadId_row*threadId_col)*(data_d.shape[0]-2))
            col = int(rnd)+2

        pop_d[threadId_row, threadId_col], pop_d[threadId_row, col] =\
        pop_d[threadId_row, col], pop_d[threadId_row, threadId_col]

    # Adjust individuals using adjust_gpu function:
    adjust_gpu(data_d, vrp_capacity, cost_table_d, missing_d, missing_elements,\
               pop_d, fitness_val_d)

# Evolve population

In [60]:
@cuda.jit(device=True)
def cut_points(pool_d, rng_states):
    
    threadId_row, threadId_col = cuda.grid(2)
    stride = 4
    row = threadId_row*stride
    no_cuts = pool_d[row+2, 2]
    
    if threadId_row < pool_d.shape[0] and threadId_col == ((pool_d.shape[1]-2)//2)-1:
        for i in range(1, no_cuts+1):
            rnd_val = 0
            rnd = xoroshiro128p_uniform_float32(rng_states, threadId_row)*(pool_d[row + 2, 1]-2)
            rnd_val = int(rnd)+2
            pool_d[row + 3, i+1] = rnd_val
#             pool_d[row + 3, threadId_col] = threadId_col
#     pool_d[row + 3, 0] = row
#     if threadId_col > 0:

    #if threadId_col == pool_d.shape[1]-15:
#     

            
#          # the four lines should be replaced by one line of a random number
        #pool_d[row + 3, i+1] = 7
        #pool_d[row + 3, i+2] = 5
        #pool_d[row + 3, i+3] = 12
        #break

# Crossover points (i.e., pool_d[row + 3,:]) must be sorted:           
#     if threadId_col == pool_d.shape[1]-2:
#    for i in range(1, no_cuts+1):
#        min_val = pool_d[row + 3, i]
#        min_index = i

#        for j in range(i + 1, no_cuts+1):
            # Select the smallest value
#            if pool_d[row + 3, j] < pool_d[row + 3, min_index]:
#                min_index = j

#        pool_d[row + 3, min_index], pool_d[row + 3, i] = pool_d[row + 3, i], pool_d[row + 3, min_index]

    cuda.syncthreads()
# ----------------------------------------------------------------------------------------------------------

@cuda.jit(device=True)
def swap_indices(pool_d, j):
    
    threadId_row, threadId_col = cuda.grid(2)
    stride = 4
    row = threadId_row*stride  
    
    pool_d[row, threadId_col], pool_d[row+1, threadId_col] =\
    pool_d[row+1, threadId_col], pool_d[row, threadId_col]
# ----------------------------------------------------------------------------------------------------------
@cuda.jit(device=True)
def assign_rnd(pool_d, pop_d, row_idx, rnd_val):
    
    threadId_row, threadId_col = cuda.grid(2)
    if threadId_row < pop_d.shape[0] and threadId_col > 1 and threadId_col < pool_d.shape[1]:
        pool_d[row_idx, threadId_col] = pop_d[rnd_val, threadId_col]
        pool_d[row_idx, 0] = rnd_val
    
    cuda.syncthreads()
# ----------------------------------------------------------------------------------------------------------
@cuda.jit(device=True)    
def cross_over(generations, popsize, opt, vrp_capacity, rng_states\
               , cost_table_d, cut_idx_d, pool_d, new_pop_d, pop_d):
    threadId_row, threadId_col = cuda.grid(2)
    stride = 4
    row = threadId_row*stride
        
    if threadId_row < pop_d.shape[0] and threadId_col < pop_d.shape[1] and threadId_col > 1:
    #   Create a pool of 4 randomly selected individuals:
        rnd_val = 0
        #rnd = xoroshiro128p_uniform_float32(rng_states, threadId_row)*(popsize-1)
        #rnd_val = int(rnd)
        assign_rnd(pool_d, pop_d, row + 0, rnd_val)
        
        
        rnd_val = 0
        #rnd = xoroshiro128p_uniform_float32(rng_states, threadId_row)*(popsize-1)
        #rnd_val = int(rnd)        
        assign_rnd(pool_d, pop_d, row + 1, rnd_val)
        
        rnd_val = 0
        #rnd = xoroshiro128p_uniform_float32(rng_states, threadId_row)*(popsize-1)
        #rnd_val = int(rnd)        
        assign_rnd(pool_d, pop_d, row + 2, rnd_val)
        
        rnd_val = 0
        #rnd = xoroshiro128p_uniform_float32(rng_states, threadId_row)*(popsize-1)
        #rnd_val = int(rnd)        
        assign_rnd(pool_d, pop_d, row + 3, rnd_val)

    # Selecting 2 parents with the binary tournament
    # The first two pool_d rows are re-assigned as parents:
    # ----------------------------1st Parent--------------------------------------------------
'''            
    if pool_d[row, -1] < pool_d[row + 1, -1]:
        pass
    else:
        pool_d[row, threadId_col] = pool_d[row + 1, threadId_col]

# ----------------------------2nd Parent--------------------------------------------------
    if pool_d[row + 2, -1] < pool_d[row + 3, -1]:
        pool_d[row + 1, threadId_col] = pool_d[row + 2, threadId_col]
    else:
        pool_d[row + 1, threadId_col] = pool_d[row + 3, threadId_col]

    pool_d[row + 2, -1] = 1
    pool_d[row + 3, -1] = 1
    if threadId_col < pop_d.shape[1]-1 and threadId_col > 1:
        pool_d[row + 2, threadId_col] = 0
        pool_d[row + 3, threadId_col] = 0

    # Performing Two-Point crossover and generating two children:
    # Calculate the actual length of parents, put it in pool_d
    if threadId_col == ((pop_d.shape[1]-2)/2)-1:
        for i in range(0, pop_d.shape[1]-2):
            if not (pool_d[row, i] == 1 and pool_d[row, i+1] == 1):
                pool_d[row + 2, pool_d.shape[1]-2] += 1
                
            if not (pool_d[row+1, i] == 1 and pool_d[row+1, i+1] == 1):
                pool_d[row + 3, pool_d.shape[1]-2] += 1

        # Minimum length of the two parents
        pool_d[row + 2, 1] = \
        min(pool_d[row+2, pool_d.shape[1]-2], pool_d[row+3, pool_d.shape[1]-2]) 

        # Select (n/5 - 1) random cutting points for crossover based on the shortest parent
        pool_d[row + 2, 2] = pool_d[row + 2, 1]//5 - 1 # number of cutting points
    
    if threadId_col < pop_d.shape[1]-1 and threadId_col > 1:
        cut_points(pool_d, rng_states)
    no_cuts = pool_d[row+2, 2]

    if no_cuts%2 == 1: # Number of cuts is odd
        for j in range(1, no_cuts+1):
            if threadId_col <= pool_d[row + 3, j] and threadId_col > pool_d[row + 3, j-1] and j%2 == 1:    
                swap_indices(pool_d, j)
    else:
        for j in range(1, no_cuts+1):
            if (threadId_col <= pool_d[row + 3, j] and threadId_col > pool_d[row + 3, j-1] and j%2 == 1) or\
            (threadId_col > pool_d[row + 3, j] and j == no_cuts):    
                swap_indices(pool_d, j)
                '''
# ----------------------------------------------------------------------------------------------------------
@cuda.jit(device=True)
def mutate(pool_d):
    
    threadId_row, threadId_col = cuda.grid(2)
    stride = 4
    row = threadId_row*stride
# Mutation: swapping two positions in the children, with 1:40 probability
    mutation_prob = 40
#     if random.randint(1, mutation_prob) == 1:
    
    if threadId_col == 1:    
# Repeat random selection if depot was selected:    
        i1 = 1
        while pool_d[row, i1] == 1:
            i1 = 4
    #         i1 = random.randint(0, len(ptomutate) - 2)

        i2 = 1
        while pool_d[row, i2] == 1:
            i2 = 15
    #         i2 = random.randint(0, len(ptomutate) - 2)


        pool_d[row, i1], pool_d[row, i2] = pool_d[row, i2], pool_d[row, i1]

    # Repeat for the second child:    
        i1 = 1
        while pool_d[row+1, i1] == 1:
            i1 = 4
    #         i1 = random.randint(0, len(ptomutate) - 2)

        i2 = 1
        while pool_d[row+1, i2] == 1:
            i2 = 16
    #         i2 = random.randint(0, len(ptomutate) - 2)

        pool_d[row+1, i1], pool_d[row+1, i2] = pool_d[row+1, i2], pool_d[row+1, i1]
        cuda.syncthreads()
# ----------------------------------------------------------------------------------------------------------

@cuda.jit
def evolvePop_gpu(count, generations, popsize, opt, vrp_capacity, rng_states, data_d, cost_table_d,\
                  cut_idx_d, pool_d, new_pop_d, pop_d, missing_d, missing_elements, fitness_val_d):
    
    # nodes represent the row/column index in the cost table
    threadId_row, threadId_col = cuda.grid(2)
    stride = 4
    row = threadId_row*stride    
    
    if threadId_col > 0:
        pool_d[row + 0, threadId_col] = 1
        pool_d[row + 1, threadId_col] = 1
        pool_d[row + 2, threadId_col] = 1
        pool_d[row + 3, threadId_col] = 1
    
    if threadId_row < pool_d.shape[0] and threadId_col < pop_d.shape[1] and threadId_col > 1:
        cross_over(generations, popsize, opt, vrp_capacity, rng_states, cost_table_d, cut_idx_d, pool_d,\
                   new_pop_d, pop_d)
#         mutate(pool_d)
        
#         new_pop_d[threadId_row, threadId_col] = pool_d[row, threadId_col]
        
#         adjust_gpu(data_d, vrp_capacity, cost_table_d, missing_d, missing_elements,\
#            new_pop_d, zeroed_indiv_d, fitness_val_d)
        
#     if threadId_col == 0:
#         new_pop_d[threadId_row, 0] = count

#     # Running the genetic algorithm
#     run_time = timer()
#     stucking_indicator = 0
#     for i in tqdm(range(iterations)):
#         old_best = pop[0][-1]
#         nextPop = []
#         nextPop_set = set()

#         elite_count = len(pop)//20      
#         sorted_pop = pop.copy()

# # Apply two-opt for the new top 5% individuals:
#         for idx, individual in enumerate(sorted_pop[:elite_count]):
#             if individual[0] >= i:
#                 sorted_pop[idx], cost = two_opt.two_opt(individual[1:-1], cost_table)
#                 sorted_pop[idx].append(9999)
#                 fitness_value = fitness(cost_table, sorted_pop[idx][:-1])
#                 sorted_pop[idx][-1] = (fitness_value)
#                 sorted_pop[idx].insert(0,individual[0])
        
#         sorted_pop.sort(key= lambda elem: elem[-1])
#         pop = sorted_pop.copy()
        
#         start_evolution_timer = timer()
# # terminate if optimal is reached or runtime exceeds 1h
#         if ((sorted_pop[0][-1] + extended_cost) > opt) and (timer() - run_time <= 60):
#             nextPop = sorted_pop[:elite_count] # top 5% of the parents will remain in the new generation         




# # Adjusting individuals               
#                 child1 = adjust(np.asarray(child1, dtype=np.float32), np.asarray(vrp_data, dtype=np.float32), vrp_capacity)
#                 child2 = adjust(np.asarray(child2, dtype=np.float32), np.asarray(vrp_data, dtype=np.float32), vrp_capacity)

#                 fitness_val = fitness(cost_table, child1[:-1])
#                 child1[-1] = fitness_val
                
#                 fitness_val = fitness(cost_table, child2[:-1])
#                 child2[-1] = fitness_val

#                 child1 = list(child1)
#                 child2 = list(child2)

# # Insert generation number at the beginning of every individual:
#                 child1.insert(0, i + 1)
#                 child2.insert(0, i + 1)

#                 # Add children to population iff they are better than parents
#                 if (child1[-1] < parent1[-1]) | (child1[-1] < parent2[-1]) | ((timer() - start_evolution_timer) > 30):
#                     nextPop_set.add(tuple(child1))
#                     # start_evolution_timer = timer()
#                     # nextPop_set.add(tuple(parent1))
                
#                 if (child2[-1] < parent1[-1]) | (child2[-1] < parent2[-1]) | ((timer() - start_evolution_timer) > 30):
#                     nextPop_set.add(tuple(child2))
#                     # start_evolution_timer = timer()
#                     # nextPop_set.add(tuple(parent2))   
                               
#             nextPop = list(nextPop_set)

#             # Updating population generation

#             # random.shuffle(nextPop)
#             nextPop = sorted(nextPop, key= lambda elem: elem[-1])

#             if nextPop[0][-1] == old_best:
#                 stucking_indicator += 1
#             else:
#                 stucking_indicator = 0

#             pop = nextPop
#             if not (i+1) % 5: # print population every 300 generations
#                 print(f'Population at generation {i+1}:{pop}\nBest: {pop[0][-1]}')
#         elif (timer() - run_time >= 60):
#             print('Time criteria is met')
#             break
#         elif (((sorted_pop[0][-1] + extended_cost) <= opt)):
#             print('Cost criteria is met')
#             break

# Main

In [62]:
vrp_capacity, data = readInput()
popsize = 100
generations = 5
opt = 450

data_d = cuda.to_device(data)
cost_table_d = cuda.device_array(shape=(data.shape[0], data.shape[0]), dtype=np.uint16)

POP = np.ones((popsize, 2*data.shape[0]+2), dtype=np.uint16)
pop_d = cuda.to_device(POP)

zeros = np.zeros(shape=(popsize, pop_d.shape[1]), dtype=np.uint16)
missing_d = cuda.to_device(zeros)

missing = np.ones(shape=(popsize,1), dtype=np.bool)
missing_elements = cuda.to_device(missing)

fitness_val = np.zeros(shape=(popsize,1), dtype=np.uint16)
fitness_val_d = cuda.to_device(fitness_val)

np.set_printoptions(threshold=sys.maxsize)

# GPU grid configurations:
threads_per_block = (20, 20)
blocks_no = ceil(max(popsize, 2*data.shape[0]+2)/threads_per_block[0])

blocks = (blocks_no, blocks_no)
rng_states = create_xoroshiro128p_states(threads_per_block[0]**2  * blocks[0]**2, seed=1)

# --------------Calculate the cost table----------------------------------------------
calc_cost_gpu[blocks, threads_per_block](data_d, popsize, vrp_capacity, cost_table_d)

# --------------Initialize population----------------------------------------------
initializePop_gpu[blocks, threads_per_block]\
                 (rng_states, data_d, vrp_capacity, cost_table_d, missing_elements,\
                  missing_d, pop_d, fitness_val_d)

# --------------Calculate fitness----------------------------------------------
# The fitness GPU function is called from the CPU for indexing reasons.
fitness_gpu[blocks, threads_per_block](cost_table_d, pop_d, fitness_val_d)

new_pop_d = cuda.to_device(POP)
pool = np.ones(shape=(4*popsize, pop_d.shape[1]), dtype=np.uint16)
pool_d = cuda.to_device(pool)
cut_idx = np.ones(shape=(pop_d.shape[1]), dtype=np.uint16) # 9999 to be replaced by ones only
cut_idx_d = cuda.to_device(cut_idx)

# --------------Evolve population for some generations----------------------------------------------
# count = 0
# while count <= generations:
#     evolvePop_gpu[blocks, threads_per_block]\
#              (count, generations, popsize, opt, vrp_capacity, data_d, cost_table_d, \
#               cut_idx_d, pool_d, new_pop_d, pop_d, missing_d, missing_elements, fitness_val_d)
#     cuda.synchronize()
#     count += 1

# print(pop_d.copy_to_host(), end='\n-----------------------\n')
# print(cost_table_d.copy_to_host(), end='\n')
# print(pop_d.copy_to_host()[0:5,:], end='\n-----------------------\n')
# print(new_pop_d.copy_to_host()[0:5,:], end='\n-----------------------\n')
# print(pool_d.copy_to_host()[0:5,:])
# print(data_d.copy_to_host(), end='\n')
# print(missing_elements.copy_to_host()[0:5,:], end='\n')
# fitness_gpu[blocks,threads_per_block](cost_table_d, adjusted_indiv, zeroed_indiv_d, fitness_val_d)
# print(fitness_val_d.copy_to_host()[:,0])
# print(cost_table_d.copy_to_host())
###############################################################################################
# Speed test of CPU and GPU versions of the function:
# cost_table = np.zeros((data.shape[0],data.shape[0]), dtype=np.int32)
# print(calc_cost(data, popsize, vrp_capacity, cost_table).shape)
# print('CPU time:')
# %timeit calc_cost(data, popsize, vrp_capacity, cost_table)
# print('GPU time:')
#%timeit calc_cost_gpu[blocks, threads_per_block](data_d, popsize, vrp_capacity, cost_table_d)
################################################################################################

Reading data file... Done.
[[  1   1   2  14   1   5   1   7   1   8   1   9   1  10   1   3   1  12
   13   1  15  16   1   6  11   4   1   1   1   1   1   1   1 640]
 [  1   1  14  13  10   1   2   1   9   1   4   1   7   1  12   5   1  16
    6   1  15   8   1  11   1   3   1   1   1   1   1   1   1 615]
 [  1   1  11   6  12   1   5   1   8  14  13   1   4  16  10   1   7   1
   15   1   9   1   2   1   3   1   1   1   1   1   1   1   1 599]
 [  1   1   7   1   4   6   1  16  14   8   1  10   1   9  12   1   2   1
    5   1   3   1  13  15   1  11   1   1   1   1   1   1   1 631]
 [  1   1  13   1   3   1  15   1   9   1   4   6   1   8  11   1   5  12
    1  16   1   7   1   2  10  14   1   1   1   1   1   1   1 608]
 [  1   1   8   1   3   1   4   6   1   5  16   1  10  11   1   7   1   9
   14   1  13  12   1   2   1  15   1   1   1   1   1   1   1 586]
 [  1   1  15   1   7   1   4   8   1  12  11   1   5  14   1  13   2   1
   16  10   6   1   3   1   9   1   1   1   1   1   1

# sub_main

In [413]:
count = 0
while count <= generations:
    rng_states = create_xoroshiro128p_states(4*popsize, seed=1)
    evolvePop_gpu[blocks, threads_per_block]\
             (count, generations, popsize, opt, vrp_capacity, rng_states, data_d, cost_table_d, \
              cut_idx_d, pool_d, new_pop_d, pop_d, missing_d, missing_elements, fitness_val_d)
    cuda.synchronize()
    count += 1

# print(new_pop_d.copy_to_host(), end='\n-----------------------\n')
print(pop_d.copy_to_host()[:,:22], pop_d.copy_to_host()[:4,-1], end='\n-----------------------\n')
# for j in range(3,pool_d.shape[0]//4,4):
print(pool_d.copy_to_host()[:99,:22])

[[ 1  1  2 14  1  4  1  5  1  7  1 11  8  1  9  1 10  1  3  1 12 13]
 [ 1  1 14 13 10  1  6  1  9  1  8  1  7  1  2 12  1  5  1  4 15  1]
 [ 1  1  2  6  1  3  1  5  1  8 16 11  1  7  1 10 14 13 12  1  9  1]
 [ 1  1  7  1  4  6  1 16 14  1  2 12  1  9  1 10  8  1  5  1  3  1]
 [ 1  1 13  1  3  1  4  1  5 12  1 14  8 11  1  6  1  9  1 15  1  7]
 [ 1  1  8  1  3  1  4  6  1  5 10  1 16 15  1  9  1 11  1  7  1 13]
 [ 1  1  4 16  1  7  1  9 12  1 11  5  1  6  8  1  2  1  3  1 15 10]
 [ 1  1 11  4  1  9  1  5 16  1 10  6  1  7  1 14  1  3  1 12  8  1]
 [ 1  1 15 12  1  8  2  1 13  1  5  1  4  1  9  1  6 16 14  1  7  1]
 [ 1  1 10 16  1  3  1  7  1  6 12  4  1  8 14  1  2 13  1 15 11  1]
 [ 1  1  9  1  6 12  1  3  1 14  8  1  7  1 15 11 10  1 13 16  1  5]
 [ 1  1  2  1  5  1  8  6 12  1 10 15  1  9  1  4 11  1  3  1 16 14]
 [ 1  1 14  4 11  1  8 10 16  1  9  1  5  1 15 13  1  7  1  2 12  1]
 [ 1  1 16  1  7  1 14  2  1  8  1  3  1  6 11  1  5  1  4 10  1 13]
 [ 1  1  8  6 10  1 16  4 12  1  9