# Google Hashcode 2019 Final Round: Compiling Google

In [1]:
import numpy as np
import io
import os
import pathlib
import networkx as nx
import matplotlib.pyplot as plt
import statistics

In [2]:
config InlineBackend.figure_format = 'retina'

## Utility Functions

In [3]:
def open_input(filename):
    
    path = str(pathlib.Path().absolute()) + "/dataset/" + filename

    # Read file
    input_file = io.open(path, mode='r')
    # read all the lines 
    lines = input_file.readlines()
    input_file.close()

    # Parsing file information
    # save number of file, target and server 
    [num_file,num_target,num_server] = np.fromstring(lines[0], dtype=int, sep=' ')

    # conversion of read lines from list of string to list of lists 
    lines = [i.strip("[]\n").split(" ") for i in lines]

    # blank dicts
    time = {}
    dep = {}
    target = {}

    # build time and dep dicts
    row=1

    while row <= num_file*2:
        # save compilation and replication time
        time.update({lines[row][0] : [lines[row][1], lines[row][2]]})
        row_dep=row+1
        # check file dependencies
        if lines[row_dep][0]== "0":
            # no dependencies
            dep.update({lines[row][0]: []})
        else:
            # save file dependencies
            num_dep=int(lines[row_dep][0])
            dep.update({lines[row][0]: lines[row_dep][-num_dep:]})

        row+=2

    # build target dict     
    row=1+num_file*2

    while row <= num_target+(num_file*2):
        # save deadline and score
        target.update({lines[row][0] : [lines[row][1], lines[row][2]]})
        row+=1;

    # conversion dicts lists from string to int
    time = dict((k,list(map(int,v))) for k,v in time.items())
    target = dict((k,list(map(int,v))) for k,v in target.items())

    # generate file list
    file=list(time.keys()) # VERIICARE SE è USATO

    return num_file, num_target, num_server, file, time, dep, target

In [4]:
# #creates the matrix of execution sequences
# def create_sequence(num_server, target, dep): 
    
#     # INPUT
#     # num_serv: number of servers
#     # target: dictionary of target's files
#     # dep: dictionary of file's dependencies
    
#     # OUTPUT
#     # matrix: matrix of execution sequences
    
    
#     # number of files target
#     num_target = len(target)
#     # list of target files
#     target_keys = list(target.keys())
#     # declaration of emtpy matrix
#     matrix = []
    
   
#     matrix_dim = min(num_server,num_target)
#     # insert a target file for each matrix' row 
#     for i in range(matrix_dim):
#         row = []
#         row.append(target_keys[i])
#         matrix.append(row)
    
#     # declaration of empty list of nodes. Contains nodes that must be inserted in a matrix' row
#     nodes = []
#     # declaration of empty list of files. Contains files that have already been inserted in matrix
#     file_list = []
    
#     # continues filling the matrix with dependencies of file targets
#     for i in range(matrix_dim):
#         nodes.append(matrix[i][0])
        
#         while len(nodes) != 0:
#             dep_list = dep[nodes[0]]
            
#             for f in dep_list:
#                 # check whether the dependency file is already being compiled on a server or not
#                 if not(f in file_list):
#                     matrix[i].insert(0,f)
#                     file_list.append(f)
#                     nodes.append(f)
                    
#             nodes.pop(0)  
        
#     return matrix

In [5]:
# # creates list of target files that do not generates dependencies
# def find_target(target, dep):
#     no_dep_tar = []
    
#     for key_tar in target:
#         no_dep = True
        
#         for key_dep in dep:
#             if key_tar in dep[key_dep]:
#                 no_dep = False
#                 break
                
#         if no_dep:
#             no_dep_tar.append(key_tar)
#     return no_dep_tar

In [7]:
def get_longest_path(graph):
    # Get the longest path using negative weights and Bellman-Ford algorithm
    distance, path = nx.algorithms.shortest_paths.weighted.single_source_bellman_ford(graph, "s", weight="n_weight")
    # change the color of the edge on the path
    for key,value in path.items():
        if len(value)>1:
            for i in range(len(value)-1):
                graph[value[i]][value[i+1]]["color"] = "red"
    
    return graph, distance, path;

In [8]:
def draw_graph(graph):
    # Draw the given graph
    fig = plt.figure(figsize=(10,6))
    ax = plt.axes()
    # graph layout
    pos = nx.planar_layout(graph)
    # draw nodes
    cmap = plt.get_cmap('Pastel1')
    # https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html <----- visit here for others color maps
    colors = [cmap(i) for i in np.linspace(0, 1, len(sequence)+1)]
    for i,server in enumerate(sequence):
        nx.draw_networkx_nodes(graph, pos, ax=ax, nodelist=server, node_color=[colors[i]], alpha=1, edgecolors="black")
    nx.draw_networkx_nodes(graph, pos, ax=ax, nodelist=["s","t"], node_color=[colors[len(sequence)]], alpha=1, edgecolors="black")
    nx.draw_networkx_labels(graph, pos, ax=ax);
    # draw edges
    color = nx.get_edge_attributes(graph,'color')
    nx.draw_networkx_edges(graph, pos, ax=ax, width=1.0, alpha=0.8, edge_color=color.values())
    edge_labels = nx.get_edge_attributes(graph,'weight')
    nx.draw_networkx_edge_labels(graph, pos, ax=ax, edge_labels=edge_labels, label_pos=0.5)

    plt.show()
    
    return graph

In [9]:
def get_score(distance, target, time):
    # Compute the score of the solution
    score = 0
    good_target=[]
    bad_target=[]
    
    for file in target:
        # check if the file target is in the given solution
        if file in distance:
            d = target[file][0]
            x = -distance[file]+time[file][0]
            # check if the file is compiled before the deadline
            if x<=d:
                g = target[file][1]
                score = score + (d-x + g)
                good_target.append(file)
#                 print(file)
            else:
                bad_target.append(file)
            
    return score, good_target, bad_target

In [10]:
def write_file(distance, graph):
    
    # INPUT
    # distance: dictionary of distances for current execution sequence
    # graph: graph of current execution sequence
    
    # OUTPUT
    # creates/overwrites file in path "./output/submission_file.txt"
    

    # convert distances in positive values
    distance = dict((k,abs(v)) for k,v in distance.items())
    #convert distance dictionary to list
    distance_list = sorted(distance.items(), key=lambda x: x[1])
    
    # checking wheater output's folder exists
    path_dir=str(pathlib.Path().absolute())+ "/output"
    if not os.path.exists(path_dir):
        os.mkdir(path_dir)  
    
    #opening file
    path = path_dir + '/submission_file.txt'
    file_object = open(path, 'w')
    
    # writing file
    file_object.write(str(len(distance_list)-2)+ '\n')
    print(str(len(distance_list)-2))
    
    for i in distance_list:
        if(i[0] != 's' and i[0] != 't'):
            file = i[0]
            time = i[1]
            server = graph.nodes[file]['server']
            file_object.write(file + " " + str(server) + '\n')
            print(file + " " + str(time) + " " + str(server))

    file_object.close()

In [11]:
def build_dep_graph(dep, target):
    
    # Define the directed dependencies graph
    dep_graph = nx.DiGraph()
    
    # Add the nodes of the graph
    dep_graph.add_nodes_from(list(dep.keys()), target=False, priority=0)
    
    # Add the edges of the graph
    for file, dep_list in dep.items():
        for d in dep_list:
            dep_graph.add_edge(d, file)
    
    # Set target attribute
    for t in target:
        if target[t][0]>time[t][0]:
            dep_graph.nodes[t]['target'] = True
            dep_graph.nodes[t]['priority'] = target[t][1]/target[t][0]              
            a = 1
            b = 1
#             dep_graph.nodes[t]['priority'] = a*target[t][1]-b*target[t][0]
    return dep_graph

In [12]:
def draw_dep_graph(dep_graph):
    
    # Draw the dependencies graph
    fig = plt.figure(figsize=(10,6))
    ax = plt.axes()
    # graph layout
    pos = nx.planar_layout(dep_graph)
    # draw nodes
    cmap = plt.get_cmap('Pastel1')
    nx.draw_networkx_nodes(dep_graph, pos, ax=ax, nodelist=[n for n in dep_graph if n not in target], node_color=[cmap(1)], alpha=1, edgecolors="black")
    nx.draw_networkx_nodes(dep_graph, pos, ax=ax, nodelist=[n for n in dep_graph if n in target], node_color=[cmap(0)], alpha=1, edgecolors="black")
    nx.draw_networkx_labels(dep_graph, pos, ax=ax);
    # draw edges
    nx.draw_networkx_edges(dep_graph, pos, ax=ax, width=1.0, alpha=0.8, edge_color="black")

    plt.show()

In [13]:
def priority_mapping(node):
    return -dep_graph.nodes[node]['priority']

In [14]:
def place_files(pred, server_time, sequence):
    min_ind = server_time.index(min(server_time)) 
    
    for file in pred:
        server_time[min_ind] = server_time[min_ind] + time[file][0]
        sequence[min_ind].append(file)

In [15]:
def place_files2(pred, server_time, sequence, num_server, predecessori):
    t=pred.pop()
    num_files=[0]*num_server
    for (i,s) in enumerate(sequence):
        for f in s:
            if f in predecessori[t]:
                num_files[i]= num_files[i]+1
#                 print("in comune" + f)
#     print(t, num_files)
    pred.append(t)
       
    max_value = max(num_files)
    if max_value == 0:
        max_ind = server_time.index(min(server_time))
    else:
        max_ind=num_files.index(max(num_files))
       
    for file in pred:
        server_time[max_ind] = server_time[max_ind] + time[file][0]
        sequence[max_ind].append(file)

In [16]:
def my_ancestors(G, source):
    if not G.has_node(source):
        raise nx.NetworkXError("The node %s is not in the graph." % source)
    anc = list(n for n, d in nx.shortest_path_length(G, target=source).items())
    anc.remove(source)
    return anc

In [17]:
def compile_greedy(dep_graph, sequence, server_time, real_target, predecessori):
    
    temp_graph = dep_graph
    
    toggle = True
    
    t_comp = []
    t_rep = []

    for f in file:
        if f not in real_target:
            t_comp.append(time[f][0])
            t_rep.append(time[f][1])

    t_comp_med = statistics.median(t_comp)
    t_rep_med = statistics.median(t_rep)    
    if t_rep_med > t_comp_med:
        print("tecnica 1")
    else:
        print("tecnica 2")

    while toggle:
        
        potential_target=[]
        for t in real_target:
            if t in temp_graph:
                pred = my_ancestors(temp_graph, t)
                no_target = True

                for p in pred:
                    if p in real_target:
                        no_target = False
                        break

                if no_target:
                    potential_target.append(t)
        
        if potential_target:
            toggle = True
            
            max_priority = dep_graph.nodes[potential_target[0]]['priority']
            first_target = potential_target[0]
#             print(potential_target)
            for t in potential_target:
                if dep_graph.nodes[t]['priority']>max_priority:
                    max_priority = dep_graph.nodes[t]['priority']
                    first_target = t        
            
            pred = my_ancestors(temp_graph, first_target)
            
            if pred:
#                 t_comp = []
#                 t_rep = []

#                 for f in predecessori[first_target]:
#                     t_comp.append(time[f][0])
#                     t_rep.append(time[f][1])

#                 t_comp_med = statistics.median(t_comp)
#                 t_rep_med = statistics.median(t_rep)

                if t_rep_med > t_comp_med:
#                     print("tecnica 1")
                    pred.reverse()
                    pred.append(first_target)

#                     place_files2(pred, server_time, sequence, num_server, predecessori) # place alternativa
                    place_files(pred, server_time, sequence)
                    temp_graph.remove_nodes_from([n for n in temp_graph if n in set(pred)])
                    
#                     for p in pred:
#                         print(p)
#                         file.remove(p)
                else:
#                     print("tecnica 2")
                    for d in dep[first_target]:
                        if d in temp_graph:
                            pred = my_ancestors(temp_graph, d)
                            pred.reverse()
                            pred.append(d)

#                             for p in pred:
#                                 file.remove(p)
                            place_files(pred, server_time, sequence)
                            temp_graph.remove_nodes_from([n for n in temp_graph if n in set(pred)])
                    
#                     place_files2([first_target], server_time, sequence, num_server, predecessori)
                    place_files([first_target], server_time, sequence) # place alternativa
                    temp_graph.remove_node(first_target) 
#                     file.remove(first_target)
            else:
#                 place_files2([first_target], server_time, sequence, num_server, predecessori)
                place_files([first_target], server_time, sequence)
                temp_graph.remove_node(first_target) # place alternativa
#                 file.remove(first_target)
               
        else: 
            toggle = False            

In [6]:
def build_graph(time, dep, sequence):
    # Define the directed graph of the files
    graph = nx.DiGraph()

    # Add the nodes of the graph
    for i,s in enumerate(sequence):
        graph.add_nodes_from(s, server=i+1)

    # Add the edges of the graph
    for s in sequence:    
        # add an edge between files on the same server
        for i in range(len(s)-1):
            w = time[s[i]][0]
            graph.add_edge(s[i], s[i+1], weight=w, n_weight=-w, color="black")
        # add dipendencies edges for each file on a server
        for i in range(len(s)):       
            dep_files = dep[s[i]]
            # check that the files has dipendencies
            if dep_files: 
                for f in dep_files:
                    # check that the file is on an other server
                    if f not in s:
                        w = time[f][0]+time[f][1]
                        graph.add_edge(f, s[i], weight=w, n_weight=-w, color="black")

    # Add two fake nodes
    graph.add_node("s", server=-1)
    graph.add_node("t", server=-1)

    for node in list(graph.nodes):
        if (node!="s" and node!="t"):
            # if node doesn't have in-edges, add edge s -> node
            if not graph.in_edges(node):
                graph.add_edge("s", node, weight=0, n_weight=0, color="black")
            # if node doesn't have out-edges, add edge node -> t
            elif not graph.out_edges(node):
                w = time[node][0]
                graph.add_edge(node, "t", weight=w, n_weight=-w, color="black")    
    
    return graph;

In [18]:
def compile_tabu(target):
    temp_target = target
    
    i = 0
    
    # da mettere nel main ed aggiustare
    predecessori = {}
#     dep_graph = build_dep_graph(dep, temp_target)
#     real_target = [t for t in temp_target if target[t][0]>time[t][0]]
#     for file in real_target:
#         predecessori[file] = my_ancestors(dep_graph,file)
    
    
    while True:
        real_target = [t for t in temp_target if target[t][0]>time[t][0]]
        
        dep_graph = build_dep_graph(dep, temp_target)
        
        server_time=[0]*num_server
        sequence = [[] for _ in range(num_server)]
        compile_greedy(dep_graph, sequence, server_time, real_target, predecessori)
        
        graph = build_graph(time, dep, sequence)

        graph, distance, path = get_longest_path(graph)
        score, good_target, bad_target = get_score(distance, target, time)
        
        print(score, len(good_target), len(bad_target))
#         print(sequence)
#         if i==0:
#             print(distance["ce7"])
        
        if not bad_target or i>50:
            break
        
        target_distance = {t:d for t,d in distance.items() if t in bad_target}
        target_tabu = max(target_distance, key=target_distance.get)
        temp_target.pop(target_tabu)
        i = i+1

# Main Algorithm

In [23]:
# num_file, num_target, num_server, file, time, dep, target = open_input("a_example.in")
# num_file, num_target, num_server, file, time, dep, target = open_input("b_narrow.in")
num_file, num_target, num_server, file, time, dep, target = open_input("c_urgent.in")
# num_file, num_target, num_server, file, time, dep, target = open_input("d_typical.in")
# num_file, num_target, num_server, file, time, dep, target = open_input("e_intriguing.in")
# num_file, num_target, num_server, file, time, dep, target = open_input("f_big.in")
compile_tabu(target)

tecnica 2
1063647 5 15
tecnica 2
1084257 5 14
tecnica 2
1109863 5 13
tecnica 2
1128129 5 12
tecnica 2
1191158 8 8
tecnica 2


KeyboardInterrupt: 

In [20]:
real_target = [t for t in target if target[t][0]>time[t][0]]

somma = 0
for rt in real_target:
    somma = somma + target[rt][1]
    
print(somma)

992418


In [21]:
dep_graph = build_dep_graph(dep, target)
# draw_dep_graph(dep_graph)

In [22]:
server_time=[0]*num_server
sequence = [[] for _ in range(num_server)]
compile_greedy(dep_graph, sequence, server_time, real_target)

TypeError: compile_greedy() missing 1 required positional argument: 'predecessori'

In [None]:
graph = build_graph(time, dep, sequence)
# graph = draw_graph(graph)

In [None]:
graph, distance, path = get_longest_path(graph)
# graph = draw_graph(graph)

In [None]:
score = get_score(distance, target, time)
score

In [None]:
file

# DOUBTS
* Ci sono dei file non target che non servono a nessun file target:
* Ci sono dei target che hanno tempo di compilazione maggiore della deadline

In [None]:
print(len(target))
real_target = [t for t in target if target[t][0]>time[t][0]]
print(len(real_target))