# Set Parameters
- Attention = True
- Teacher Forcing Ratio = 0.5
- Dropout = 0.2
- Layers = 4
- Batch size = 128
- Learning rate = 0.001
- Hidden units = 300
- Epochs = 20
- Data = ids
- Data type = raw

# Import packages

Import the packages used in the experiments.

In [1]:
import os
import argparse
import logging
import sys
import numpy as np
import json
import math
import time
import sqlite3
import regex as re
from tqdm import tqdm

import torch
from torch.optim.lr_scheduler import StepLR
import torchtext

os.chdir(os.path.dirname(os.path.abspath(os.path.dirname(os.path.abspath(os.path.dirname('__file__'))))))

from models.seq2seq import Seq2seq
from loss.loss import Perplexity
from optim.optim import Optimizer
from dataset import fields
from evaluator.predictor import Predictor
from util.helpers import apply_fix, vstack_with_right_padding, make_dir_if_not_exists
from util.helpers import InvalidFixLocationException, SubstitutionFailedException
from util.helpers import get_lines, extract_line_number, FailedToGetLineNumberException, _truncate_fix
from util.helpers import tokens_to_source, compilation_errors

In [2]:
# Layout of every log record: timestamp, logger name (left-aligned, width 12),
# level name (left-aligned, width 8), then the message itself.
LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
# Configure the root logger to emit INFO and above using that layout.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [3]:
# --- Experiment selection -------------------------------------------------
rnn, data_name, data_type = "lstm", "ids", "raw"
pretrained_dir_name = None  # None means no pretrained weights are loaded
select = "1"                # suffix of the checkpoint file to evaluate
batch_size = 128            # number of programs sent to the predictor at once
iteration = 5               # number of repair rounds per program

# --- Input locations (relative to the working directory) ------------------
test_path = "data/network_inputs/iitk-%s-1189" % data_name
train_path = test_path + "/data_train.txt"
config_path = "models/config.json"
inverse_vocab_path = "data/data_generator/target_vocab_reverse.json"

# Prepare dataset

In [4]:
# Longest source/target sequence (in tokens) that is kept in the dataset.
max_len = 450
src, tgt = fields.SourceField(), fields.TargetField()


def len_filter(example):
    """Return True when both sides of ``example`` fit within ``max_len`` tokens."""
    return not (len(example.src) > max_len or len(example.tgt) > max_len)


# The training file is tab-separated: first column source, second column target.
# It is loaded here so the vocabularies can be built from it.
train = torchtext.data.TabularDataset(
    path=train_path,
    format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter,
)
src.build_vocab(train)
tgt.build_vocab(train)
input_vocab, output_vocab = src.vocab, tgt.vocab

# Prepare loss

In [5]:
# Index of the padding token in the target vocabulary; handed to the loss
# (presumably so padded positions are ignored -- confirm in loss.loss).
pad = tgt.vocab.stoi[tgt.pad_token]
# Every target token gets the same weight of 1.
weight = torch.ones(len(tgt.vocab))
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
    loss.cuda()



In [6]:
optimizer = "Adam"
seq2seq = None

# Read the base model configuration; the context manager closes the file
# handle (the previous bare open().read() left it to the garbage collector).
with open(config_path) as config_file:
    config_json = config_file.read()
config = json.loads(config_json)

# Overrides for this experiment (see the parameter list at the top).
config["max_len"] = 450
config["hidden_size"] = 300
config["rnn_cell"] = rnn
config["n_layers"] = 4
config["dropout_p"] = 0.2
config["embedding_size"] = 50
config["use_attention"] = True
config["position_embedding"] = None
config["use_memory"] = None
#config["seed"]= 1189
#config["pos_add"] = "cat"

print(json.dumps(config, indent=4))

# Architecture tag shared by the save path and the pretrained-weights path,
# built once so the two cannot drift apart.
_arch_tag = (("_att" if config["use_attention"] else "")
             + ("_with_pos_" + config["position_embedding"] if config["position_embedding"] is not None else "")
             + ("_cat" if config["pos_add"] == "cat" else "")
             + ("_use_stack" if config["use_memory"] == "stack" else "")
             + ("_use_queue" if config["use_memory"] == "queue" else "")
             + "_emb" + str(config["embedding_size"])
             + "_hidden" + str(config["hidden_size"]))

save_path = data_name + _arch_tag + ("_pretrained" if pretrained_dir_name is not None else "")
print("Save_path : %s" % save_path)

if pretrained_dir_name is not None:
    # NOTE(review): this branch used to reference the undefined names `i`,
    # `vocab_size`, `sos_id` and `eos_id` and could only raise NameError.
    # `select` is used for the run suffix because the checkpoint path later in
    # this file uses the same "_<rnn>_<select>" pattern -- confirm this is the
    # intended pretrained run.
    pretrained_path = ("pretrained_weights/" + data_name + "_" + pretrained_dir_name
                       + _arch_tag + "_" + rnn + "_" + str(select))
    pretrained_pos_weight = np.load(pretrained_path + "/encoder_pos_weight.npy")
    seq2seq = Seq2seq(config, len(src.vocab), len(tgt.vocab), tgt.sos_id, tgt.eos_id,
                      pretrained_pos_weight)
else:
    seq2seq = Seq2seq(config, len(src.vocab), len(tgt.vocab), tgt.sos_id, tgt.eos_id)

{
    "max_len": 450,
    "embedding_size": 50,
    "hidden_size": 300,
    "input_dropout_p": 0,
    "dropout_p": 0.2,
    "n_layers": 4,
    "bidirectional": false,
    "rnn_cell": "lstm",
    "variable_lengths": true,
    "embedding": null,
    "update_embedding": true,
    "get_context_vector": false,
    "use_attention": true,
    "attn_layers": 1,
    "hard_attn": false,
    "position_embedding": null,
    "pos_add": "add",
    "use_memory": null,
    "memory_dim": 5
}
Save_path : ids_att_emb50_hidden300


In [7]:
# Results for this run are stored under log/test/<save_path>/<rnn>/<data_type>/.
fig_path = "log/test/" + save_path + "/" + rnn
database_path = fig_path + "/" + data_type
# makedirs creates every missing level (including log/test itself, which the
# previous chain of os.mkdir calls required to exist already) and, with
# exist_ok=True, is safe to rerun.
os.makedirs(database_path, exist_ok=True)
# SQLite file that collects programs, predicted fixes and compiler messages.
database = database_path + "/" + data_name + "_" + data_type + ".db"

# Initialize model

In [8]:
if torch.cuda.is_available():
    seq2seq.cuda()

# Uniform initialisation in [-0.08, 0.08]. NOTE(review): load_state_dict below
# is strict by default, so these values are presumably all overwritten by the
# checkpoint; kept so the cell behaves as before.
for param in seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

log_path = "log/pth/"+save_path +"_" + rnn + "_" + str(select) + "_model_save.pth"
# Without CUDA, remap the stored tensors to the CPU; otherwise a checkpoint
# saved on a GPU cannot be deserialised on a CPU-only host. On CUDA hosts the
# default placement (map_location=None) is unchanged.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
seq2seq.load_state_dict(
    torch.load(log_path, map_location=None if torch.cuda.is_available() else "cpu"))
# Switch to inference mode before predicting.
seq2seq.eval()

predictor = Predictor(seq2seq, input_vocab, output_vocab, output_vocab.stoi[train.fields['tgt'].pad_token])

# Get fix

In [9]:
def get_fix(program):
    """Return the model's predictions for a batch of tokenised programs.

    ``program`` is forwarded unchanged to ``predictor.predict_batch`` and the
    result is returned as is (callers in this file treat it as a list with one
    prediction per input program).
    """
    return predictor.predict_batch(program)

# Data Load

In [10]:
# The raw test split has one file; other data types have one file per network.
if data_type == 'raw':
    _test_file = 'test_%s.npy' % (data_type)
else:
    _test_file = 'test_%s-%s.npy' % (data_type, data_name)

# The .npy file wraps a single pickled Python object (unwrapped with .item()
# and used as a dict keyed by problem id). NumPy >= 1.16.3 refuses to load
# pickled objects unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
test_dataset = np.load(os.path.join(test_path, _test_file), allow_pickle=True).item()

print("test_{} data length : {}".format(data_type, sum(len(test_dataset[pid]) for pid in test_dataset)))

test_raw data length : 6978


# Create the table

In [11]:
# Open (or create) the SQLite results database for this run.
conn = sqlite3.connect(database)
c = conn.cursor()
# programs: one row per test program (tokenised code plus the JSON-encoded
# name dictionary and name sequence), keyed by prog_id.
c.execute('''CREATE TABLE IF NOT EXISTS programs (
                prog_id text NOT NULL,
                user_id text NOT NULL,
                prob_id text NOT NULL,
                code text NOT NULL,
                name_dict text NOT NULL,
                name_seq text NOT NULL,
                PRIMARY KEY(prog_id)
             )''')

# iterations: the fix predicted for a program at a given iteration by a given
# network; at most one row per (prog_id, iteration).
c.execute('''CREATE TABLE IF NOT EXISTS iterations (
                prog_id text NOT NULL,
                iteration text NOT NULL,
                network text NOT NULL,
                fix text NOT NULL,
                PRIMARY KEY(prog_id, iteration)
             )''')

# error_messages: one row per compiler error message of a program at a given
# iteration.
# NOTE(review): the foreign key names (prog_id, iteration, network), but the
# primary key of `iterations` is only (prog_id, iteration). SQLite would
# report a foreign key mismatch if foreign-key enforcement were switched on --
# confirm which key is intended.
c.execute('''CREATE TABLE IF NOT EXISTS error_messages (
                prog_id text NOT NULL,
                iteration text NOT NULL,
                network text NOT NULL,
                error_message text NOT NULL,
                FOREIGN KEY(prog_id, iteration, network) REFERENCES iterations(prog_id, iteration, network)
             )''')

<sqlite3.Cursor at 0x7f28fb754730>

# Attempt to repair

In [12]:
# Per-problem bookkeeping, filled in by the prediction loops below:
#   sequences_of_programs[problem_id][program_id]      -> list of program versions
#   fixes_suggested_by_network[problem_id][program_id] -> list of fixes
sequences_of_programs, fixes_suggested_by_network = {}, {}

# Only the 'typo' and 'ids' networks are supported; each has its own fix kind.
if data_name != 'typo':
    assert data_name == 'ids'
    normalize_names, fix_kind = False, 'insert'
else:
    normalize_names, fix_kind = True, 'replace'

# Remove line numbers

In [13]:
def remove_line_numbers(source):
    """Strip the line-number markers from a tokenised program.

    Each line of ``source`` is expected to start with a marker made of the
    line number written as space-separated digits followed by ``~`` (for
    example ``0 ~``, ``1 2 ~`` or ``1 0 0 ~``). Markers are removed in
    ascending order, one occurrence each, and the remaining text is split on
    whitespace.

    The previous implementation only used the first two digits of the line
    number, so markers of lines 100 and above were never removed.

    Args:
        source: Program as a single space-separated token string.

    Returns:
        List of the program's tokens without line-number markers.
    """
    # One '~' per line marker, so the count gives the number of lines to strip.
    for line_no in range(source.count('~')):
        marker = " ".join(str(line_no)) + " ~ "
        source = source.replace(marker, "", 1)
    # split() without arguments collapses any doubled spaces left behind.
    return source.split()

# Apply edits

In [14]:
# Reverse target vocabulary, looked up with stringified integer keys when
# predicted edits are applied.
with open(inverse_vocab_path) as json_file:
    inverse_vocab = json.loads(json_file.read())

In [15]:
def apply_edits(source, edits):
    """Apply a sequence of predicted edit codes to a token list.

    Edit codes are strings, read in order:

    * ``'0'``  -- copy the source token at the current position;
    * ``'-1'`` -- emit nothing for this position;
    * anything else -- emit ``inverse_vocab[str(int(code) + 1)]``.

    The current source position is the edit index minus an offset. The offset
    grows by one after an emitted vocabulary token whenever the edit stored at
    index ``offset`` is not one of the strings ``'1'`` .. ``'110'``.
    NOTE(review): that check reads ``edits[offset]`` and not the current edit;
    confirm this is intended.

    Processing stops once the source position runs past the end of ``source``.

    Args:
        source: List of program tokens.
        edits: List of edit codes as strings.

    Returns:
        The edited token list.
    """
    insertable = {str(n) for n in range(1, 111)}
    offset = 0
    result = []
    for index, code in enumerate(edits):
        position = index - offset
        if position >= len(source):
            break
        if code == '0':
            result.append(source[position])
            continue
        if code == '-1':
            continue
        result.append(inverse_vocab[str(int(code) + 1)])
        if edits[offset] not in insertable:
            offset += 1
    return result

# Predict

In [20]:
# Single-pass prediction: every test program gets one predicted fix, stored
# in the `iterations` table as iteration 1.
#
# Fixes relative to the earlier version of this cell:
#   * prediction ran inside the per-entry loop, so all inputs collected so far
#     were re-predicted for every entry (quadratic work);
#   * every stored fix was paired with the loop's current `program_id`, not
#     with the program it was predicted for.
# The unused apply_edits() call was dropped; its result was never stored.
for problem_id, test_programs in tqdm(test_dataset.items()):
    sequences_of_programs[problem_id] = {}
    fixes_suggested_by_network[problem_id] = {}

    entries = []

    for program, name_dict, name_sequence, user_id, program_id in test_programs:
        sequences_of_programs[problem_id][program_id] = [program]
        fixes_suggested_by_network[problem_id][program_id] = []
        entries.append(
            (program, name_dict, name_sequence, user_id, program_id,))

        c.execute("INSERT OR IGNORE INTO programs VALUES (?,?,?,?,?,?)", (program_id,
                  user_id, problem_id, program, json.dumps(name_dict), json.dumps(name_sequence)))

    # Model inputs, in the same order as `entries`.
    input_ = [remove_line_numbers(sequences_of_programs[problem_id][entry[4]][-1])
              for entry in entries]

    # Predict in chunks of at most `batch_size`; slicing past the end is safe.
    fixes = []
    for start in range(0, len(input_), batch_size):
        fixes += get_fix(input_[start:start + batch_size])

    # Store each fix under the program it belongs to.
    for entry, fix in zip(entries, fixes):
        program_id = entry[4]
        c.execute("INSERT OR IGNORE INTO iterations VALUES (?,?,?,?)",
                  (program_id, 1, data_name, fix))
    conn.commit()
conn.commit()
# NOTE(review): this closes the shared connection; any later cell that reuses
# `c` or `conn` must open a new connection first.
conn.close()

  0%|          | 0/93 [00:00<?, ?it/s]

_<directive>_#include _<include>_<stdio.h> _<type>_int _<id>_1@ _<op>_[ _<number>_# _<op>_] _<op>_= _<op>_{ _<number>_# _<op>_} _<op>_; _<type>_int _<id>_2@ _<op>_; _<keyword>_static _<type>_int _<id>_3@ _<op>_= _<number>_# _<op>_; _<type>_int _<id>_3@ _<keyword>_for _<id>_2@ _<id>_4@ _<op>_( _<type>_int _<id>_5@ _<op>_) _<op>_{ _<keyword>_for _<op>_( _<id>_6@ _<op>_= _<number>_# _<op>_; _<id>_6@ _<op>_< _<id>_2@ _<op>_; _<id>_6@ _<op>_++ _<op>_) _<op>_{ _<keyword>_if _<op>_( _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_> _<id>_3@ _<op>_&& _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_< _<op>_= _<id>_7@ _<op>_) _<id>_3@ _<op>_= _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_; _<op>_} _<APIcall>_printf _<op>_( _<string>_ _<op>_, _<id>_3@ _<op>_) _<op>_; _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_= _<number>_# _<op>_; _<op>_} _<type>_int _<APIcall>_main _<op>_( _<op>_) _<op>_{ _<type>_int _<id>_8@ _<op>_, _<id>_6@ _<op>_; _<APIcall>_scanf _<op>_( _<string>_ _<op>_, _<op>_& _<id>_2@ _<op>_, _<op>_& _<id>_8@ _

_<directive>_#include _<include>_<stdio.h> _<type>_int _<id>_1@ _<op>_[ _<number>_# _<op>_] _<op>_= _<op>_{ _<number>_# _<op>_} _<op>_; _<type>_int _<id>_2@ _<op>_; _<keyword>_static _<type>_int _<id>_3@ _<op>_= _<number>_# _<op>_; _<type>_int _<id>_3@ _<keyword>_for _<id>_2@ _<id>_4@ _<op>_( _<type>_int _<id>_5@ _<op>_) _<op>_{ _<keyword>_for _<op>_( _<id>_6@ _<op>_= _<number>_# _<op>_; _<id>_6@ _<op>_< _<id>_2@ _<op>_; _<id>_6@ _<op>_++ _<op>_) _<op>_{ _<keyword>_if _<op>_( _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_> _<id>_3@ _<op>_&& _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_< _<op>_= _<id>_7@ _<op>_) _<id>_3@ _<op>_= _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_; _<op>_} _<APIcall>_printf _<op>_( _<string>_ _<op>_, _<id>_3@ _<op>_) _<op>_; _<id>_1@ _<op>_[ _<id>_6@ _<op>_] _<op>_= _<number>_# _<op>_; _<op>_} _<type>_int _<APIcall>_main _<op>_( _<op>_) _<op>_{ _<type>_int _<id>_8@ _<op>_, _<id>_6@ _<op>_; _<APIcall>_scanf _<op>_( _<string>_ _<op>_, _<op>_& _<id>_2@ _<op>_, _<op>_& _<id>_8@ _




KeyboardInterrupt: 

In [21]:
# Iterative repair: for `iteration` rounds, predict a fix for the latest
# version of every program, apply it, and record the fix in `iterations`.
#
# Fix relative to the earlier version of this cell: the apply loop used the
# `program_id` left over from the input-collection loop, so every edited
# program and every database row was attributed to the last program of the
# problem. Each fix is now paired with its own entry. The `to_delete` list that
# was never filled has been removed.
for problem_id, test_programs in tqdm(test_dataset.items()):
    sequences_of_programs[problem_id] = {}
    fixes_suggested_by_network[problem_id] = {}

    entries = []

    for program, name_dict, name_sequence, user_id, program_id in test_programs:
        sequences_of_programs[problem_id][program_id] = [program]
        fixes_suggested_by_network[problem_id][program_id] = []
        entries.append(
            (program, name_dict, name_sequence, user_id, program_id,))

        c.execute("INSERT OR IGNORE INTO programs VALUES (?,?,?,?,?,?)", (program_id,
                  user_id, problem_id, program, json.dumps(name_dict), json.dumps(name_sequence)))

    for round_ in range(iteration):
        # Keep only programs whose latest version is usable (not None) and
        # build the model inputs in the same order.
        remaining, input_ = [], []
        for entry in entries:
            latest = sequences_of_programs[problem_id][entry[4]][-1]
            if latest is not None:
                remaining.append(entry)
                input_.append(remove_line_numbers(latest))
        entries = remaining

        if not input_:
            # No programs remain for this problem.
            break

        # Predict in chunks of at most `batch_size`; slicing past the end is safe.
        fixes = []
        for start in range(0, len(input_), batch_size):
            fixes += get_fix(input_[start:start + batch_size])

        # Apply each fix to the program it was predicted for.
        for entry, source, fix in zip(entries, input_, fixes):
            program_id = entry[4]
            program = apply_edits(source, fix.split())
            sequences_of_programs[problem_id][program_id].append(" ".join(program))

            c.execute("INSERT OR IGNORE INTO iterations VALUES (?,?,?,?)",
                      (program_id, round_, data_name, fix))

    conn.commit()
conn.commit()
conn.close()

100%|██████████| 93/93 [04:10<00:00,  2.82s/it]


In [None]:
'''
for problem_id, test_programs in test_dataset.items():
    sequences_of_programs[problem_id] = {}
    fixes_suggested_by_network[problem_id] = {}

    entries = []
    
    for program, name_dict, name_sequence, user_id, program_id in test_programs:
        sequences_of_programs[problem_id][program_id] = [program]
        fixes_suggested_by_network[problem_id][program_id] = []
        entries.append(
            (program, name_dict, name_sequence, user_id, program_id,))

        c.execute("INSERT OR IGNORE INTO programs VALUES (?,?,?,?,?,?)", (program_id,
                  user_id, problem_id, program, json.dumps(name_dict), json.dumps(name_sequence)))
            
    for round_ in range(iteration):
        print(round_)
        to_delete = []
        input_ = []
        
        for i, entry in enumerate(entries):
            _, _, _, _, program_id = entry

            if sequences_of_programs[problem_id][program_id][-1] is not None:
                tmp = sequences_of_programs[problem_id][program_id][-1]
                input_.append(remove_line_numbers(tmp))
            else:
                to_delete.append(i)

        to_delete = sorted(to_delete)[::-1]

        for i in to_delete:
            del entries[i]

        assert len(input_) == len(entries)

        if len(input_) == 0:
            print('Stopping before iteration %d (no programs remain)' % (round_ + 1))
            break

        cnt = 0
        fixes = []
        for i in range(math.ceil(len(input_)/batch_size)):
            if cnt+batch_size > len(input_):
                fix = get_fix(input_[cnt:len(input_)])
            else:
                fix = get_fix(input_[cnt:cnt+batch_size])
            cnt += batch_size
            fixes += fix
            
        to_delete = []

        #print(len(fixes))
        #print(fixes)
    #break

        # Apply fixes
        for i, entry, fix in zip(range(len(fixes)), entries, fixes):
            _, _, _, _, program_id = entry

            try:
                program = apply_fix(sequences_of_programs[problem_id][program_id][-1], fix, fix_kind,
                                    flag_replace_ids=False)
                sequences_of_programs[problem_id][program_id].append(program)
            except ValueError as e:
                to_delete.append(i)
                sequences_of_programs[problem_id][program_id].append(
                    '{{localization_failed}}')
            except InvalidFixLocationException as e:
                to_delete.append(i)
                sequences_of_programs[problem_id][program_id].append(
                    '{{localization_failed}}')
            except SubstitutionFailedException as e:
                to_delete.append(i)
                sequences_of_programs[problem_id][program_id].append(
                    '{{back_substitution_failed}}')
            except Exception as e:
                raise e
            else:
                c.execute("INSERT OR IGNORE INTO iterations VALUES (?,?,?,?)",
                      (program_id, round_ + 1, data_name, fix))
                
        to_delete = sorted(to_delete)[::-1]
        
        for i in to_delete:
            del entries[i]

    conn.commit()

conn.commit()
conn.close()
'''

# Stop signal

In [None]:
def _is_stop_signal(fix):
    """Return True when ``fix`` is empty after ``_truncate_fix``, else False.

    The previous version fell off the end and returned ``None`` in the
    negative case; an explicit boolean keeps truthiness checks working and
    makes the return type consistent.
    """
    return _truncate_fix(fix) == ''

# Meets criteria

In [None]:
def meets_criterion(incorrect_program_tokens, fix, type_, silent=True):
    """Decide whether a predicted fix is acceptable for the given program.

    A fix is rejected when it is a stop signal, when no line number can be
    extracted from it, when that line number is past the end of the program,
    when the number of ``_<id>_`` tokens differs between the targeted line and
    the fix, or -- for ``type_ == 'replace'`` only -- when the sequence of
    keyword/type/API-call/include tokens differs.

    Args:
        incorrect_program_tokens: Tokenised program the fix targets.
        fix: Predicted fix string.
        type_: Fix kind; only ``'replace'`` triggers the keyword comparison.
        silent: When False, print the reason for a rejection by the id or
            keyword checks.

    Returns:
        True if the fix passes every check, False otherwise.
    """
    lines = get_lines(incorrect_program_tokens)
    fix = _truncate_fix(fix)

    if _is_stop_signal(fix):
        return False

    try:
        fix_line_number = extract_line_number(fix)
    except FailedToGetLineNumberException:
        return False

    if fix_line_number >= len(lines):
        return False

    fix_line = lines[fix_line_number]

    # Raw strings: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions. The pattern
    # text itself is unchanged.
    id_regex = r'_<id>_\w*'

    # Make sure number of IDs is the same
    if len(re.findall(id_regex, fix_line)) != len(re.findall(id_regex, fix)):
        if not silent:
            print('number of ids is not the same')
        return False

    keywords_regex = r'_<keyword>_\w+|_<type>_\w+|_<APIcall>_\w+|_<include>_\w+'

    if type_ == 'replace' and re.findall(keywords_regex, fix_line) != re.findall(keywords_regex, fix):
        if not silent:
            print('important words (keywords, etc.) change drastically')
        return False

    return True

# Get final results

In [None]:
def get_final_results(database):
    """Print summary repair statistics read from the results database.

    Uses the ``programs``, ``error_messages`` and ``error_message_strings``
    tables. Iteration 0 is the state before any fix and iteration 10 is
    treated as the final state (hard-coded). Programs whose iteration-0 error
    count is zero are excluded from all statistics.

    The connection is now closed explicitly: using ``sqlite3.connect`` as a
    context manager only commits or rolls back, it does not close. The unused
    per-iteration error-count query has been removed.

    Args:
        database: Path of the SQLite database file.

    Returns:
        None. The statistics are printed to standard output.
    """
    # Programs that had no compiler errors before any fix was applied.
    initially_clean = (
        "SELECT p.prog_id FROM programs p "
        "INNER JOIN error_message_strings e ON p.prog_id = e.prog_id "
        "WHERE e.iteration = 0 AND e.error_message_count = 0"
    )
    not_clean = "prog_id NOT IN (%s)" % initially_clean

    # Programs with at least one error at the given iteration, with the count.
    per_program = (
        "SELECT DISTINCT prog_id, error_message_count FROM error_message_strings "
        "WHERE iteration = %d AND error_message_count > 0 AND " + not_clean
    )

    conn = sqlite3.connect(database)
    try:
        c = conn.cursor()

        def scalar(query):
            # Only used with aggregate queries, which always yield one row.
            return c.execute(query).fetchone()[0]

        initial_errors = scalar(
            "SELECT COUNT(*) FROM error_messages WHERE iteration = 0 AND " + not_clean)
        final_errors = scalar(
            "SELECT COUNT(*) FROM error_messages WHERE iteration = 10 AND " + not_clean)
        fully_fixed = scalar(
            "SELECT COUNT(DISTINCT prog_id) FROM error_message_strings "
            "WHERE iteration = 10 AND error_message_count = 0 AND " + not_clean)

        original_errors = {row[0]: int(row[1]) for row in c.execute(per_program % 0)}

        partially_fixed = {}
        unfixed = {}
        for row in c.execute(per_program % 10):
            remaining = int(row[1])
            if remaining < original_errors[row[0]]:
                partially_fixed[row[0]] = remaining
            elif remaining == original_errors[row[0]]:
                unfixed[row[0]] = remaining
            else:
                # More errors than at the start: report the program.
                print(row[0], row[1], original_errors[row[0]])

        assignments = int(scalar(
            "SELECT COUNT(DISTINCT prob_id) FROM programs WHERE " + not_clean))

        token_counts = [
            len(row[0].split())
            for row in c.execute(
                "SELECT code FROM programs p INNER JOIN error_message_strings e "
                "ON p.prog_id = e.prog_id "
                "WHERE e.iteration = 0 AND e.error_message_count <> 0;")
        ]
    finally:
        conn.close()

    avg_token_count = np.mean(token_counts)

    print("-------")
    print("Assignments:", assignments)
    print("Program count:", len(token_counts))
    print("Average token count:", avg_token_count)
    print("Error messages:", initial_errors)
    print("-------")

    print("Errors remaining:", final_errors)
    print("Reduction in errors:", (initial_errors - final_errors))
    print("Completely fixed programs:", fully_fixed)
    print("partially fixed programs:", len(partially_fixed))
    print("unfixed programs:", len(unfixed))
    print("-------")

In [None]:
def do_problem(problem_id):
    """Replay the stored fixes for every program of one problem and record compiler errors.

    For each program of ``problem_id`` in the ``programs`` table this:

    1. compiles the initial program and stores its errors as stage 0;
    2. applies the stored 'typo' fixes in order, stopping at the first fix that
       fails ``meets_criterion`` or increases the number of errors, then pads
       the history to 6 entries;
    3. applies the stored 'ids' fixes in order (each distinct fix once),
       stopping at the first fix that increases the number of errors, then
       pads the history to 11 entries;
    4. writes one ``error_message_strings`` row per stage and one
       ``error_messages`` row per individual error.

    Results are also kept in the module-level dicts ``reconstruction``,
    ``errors``, ``errors_full`` and ``errors_test``; ``total_count`` is
    increased by the number of programs handled. Uses the module-level
    connection ``conn`` and commits before returning.

    Args:
        problem_id: Value of ``prob_id`` selecting the programs to process.
    """
    global reconstruction, errors, errors_full, total_count, errors_test

    c = conn.cursor()

    reconstruction[problem_id] = {}
    errors[problem_id] = {}
    errors_full[problem_id] = {}
    errors_test[problem_id] = []
    candidate_programs = []

    # Collect all programs first so the cursor can be reused for other queries below.
    for row in c.execute('SELECT user_id, prog_id, code, name_dict, name_seq FROM programs WHERE prob_id = ?', (problem_id,)):
        user_id, prog_id, initial = row[0], row[1], row[2]
        name_dict = json.loads(row[3])
        name_seq = json.loads(row[4])

        candidate_programs.append(
            (user_id, prog_id, initial, name_dict, name_seq,))

    for _, prog_id, initial, name_dict, name_seq in candidate_programs:
        fixes_suggested_by_typo_network = []
        fixes_suggested_by_undeclared_network = []

        # NOTE(review): `iteration` is a text column, so ORDER BY sorts it as
        # text ('10' before '2') -- fine for single-digit iterations only.
        for row in c.execute('SELECT fix FROM iterations WHERE prog_id=? AND network = \'typo\' ORDER BY iteration', (prog_id,)):
            fixes_suggested_by_typo_network.append(row[0])

        for row in c.execute('SELECT fix FROM iterations WHERE prog_id=? AND network = \'ids\' ORDER BY iteration', (prog_id,)):
            fixes_suggested_by_undeclared_network.append(row[0])

        # Stage 0: the unmodified program and its compiler errors.
        reconstruction[problem_id][prog_id] = [initial]
        temp_errors, temp_errors_full = compilation_errors(
            tokens_to_source(initial, name_dict, False), database_path)
        errors[problem_id][prog_id] = [temp_errors]
        errors_full[problem_id][prog_id] = [temp_errors_full]

        # Typo-network stage: apply 'replace' fixes while they pass the
        # criterion and do not increase the number of errors.
        try:
            for fix in fixes_suggested_by_typo_network:
                if meets_criterion(reconstruction[problem_id][prog_id][-1], fix, 'replace'):
                    temp_prog = apply_fix(
                        reconstruction[problem_id][prog_id][-1], fix, 'replace')
                    temp_errors, temp_errors_full = compilation_errors(
                        tokens_to_source(temp_prog, name_dict, False), database_path)

                    if len(temp_errors) > len(errors[problem_id][prog_id][-1]):
                        break
                    else:
                        reconstruction[problem_id][prog_id].append(temp_prog)
                        errors[problem_id][prog_id].append(temp_errors)
                        errors_full[problem_id][prog_id].append(
                            temp_errors_full)
                else:
                    break

        except InvalidFixLocationException:
            print('Localization failed')

        # Pad with copies of the last state until the history has 6 entries
        # (stage 0 plus five typo stages).
        while len(reconstruction[problem_id][prog_id]) <= 5:
            reconstruction[problem_id][prog_id].append(
                reconstruction[problem_id][prog_id][-1])
            errors[problem_id][prog_id].append(errors[problem_id][prog_id][-1])
            errors_full[problem_id][prog_id].append(
                errors_full[problem_id][prog_id][-1])

        # Fixes already applied in the ids stage; a repeated fix is skipped.
        already_fixed = []

        # Ids-network stage: apply each distinct 'insert' fix; stop at the
        # first one that increases the number of errors.
        try:
            for fix in fixes_suggested_by_undeclared_network:
                if fix not in already_fixed:
                    temp_prog = apply_fix(
                        reconstruction[problem_id][prog_id][-1], fix, 'insert')
                    already_fixed.append(fix)
                    temp_errors, temp_errors_full = compilation_errors(
                        tokens_to_source(temp_prog, name_dict, False), database_path)

                    if len(temp_errors) > len(errors[problem_id][prog_id][-1]):
                        break
                    else:
                        reconstruction[problem_id][prog_id].append(temp_prog)
                        errors[problem_id][prog_id].append(temp_errors)
                        errors_full[problem_id][prog_id].append(
                            temp_errors_full)
                else:
                    pass

        except InvalidFixLocationException:
            print('Localization failed')

        # Pad with copies of the last state until the history has 11 entries
        # (indices 0..10).
        while len(reconstruction[problem_id][prog_id]) <= 10:
            reconstruction[problem_id][prog_id].append(
                reconstruction[problem_id][prog_id][-1])
            errors[problem_id][prog_id].append(errors[problem_id][prog_id][-1])
            errors_full[problem_id][prog_id].append(
                errors_full[problem_id][prog_id][-1])

        errors_test[problem_id].append(errors[problem_id][prog_id])

        # Persist one summary row per stage and one row per individual error.
        # NOTE(review): every row is tagged 'typo', including the ids stages --
        # confirm this is intended. The .decode() calls assume the error
        # values are bytes.
        for k, errors_t, errors_full_t in zip(range(len(errors[problem_id][prog_id])), errors[problem_id][prog_id], errors_full[problem_id][prog_id]):
            c.execute("INSERT INTO error_message_strings VALUES(?, ?, ?, ?, ?)", (
                prog_id, k, 'typo', errors_full_t.decode('utf-8', 'ignore'), len(errors_t)))

            for error_ in errors_t:
                c.execute("INSERT INTO error_messages VALUES(?, ?, ?, ?)",
                            (prog_id, k, 'typo', error_.decode('utf-8', 'ignore'),))

    count_t = len(candidate_programs)
    total_count += count_t
    conn.commit()


    c.close()

In [None]:
def subset(arr1, arr2):
    """Return True when every element of ``arr1`` also occurs in ``arr2``.

    An empty ``arr1`` is a subset of anything.
    """
    return all(item in arr2 for item in arr1)

In [None]:
# Reopen the results database for the evaluation phase.
conn = sqlite3.connect(database)
c = conn.cursor()

# error_message_strings: per program and stage, the full compiler output and
# the number of individual error messages.
c.execute('''CREATE TABLE IF NOT EXISTS error_message_strings (
                prog_id text NOT NULL,
                iteration text NOT NULL,
                network text NOT NULL,
                error_message_string text NOT NULL,
                error_message_count integer NOT NULL,
                FOREIGN KEY(prog_id, iteration, network) REFERENCES iterations(prog_id, iteration, network)
             )''')

# Every problem that has at least one stored program.
problem_ids = [record[0] for record in c.execute('SELECT DISTINCT prob_id FROM programs')]

c.close()

# Accumulators filled by do_problem(), keyed by problem id.
reconstruction, errors, errors_full, errors_test = {}, {}, {}, {}

# Number of errors removed at each of the ten repair stages.
fixes_per_stage = [0] * 10

# Running total of processed programs.
total_count = 0

In [None]:
# Evaluate every problem and report the wall-clock time.
start = time.time()

for problem_id in tqdm(problem_ids):
    do_problem(problem_id)

time_t = time.time() - start

conn.commit()
conn.close()

print('Total time:', time_t, 'seconds')
print('Total programs processed:', total_count)
# With an empty database the average is undefined; the previous unconditional
# division raised ZeroDivisionError here.
if total_count:
    print('Average time per program:', int(float(time_t) / float(total_count) * 1000), 'ms')
else:
    print('Average time per program: n/a (no programs processed)')

In [None]:
# Number of error messages removed per problem, summed over all stages.
total_fixes_num = {}
errors_before = {}

for problem_id in errors_test:
    total_fixes_num[problem_id] = 0

    # Each `seq` is one program's history: a list of error lists, one per stage.
    for seq in errors_test[problem_id]:
        error_numbers = [len(x) for x in seq]

        for stage, (before, after) in enumerate(zip(error_numbers, error_numbers[1:])):
            # do_problem() never keeps a fix that increases the error count.
            assert (not after > before)
            total_fixes_num[problem_id] += before - after
            fixes_per_stage[stage] += before - after

total_numerator = 0
total_denominator = 0

for problem_id in errors_test:
    total_numerator += total_fixes_num[problem_id]
    # x[0] is the error list of the unmodified program.
    total_denominator += sum(len(x[0]) for x in errors_test[problem_id])

# Percentage of the initial error messages that were removed. Without any
# initial errors the ratio is undefined; the previous unconditional division
# raised ZeroDivisionError in that case.
if total_denominator:
    print(int(float(total_numerator) * 100.0 / float(total_denominator)), '%')
else:
    print('n/a (no initial errors)')


for stage, fixed in enumerate(fixes_per_stage):
    print('Stage', stage, ':', fixed)

get_final_results(database)