In [27]:
%reload_ext autoreload
%autoreload 2

In [28]:
import bleu
import weighted_ngram_match
import syntax_match
import dataflow_match

In [29]:
def compute_metrics(targets, suggestions, lang='java',
                    keywords_path='data/java-keywords.txt'):
    """Compute syntax-match and dataflow-match scores for code suggestions.

    The plain and keyword-weighted n-gram (BLEU) scores are also computed,
    but they are currently not part of the returned dict (their entries are
    commented out in the return statement below).

    Parameters
    ----------
    targets : list of str
        One code string per example. Each string is whitespace-split for the
        n-gram metrics and passed unchanged to the syntax/dataflow metrics.
    suggestions : list of list of str
        One entry per target; each entry is itself a list of code strings.
    lang : str, optional
        Language name forwarded to ``syntax_match.corpus_syntax_match`` and
        ``dataflow_match.corpus_dataflow_match``. Defaults to ``'java'``.
    keywords_path : str, optional
        Path of a UTF-8 text file with one keyword per line, used to weight
        tokens in the weighted n-gram match. Defaults to
        ``'data/java-keywords.txt'``; pass a matching file when changing
        ``lang``.

    Returns
    -------
    dict
        ``{'syntax_match': ..., 'dataflow_match': ...}``.

    Raises
    ------
    OSError
        If ``keywords_path`` cannot be opened.
    """
    # calculate ngram match (BLEU)
    tokenized_targets = [target.split() for target in targets]
    tokenized_suggestions = [[candidate.split() for candidate in group]
                             for group in suggestions]

    # NOTE: computed but not returned at the moment (see the return dict).
    bleu_ngram_match_score = bleu.corpus_bleu(tokenized_suggestions, tokenized_targets)

    # calculate weighted ngram match
    # A context manager guarantees the file handle is closed; a set makes the
    # per-token keyword membership test O(1) instead of a list scan.
    with open(keywords_path, 'r', encoding='utf-8') as keyword_file:
        keywords = {line.strip() for line in keyword_file}

    def make_weights(reference_tokens, key_word_list):
        """Map each token to weight 1 if it is a keyword, otherwise 0.2."""
        return {token: 1 if token in key_word_list else 0.2
                for token in reference_tokens}

    tokenized_suggs_with_weights = [
        [[reference_tokens, make_weights(reference_tokens, keywords)]
         for reference_tokens in group]
        for group in tokenized_suggestions
    ]

    # NOTE: computed but not returned at the moment (see the return dict).
    weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(
        tokenized_suggs_with_weights, tokenized_targets)

    # calculate syntax match
    syntax_match_score = syntax_match.corpus_syntax_match(
        suggestions, targets, lang)

    # calculate dataflow match
    dataflow_match_score = dataflow_match.corpus_dataflow_match(
        suggestions, targets, lang)

    # return all scores in a dict
    return {
        # 'bleu_ngram_match': bleu_ngram_match_score,
        # 'weighted_ngram_match': weighted_ngram_match_score,
        'syntax_match': syntax_match_score,
        'dataflow_match': dataflow_match_score
    }

In [30]:
"""
public POIFSFileSystem(){
    this.phase = true;
    _header.BATCount = 1;
    _header.BATArray = (new int[]){1 };
    BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);
    bb.WorldBlockIndex=1;
    _bold_blocks.Add(bb);
    SetNextBlock(0, POIFSConstants.END_OF_CHAIN);
    SetNextBlock(1, POIFSConstants.FAT_SECTOR_BLOCK);
    _property_table.SetStartBlock(0);
}
"""

'\npublic POIFSFileSystem(){\n    this.phase = true;\n    _header.BATCount = 1;\n    _header.BATArray = (new int[]){1 };\n    BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);\n    bb.WorldBlockIndex=1;\n    _bold_blocks.Add(bb);\n    SetNextBlock(0, POIFSConstants.END_OF_CHAIN);\n    SetNextBlock(1, POIFSConstants.FAT_SECTOR_BLOCK);\n    _property_table.SetStartBlock(0);\n}\n'

# Correct suggestion

In [31]:
# Baseline case: the single suggestion is the same statement as the target.
targets = [
    "BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);",
]
suggestions = [
    ["BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);"],
]

compute_metrics(targets, suggestions)

<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 63), end_point=(0, 64)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point=(0, 22)>
<Node type=identifier, start_point=(0, 23), end_point=(0, 42)>
<Node type="(", start_point=(0, 42), end_point=(0, 43)>
<Node type=false, start_point=(0, 43), end_point=(0, 48)>
<Node type=identifier, start_point=(0, 50), end_point=(0, 62)>
<Node type=")", start_point=(0, 62), end_point=(0, 63)>
<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 63), end_point=(0, 64)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point=(0, 22)>
<Node type=identifier, start_point=(0, 23), end_point=(0, 42)>
<Node type="(", start_point=(0, 42), end_point=(0, 43)>
<Node type=false, start_point=(0, 43), end_point=(0, 48)>
<Node type=identifier, start_poin

{'syntax_match': 1.0, 'dataflow_match': 1.0}

# Suggestion swaps the two call arguments

In [32]:
# The suggestion passes the same two arguments as the target, in reverse order.
targets = [
    "BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);",
]
suggestions = [
    ["BATBlock bb = BATBlock.CreateEmptyBATBlock(bigBlockSize, false);"],
]

compute_metrics(targets, suggestions)

<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 63), end_point=(0, 64)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point=(0, 22)>
<Node type=identifier, start_point=(0, 23), end_point=(0, 42)>
<Node type="(", start_point=(0, 42), end_point=(0, 43)>
<Node type=false, start_point=(0, 43), end_point=(0, 48)>
<Node type=identifier, start_point=(0, 50), end_point=(0, 62)>
<Node type=")", start_point=(0, 62), end_point=(0, 63)>
<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 63), end_point=(0, 64)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point=(0, 22)>
<Node type=identifier, start_point=(0, 23), end_point=(0, 42)>
<Node type="(", start_point=(0, 42), end_point=(0, 43)>
<Node type=identifier, start_point=(0, 43), end_point=(0, 55)>
<Node type=false, start_poin

{'syntax_match': 0.375, 'dataflow_match': 1.0}

# Suggestion swaps the order of the two lines

In [33]:
# The suggestion contains the same two statements as the target, in the
# opposite order. Each snippet starts and ends with a newline.
targets = [
    "\n"
    "BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);\n"
    "_header.BATArray = (new int[]){1 };\n"
]
suggestions = [[
    "\n"
    "_header.BATArray = (new int[]){1 };\n"
    "BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);\n"
]]

compute_metrics(targets, suggestions)

<Node type=";", start_point=(1, 34), end_point=(1, 35)>
<Node type="(", start_point=(1, 19), end_point=(1, 20)>
<Node type=")", start_point=(1, 34), end_point=(1, 34)>
<Node type="new", start_point=(1, 20), end_point=(1, 23)>
<Node type="{", start_point=(1, 30), end_point=(1, 31)>
<Node type=decimal_integer_literal, start_point=(1, 31), end_point=(1, 32)>
<Node type="}", start_point=(1, 33), end_point=(1, 34)>
<Node type=")", start_point=(1, 29), end_point=(1, 30)>
<Node type="[", start_point=(1, 27), end_point=(1, 28)>
<Node type="]", start_point=(1, 28), end_point=(1, 29)>
<Node type="int", start_point=(1, 24), end_point=(1, 27)>
<Node type=identifier, start_point=(1, 0), end_point=(1, 7)>
<Node type=identifier, start_point=(1, 8), end_point=(1, 16)>
<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 63), end_point=(0, 64)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point

{'syntax_match': 0.9473684210526315, 'dataflow_match': 0.5}

# Wrong binary operator

In [34]:
# The suggestion differs from the target only in the binary operator (* vs +).
# Each snippet starts and ends with a newline.
targets = ["\nBATBlock bb = cc + DD;\n"]
suggestions = [["\nBATBlock bb = cc * DD;\n"]]

compute_metrics(targets, suggestions)

<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 21), end_point=(0, 22)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point=(0, 16)>
<Node type=identifier, start_point=(0, 19), end_point=(0, 21)>
<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 21), end_point=(0, 22)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point=(0, 16)>
<Node type=identifier, start_point=(0, 19), end_point=(0, 21)>


{'syntax_match': 0.8333333333333334, 'dataflow_match': 1.0}

# Wrong type

In [35]:
# The suggestion declares `bb` as float instead of BATBlock; the rest of the
# statement is the same. Each snippet starts and ends with a newline.
targets = [
    "\nBATBlock bb = BATBlock.CreateEmptyBATBlock(bigBlockSize, false);\n",
]
suggestions = [
    ["\nfloat bb = BATBlock.CreateEmptyBATBlock(bigBlockSize, false);\n"],
]

compute_metrics(targets, suggestions)

<Node type=type_identifier, start_point=(0, 0), end_point=(0, 8)>
<Node type=";", start_point=(0, 63), end_point=(0, 64)>
<Node type=identifier, start_point=(0, 9), end_point=(0, 11)>
<Node type=identifier, start_point=(0, 14), end_point=(0, 22)>
<Node type=identifier, start_point=(0, 23), end_point=(0, 42)>
<Node type="(", start_point=(0, 42), end_point=(0, 43)>
<Node type=identifier, start_point=(0, 43), end_point=(0, 55)>
<Node type=false, start_point=(0, 57), end_point=(0, 62)>
<Node type=")", start_point=(0, 62), end_point=(0, 63)>
<Node type=";", start_point=(0, 60), end_point=(0, 61)>
<Node type=identifier, start_point=(0, 6), end_point=(0, 8)>
<Node type=identifier, start_point=(0, 11), end_point=(0, 19)>
<Node type=identifier, start_point=(0, 20), end_point=(0, 39)>
<Node type="(", start_point=(0, 39), end_point=(0, 40)>
<Node type=identifier, start_point=(0, 40), end_point=(0, 52)>
<Node type=false, start_point=(0, 54), end_point=(0, 59)>
<Node type=")", start_point=(0, 59), 

{'syntax_match': 0.6666666666666666, 'dataflow_match': 1.0}

# Different variable name but same origin

In [36]:
# The suggestion uses the name `ii` where the target uses `cc`; everything
# else is the same. Each snippet starts and ends with a newline.
targets = [
    "\n"
    "float cc = 5;\n"
    "float bb = cc - DD;\n"
]
suggestions = [[
    "\n"
    "float ii = 5;\n"
    "float bb = ii - DD;\n"
]]

compute_metrics(targets, suggestions)

<Node type=";", start_point=(1, 18), end_point=(1, 19)>
<Node type=identifier, start_point=(1, 6), end_point=(1, 8)>
<Node type=identifier, start_point=(1, 11), end_point=(1, 13)>
<Node type=identifier, start_point=(1, 16), end_point=(1, 18)>
<Node type="float", start_point=(1, 0), end_point=(1, 5)>
<Node type=";", start_point=(0, 12), end_point=(0, 13)>
<Node type=identifier, start_point=(0, 6), end_point=(0, 8)>
<Node type=decimal_integer_literal, start_point=(0, 11), end_point=(0, 12)>
<Node type="float", start_point=(0, 0), end_point=(0, 5)>
<Node type=";", start_point=(1, 18), end_point=(1, 19)>
<Node type=identifier, start_point=(1, 6), end_point=(1, 8)>
<Node type=identifier, start_point=(1, 11), end_point=(1, 13)>
<Node type=identifier, start_point=(1, 16), end_point=(1, 18)>
<Node type="float", start_point=(1, 0), end_point=(1, 5)>
<Node type=";", start_point=(0, 12), end_point=(0, 13)>
<Node type=identifier, start_point=(0, 6), end_point=(0, 8)>
<Node type=decimal_integer_lit

{'syntax_match': 1.0, 'dataflow_match': 1.0}