In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import bleu
import weighted_ngram_match
import syntax_match
import dataflow_match

In [3]:
def compute_metrics(targets, suggestions, lang='java', keywords_path=None):
    """Score code suggestions against targets with four complementary metrics.

    The four scores are an n-gram match (BLEU), a keyword-weighted n-gram
    match, a syntax match and a dataflow match, each delegated to the
    corresponding imported module.

    NOTE(review): every scorer receives ``suggestions`` as its first argument
    and ``targets`` as its second; confirm this matches the argument order
    the scorer modules expect.

    Args:
        targets: list of code strings, one per example.
        suggestions: list (parallel to ``targets``) of lists of code strings,
            i.e. one or more suggestions per target.
        lang: language name forwarded to the syntax and dataflow scorers.
            Defaults to ``'java'``.
        keywords_path: path of a UTF-8 text file with one keyword per line.
            Defaults to ``data/<lang>-keywords.txt``.

    Returns:
        dict with the keys ``'bleu_ngram_match'``, ``'weighted_ngram_match'``,
        ``'syntax_match'`` and ``'dataflow_match'``.

    Raises:
        OSError: if the keywords file cannot be opened.
    """
    if keywords_path is None:
        keywords_path = f'data/{lang}-keywords.txt'

    # calculate ngram match (BLEU) on whitespace-separated tokens
    tokenized_targets = [target.split() for target in targets]
    tokenized_suggestions = [[candidate.split() for candidate in group]
                             for group in suggestions]

    bleu_ngram_match_score = bleu.corpus_bleu(tokenized_suggestions, tokenized_targets)

    # calculate weighted ngram match
    # The context manager closes the file deterministically; a set makes the
    # per-token keyword lookup O(1) without changing which tokens match.
    with open(keywords_path, 'r', encoding='utf-8') as keywords_file:
        keywords = {line.strip() for line in keywords_file}

    def make_weights(reference_tokens, key_word_set):
        # Keywords get full weight, every other token a reduced weight of 0.2.
        return {token: 1 if token in key_word_set else 0.2
                for token in reference_tokens}

    tokenized_suggs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)]
                                     for reference_tokens in group]
                                    for group in tokenized_suggestions]

    weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(
        tokenized_suggs_with_weights, tokenized_targets)

    # calculate syntax match (operates on the raw, untokenized strings)
    syntax_match_score = syntax_match.corpus_syntax_match(
        suggestions, targets, lang)

    # calculate dataflow match (operates on the raw, untokenized strings)
    dataflow_match_score = dataflow_match.corpus_dataflow_match(
        suggestions, targets, lang)

    # return all scores in a dict
    return {
        'bleu_ngram_match': bleu_ngram_match_score,
        'weighted_ngram_match': weighted_ngram_match_score,
        'syntax_match': syntax_match_score,
        'dataflow_match': dataflow_match_score
    }

In [4]:
"""
public POIFSFileSystem(){
    this.phase = true;
    _header.BATCount = 1;
    _header.BATArray = (new int[]){1 };
    BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);
    bb.WorldBlockIndex=1;
    _bold_blocks.Add(bb);
    SetNextBlock(0, POIFSConstants.END_OF_CHAIN);
    SetNextBlock(1, POIFSConstants.FAT_SECTOR_BLOCK);
    _property_table.SetStartBlock(0);
}
"""

'\npublic POIFSFileSystem(){\n    this.phase = true;\n    _header.BATCount = 1;\n    _header.BATArray = (new int[]){1 };\n    BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);\n    bb.WorldBlockIndex=1;\n    _bold_blocks.Add(bb);\n    SetNextBlock(0, POIFSConstants.END_OF_CHAIN);\n    SetNextBlock(1, POIFSConstants.FAT_SECTOR_BLOCK);\n    _property_table.SetStartBlock(0);\n}\n'

# Correct suggestion

In [57]:
# Baseline case: the suggestion is character-for-character the target.
target_code = """BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);"""
suggested_code = """BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);"""

targets = [target_code]
suggestions = [[suggested_code]]

compute_metrics(targets, suggestions)

{'bleu_ngram_match': 1.0,
 'weighted_ngram_match': 1.0,
 'syntax_match': 1.0,
 'dataflow_match': 1.0}

# Suggestion with swapped call arguments

In [58]:
# The suggestion passes the same two call arguments in the opposite order.
target_code = """BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);"""
suggested_code = """BATBlock bb = BATBlock.CreateEmptyBATBlock(bigBlockSize, false);"""

targets = [target_code]
suggestions = [[suggested_code]]

compute_metrics(targets, suggestions)

{'bleu_ngram_match': 0.26591479484724945,
 'weighted_ngram_match': 0.26591479484724945,
 'syntax_match': 0.375,
 'dataflow_match': 1.0}

# Suggestion with swapped lines

In [59]:
# The suggestion contains the same two statements in reverse order.
target_code = """
BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);
_header.BATArray = (new int[]){1 };
"""
suggested_code = """
_header.BATArray = (new int[]){1 };
BATBlock bb = BATBlock.CreateEmptyBATBlock(false, bigBlockSize);
"""

targets = [target_code]
suggestions = [[suggested_code]]

compute_metrics(targets, suggestions)

{'bleu_ngram_match': 0.785629301801026,
 'weighted_ngram_match': 0.785629301801026,
 'syntax_match': 0.9473684210526315,
 'dataflow_match': 0.5}

# Wrong binary operator

In [60]:
# The suggestion differs from the target only in the binary operator (* vs +).
target_code = """
BATBlock bb = cc + DD;
"""
suggested_code = """
BATBlock bb = cc * DD;
"""

targets = [target_code]
suggestions = [[suggested_code]]

compute_metrics(targets, suggestions)

+
[<Node type=identifier, start_point=(0, 14), end_point=(0, 16)>, <Node type="+", start_point=(0, 17), end_point=(0, 18)>, <Node type=identifier, start_point=(0, 19), end_point=(0, 21)>]
*
[<Node type=identifier, start_point=(0, 14), end_point=(0, 16)>, <Node type="*", start_point=(0, 17), end_point=(0, 18)>, <Node type=identifier, start_point=(0, 19), end_point=(0, 21)>]


{'bleu_ngram_match': 0.537284965911771,
 'weighted_ngram_match': 0.537284965911771,
 'syntax_match': 0.8333333333333334,
 'dataflow_match': 1.0}

# Wrong type

In [61]:
# The suggestion differs from the target only in the declared type (float vs BATBlock).
target_code = """
BATBlock bb = BATBlock.CreateEmptyBATBlock(bigBlockSize, false);
"""
suggested_code = """
float bb = BATBlock.CreateEmptyBATBlock(bigBlockSize, false);
"""

targets = [target_code]
suggestions = [[suggested_code]]

compute_metrics(targets, suggestions)

{'bleu_ngram_match': 0.668740304976422,
 'weighted_ngram_match': 0.5773502691896258,
 'syntax_match': 0.6666666666666666,
 'dataflow_match': 1.0}

# Different variable names but the same data flow

In [64]:
# The suggestion renames the intermediate variable (cc -> ii) but is otherwise identical.
target_code = """
float cc = 5;
float bb = cc - DD;
"""
suggested_code = """
float ii = 5;
float bb = ii - DD;
"""

targets = [target_code]
suggestions = [[suggested_code]]

compute_metrics(targets, suggestions)

{'bleu_ngram_match': 0.4671379777282001,
 'weighted_ngram_match': 0.47960593523654194,
 'syntax_match': 1.0,
 'dataflow_match': 1.0}