In [None]:
import bleu
import weighted_ngram_match
import syntax_match
import dataflow_match
import pandas as pd
import os

# Language passed to the syntax/dataflow matchers and used to pick the
# keyword file ('keywords/<lang>.txt').
lang = "python"

# Weights of the four score components, in order:
# n-gram match, weighted n-gram match, syntax match, dataflow match.
alpha, beta, gamma, theta = [0.1, 0.1, 0.4, 0.4]

# Only rows whose "Solution Language" equals this value are scored.
SOLUTION_LANGUAGE = "PYTHON3"

# We will take at most this many reference solutions due to time constraints.
MAX_REFERENCES = 15

# Keyword list for `lang`; loaded lazily the first time a row is scored so a
# run with nothing to score never touches the keyword file.
keywords = None
_keyword_lookup = None


def make_weights(reference_tokens, key_word_list):
    """Map every token to its weight: 1 for keywords, 0.2 otherwise.

    Args:
        reference_tokens: iterable of token strings from one reference.
        key_word_list: container of keyword strings (any container that
            supports ``in``; a set gives constant-time lookups).

    Returns:
        dict mapping each distinct token to 1 or 0.2.
    """
    return {token: 1 if token in key_word_list else 0.2
            for token in reference_tokens}


def _load_keywords(language):
    """Read 'keywords/<language>.txt' and return its stripped lines."""
    # The context manager closes the handle; the original leaked one open
    # file per scored row.
    with open('keywords/' + language + '.txt', 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle.readlines()]


def _references_by_description(frame):
    """Group the usable correct solutions of ``frame`` by problem description.

    A row is usable when "Correct Solution" is not null and
    "Solution Language" equals SOLUTION_LANGUAGE. Each group keeps file
    order and is truncated to MAX_REFERENCES entries.

    Returns:
        dict mapping a "Description" value to its list of reference sources.
    """
    usable = frame[frame["Correct Solution"].notnull()
                   & (frame["Solution Language"] == SOLUTION_LANGUAGE)]
    grouped = usable.groupby("Description", sort=False)["Correct Solution"]
    return {description: solutions.tolist()[:MAX_REFERENCES]
            for description, solutions in grouped}


def _code_bleu(candidate, reference_sources, keyword_lookup):
    """Score one candidate program against its reference programs.

    Args:
        candidate: source text of the solution being scored.
        reference_sources: non-empty list of reference source texts.
        keyword_lookup: container of language keywords for token weighting.

    Returns:
        The weighted sum of the four component scores using the module-level
        alpha/beta/gamma/theta weights.
    """
    # The matcher APIs are corpus-level: one hypothesis per instance and one
    # list of references per instance. Here the corpus is a single instance.
    hypothesis = [candidate]
    references = [list(reference_sources)]

    # calculate ngram match (BLEU)
    tokenized_hyps = [text.split() for text in hypothesis]
    tokenized_refs = [[text.split() for text in refs] for refs in references]
    ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps)

    # calculate weighted ngram match
    tokenized_refs_with_weights = [
        [[tokens, make_weights(tokens, keyword_lookup)] for tokens in refs]
        for refs in tokenized_refs
    ]
    weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(
        tokenized_refs_with_weights, tokenized_hyps)

    # calculate syntax match
    syntax_match_score = syntax_match.corpus_syntax_match(
        references, hypothesis, lang)

    # calculate dataflow match
    dataflow_match_score = dataflow_match.corpus_dataflow_match(
        references, hypothesis, lang)

    return (alpha * ngram_match_score
            + beta * weighted_ngram_match_score
            + gamma * syntax_match_score
            + theta * dataflow_match_score)


# Score every file under the dataset directory and write the result back in
# place as a new "Score" column.
for path, subdirs, files in os.walk("CodeContestsCleanCodeBLEU"):
    for filename in files:
        csv_path = os.path.join(path, filename)
        df = pd.read_csv(csv_path)

        # Built once per file instead of re-filtering the whole frame for
        # every row.
        references_for = _references_by_description(df)

        scores = []
        for _, row in df.iterrows():
            # Rows in other languages are not scored.
            if row["Solution Language"] != SOLUTION_LANGUAGE:
                scores.append("")
                continue

            # Rows without an incorrect solution get a score of 1.
            if pd.isnull(row["Incorrect Solution"]):
                scores.append(1)
                continue

            # We will exclude incorrect solutions to problems with no correct
            # solutions
            reference_code = references_for.get(row["Description"], [])
            if not reference_code:
                scores.append("")
                continue

            if keywords is None:
                keywords = _load_keywords(lang)
                # Set lookups keep make_weights constant-time per token.
                _keyword_lookup = frozenset(keywords)

            scores.append(
                _code_bleu(row["Incorrect Solution"], reference_code,
                           _keyword_lookup))

        df["Score"] = scores
        df.to_csv(csv_path, index=False)