In [1]:
from transformers import AutoTokenizer
from constants import *
import pandas as pd
import evaluate
import os

In [2]:
# Folder layout: everything lives under RESULTS_FOLDER, with one sub-folder
# each for the reference, accepted and rejected code.
RESULTS_FOLDER = "results"
REFERENCE_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "reference_code")
ACCEPTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "accepted_code")
REJECTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "rejected_code")

# Read the token from a local ".token" file (kept out of the notebook itself).
# The context manager closes the handle deterministically instead of leaving
# it to the garbage collector.
# NOTE(review): presumably a Hugging Face access token - confirm.
with open(".token", "r") as token_file:
    HF_TOKEN = token_file.read().strip()

# Text-similarity metrics, loaded once and reused for every comparison.
BLEU = evaluate.load("bleu")
METEOR = evaluate.load("meteor")
TER = evaluate.load("ter")
ROUGE = evaluate.load('rouge')
CHRF = evaluate.load("chrf")
WER = evaluate.load("wer")

[nltk_data] Downloading package wordnet to /Users/caman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/caman/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/caman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Load the compilation log as a list of raw lines (trailing newlines kept).
# The "with" block closes the file as soon as it has been read.
with open(os.path.join(RESULTS_FOLDER, "compilation_log.txt"), "r") as log_file:
    lines = log_file.readlines()

In [4]:
# Show the first ten log lines to eyeball the line format.
lines[:10]

["2024-10-15 23:13:07 - Compilation and syntax check failed: deepseekcoder7b-bit_ops-py-c.c: Command '['gcc', '/tmp/deepseekcoder7b-bit_ops-py-c.c', '-o', '/tmp/deepseekcoder7b-bit_ops-py-c.c.out']' returned non-zero exit status 1.\n",
 '2024-10-15 23:13:08 - Compilation successful: codegeex4-str_interp-c-java.java\n',
 '2024-10-15 23:13:09 - Compilation successful: yicoder2b-str_interp-cpp-go.go\n',
 '2024-10-15 23:13:11 - Compilation successful: codeqwen-int_arith-go-cpp.cpp\n',
 '2024-10-15 23:13:13 - Syntax check successful: magicoder7b-file_size-java-rs.rs\n',
 '2024-10-15 23:13:13 - Syntax check successful: codegemma7b-file_exists-c-py.py\n',
 '2024-10-15 23:13:13 - Compilation successful: codellama7b-int_factors-rs-c.c\n',
 "2024-10-15 23:13:13 - Compilation and syntax check failed: deepseekcoderv2-logic_ops-rs-c.c: Command '['gcc', '/tmp/deepseekcoderv2-logic_ops-rs-c.c', '-o', '/tmp/deepseekcoderv2-logic_ops-rs-c.c.out']' returned non-zero exit status 1.\n",
 "2024-10-15 23:13

In [5]:
def check_row_exists(df, m, t, fl, tl):
    """Tell whether ``df`` already holds a row for the given combination.

    Args:
        df: DataFrame with "model", "task", "from_lang" and "to_lang" columns.
        m: Value to look for in the "model" column.
        t: Value to look for in the "task" column.
        fl: Value to look for in the "from_lang" column.
        tl: Value to look for in the "to_lang" column.

    Returns:
        A truthy boolean when at least one row matches all four values,
        otherwise a falsy one (always falsy for an empty frame).
    """
    wanted = {"model": m, "task": t, "from_lang": fl, "to_lang": tl}
    row_mask = None
    for column, value in wanted.items():
        column_hit = df[column] == value
        # AND the per-column comparisons together, left to right.
        row_mask = column_hit if row_mask is None else row_mask & column_hit
    return row_mask.any()

In [6]:
# Per-column accumulators for the rows discovered during this run. They are
# turned into a DataFrame (see _new_rows_frame) and appended to `metrics`.
models = []
tasks = []
from_langs = []
to_langs = []
successes = []
valids = []
bleus = []
meteors = []
ters = []
rouge1s = []
rouge2s = []
rougeLs = []
rougeLsums = []
chrfs = []
wers = []

METRICS_CSV = os.path.join(RESULTS_FOLDER, "metrics.csv")


def _read_code(path):
    """Return the stripped text of the file at ``path``, closing it afterwards."""
    with open(path, "r") as handle:
        return handle.read().strip()


def _new_rows_frame():
    """Build a DataFrame from the current contents of the accumulator lists."""
    return pd.DataFrame(
        {
            "model": models,
            "task": tasks,
            "from_lang": from_langs,
            "to_lang": to_langs,
            "success": successes,
            "valid": valids,
            "bleu": bleus,
            "meteor": meteors,
            "rouge1": rouge1s,
            "rouge2": rouge2s,
            "rougeL": rougeLs,
            "rougeLsum": rougeLsums,
            "chrF": chrfs,
            "ter": ters,
            "wer": wers,
        }
    )


# Resume from an earlier run when a metrics file exists; otherwise start from
# an empty frame that has the expected columns.
if os.path.exists(METRICS_CSV):
    metrics = pd.read_csv(METRICS_CSV)
else:
    metrics = _new_rows_frame()

# Keys added during this run. check_row_exists only looks at the frame loaded
# above, so without this set a key repeated in the log would be added twice.
_seen_keys = set()

cnt = 0
for line in lines:
    if not line.strip():
        # A blank line (e.g. at the end of the log) carries no entry.
        continue
    valid = 1 if "successful" in line else 0
    # The "HH:MM:SS" timestamp supplies the first two colons and the status
    # message the third, so the file name is always colon-separated field 3.
    fields = [x.strip() for x in line.strip().split(":")]
    # File names look like "<model>-<task>-<from_lang>-<to_lang>.<ext>".
    name_parts = fields[3].split(".")[0].split("-")
    m = name_parts[0]
    t = name_parts[1]
    fl = name_parts[2]
    tl = name_parts[3]
    key = (m, t, fl, tl)
    if key in _seen_keys or check_row_exists(metrics, m, t, fl, tl):
        continue
    # tokenizer = AutoTokenizer.from_pretrained(HUGGINGFACE_TAGS[m], token=HF_TOKEN, trust_remote_code=True)
    ref_code = _read_code(os.path.join(REFERENCE_CODE_FOLDER, f"{t}.{tl}"))
    gen_code = _read_code(
        os.path.join(ACCEPTED_CODE_FOLDER, f"{m}-{t}-{fl}-{tl}.{tl}")
    )
    print("accepted", m, t, fl, tl, "start ... ", end="")
    # Compute every metric BEFORE touching the accumulators: if any call
    # raises, no list has been extended and they all stay the same length.
    bleu = BLEU.compute(predictions=[gen_code], references=[[ref_code]])
    meteor = METEOR.compute(predictions=[gen_code], references=[ref_code])
    ter = TER.compute(
        predictions=[gen_code], references=[[ref_code]], case_sensitive=True
    )
    rouge = ROUGE.compute(
        predictions=[gen_code], references=[[ref_code]], tokenizer=lambda x: x.split()
    )
    chrf = CHRF.compute(predictions=[gen_code], references=[[ref_code]])
    wer = WER.compute(predictions=[gen_code], references=[ref_code])
    print("done")

    models.append(m)
    tasks.append(t)
    from_langs.append(fl)
    to_langs.append(tl)
    successes.append(1)
    valids.append(valid)
    bleus.append(bleu["bleu"])
    meteors.append(meteor["meteor"])
    ters.append(ter["score"])
    rouge1s.append(rouge["rouge1"])
    rouge2s.append(rouge["rouge2"])
    rougeLs.append(rouge["rougeL"])
    rougeLsums.append(rouge["rougeLsum"])
    chrfs.append(chrf["score"])
    wers.append(wer)
    _seen_keys.add(key)
    cnt += 1

    if cnt % 10 == 0:
        # Checkpoint every ten new rows so an interrupted run can resume.
        pd.concat([metrics, _new_rows_frame()], ignore_index=True).to_csv(
            METRICS_CSV, index=False
        )

# Sorted so that the row order does not depend on the file system's listing order.
for filename in sorted(os.listdir(REJECTED_CODE_FOLDER)):
    if filename.startswith("."):
        continue
    name_parts = filename.split(".")[0].split("-")
    m = name_parts[0]
    t = name_parts[1]
    fl = name_parts[2]
    tl = name_parts[3]
    key = (m, t, fl, tl)
    if key in _seen_keys or check_row_exists(metrics, m, t, fl, tl):
        continue
    models.append(m)
    tasks.append(t)
    from_langs.append(fl)
    to_langs.append(tl)
    # Rejected outputs are recorded as unsuccessful with placeholder zeros
    # for every score.
    successes.append(0)
    valids.append(0)
    bleus.append(0)
    meteors.append(0)
    ters.append(0)
    rouge1s.append(0)
    rouge2s.append(0)
    rougeLs.append(0)
    rougeLsums.append(0)
    chrfs.append(0)
    wers.append(0)
    _seen_keys.add(key)
    print("rejected", m, t, fl, tl, "done")

# ignore_index gives the combined table a clean 0..n-1 index instead of
# repeating the labels of the two inputs.
metrics = pd.concat([metrics, _new_rows_frame()], ignore_index=True)

accepted deepseekcoder7b bit_ops py c start ... done
accepted codegeex4 str_interp c java start ... done
accepted yicoder2b str_interp cpp go start ... done
accepted codeqwen int_arith go cpp start ... done
accepted magicoder7b file_size java rs start ... done
accepted codegemma7b file_exists c py start ... done
accepted codellama7b int_factors rs c start ... done
accepted deepseekcoderv2 logic_ops rs c start ... done
accepted magicoder7b int_factors c cpp start ... done
accepted codellama7b int_cmp cpp rs start ... done
accepted granitecode3b file_size cpp js start ... done
accepted codegemma7b dir_make cpp c start ... done
accepted yicoder2b bit_ops js rs start ... done
accepted stablecode str_prepend js go start ... done
accepted granitecode3b str_match cpp c start ... done
accepted magicoder7b str_interp py c start ... done
accepted codeqwen logic_ops c cpp start ... done
accepted yicoder9b str_substring py c start ... done
accepted yicoder2b int_cmp c rs start ... done
accepted st

In [7]:
# Display the first five rows of the combined metrics table.
metrics.head()

Unnamed: 0,model,task,from_lang,to_lang,success,valid,bleu,meteor,rouge1,rouge2,rougeL,rougeLsum,chrF,ter,wer
0,deepseekcoder7b,bit_ops,py,c,1.0,0.0,0.130275,0.362417,0.152866,0.038462,0.101911,0.152866,27.217825,198.0,2.020202
1,codegeex4,str_interp,c,java,1.0,1.0,0.260155,0.687822,0.342857,0.181818,0.342857,0.342857,42.624201,130.769231,1.416667
2,yicoder2b,str_interp,cpp,go,1.0,1.0,0.292182,0.648074,0.478873,0.318841,0.478873,0.478873,47.962884,116.0,1.304348
3,codeqwen,int_arith,go,cpp,1.0,1.0,0.239201,0.679556,0.386364,0.206107,0.378788,0.378788,46.138403,214.705882,2.179104
4,magicoder7b,file_size,java,rs,1.0,1.0,0.157098,0.25704,0.214286,0.073171,0.214286,0.214286,28.293514,87.931034,0.886792


In [8]:
# Print the column/dtype summary, then write the full table to metrics.csv
# (without the index column).
_ = metrics.info()
metrics.to_csv(os.path.join(RESULTS_FOLDER, "metrics.csv"), index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9282 entries, 0 to 9281
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   model      9282 non-null   object 
 1   task       9282 non-null   object 
 2   from_lang  9282 non-null   object 
 3   to_lang    9282 non-null   object 
 4   success    9282 non-null   float64
 5   valid      9282 non-null   float64
 6   bleu       9282 non-null   float64
 7   meteor     9282 non-null   float64
 8   rouge1     9282 non-null   float64
 9   rouge2     9282 non-null   float64
 10  rougeL     9282 non-null   float64
 11  rougeLsum  9282 non-null   float64
 12  chrF       9282 non-null   float64
 13  ter        9282 non-null   float64
 14  wer        9282 non-null   float64
dtypes: float64(11), object(4)
memory usage: 1.1+ MB


In [9]:
# Per-model summary: success rate over all rows, plus the mean of every
# remaining column over the successful rows only.
col = "model"
id_columns = ["model", "task", "from_lang", "to_lang", "success"]
score_columns = [name for name in metrics.columns if name not in id_columns]
succeeded = metrics[metrics["success"] > 0]
m1_df = succeeded.groupby(col)[score_columns].mean().reset_index()
m2_df = metrics.groupby(col)["success"].mean().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# Order by success rate times mean validity; the helper column is dropped again.
m_df = (
    m_df.assign(rank=m_df["valid"] * m_df["success"])
    .sort_values(["rank"], ascending=False)
    .drop("rank", axis=1)
)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)
m_df

Unnamed: 0,model,success,valid,bleu,meteor,rouge1,rouge2,rougeL,rougeLsum,chrF,ter,wer
12,yicoder9b,1.0,0.866947,0.223868,0.435054,0.311906,0.150798,0.275394,0.309253,38.739161,218.587615,2.462086
0,codegeex4,1.0,0.841737,0.234469,0.441582,0.322318,0.162761,0.286929,0.320034,39.379258,220.404294,2.464054
9,magicoder7b,0.998599,0.781206,0.225907,0.432927,0.310678,0.151827,0.276961,0.308282,38.302735,206.600143,2.31429
3,codeqwen,1.0,0.754902,0.20951,0.436202,0.298902,0.134072,0.266471,0.296468,37.286248,207.46982,2.327207
4,deepseekcoder7b,0.980392,0.724286,0.221009,0.429473,0.303427,0.148951,0.269727,0.301161,38.55829,223.436358,2.512656
6,dolphincoder7b,0.990196,0.701556,0.212718,0.417652,0.301199,0.143359,0.269154,0.298563,36.822628,208.723302,2.320717
11,yicoder2b,0.991597,0.665254,0.192691,0.406645,0.283735,0.129536,0.250717,0.28124,35.32194,243.265737,2.702617
1,codegemma7b,1.0,0.656863,0.243224,0.434448,0.329321,0.16562,0.298748,0.327495,38.574027,162.596389,1.822218
2,codellama7b,0.970588,0.611833,0.215333,0.419963,0.308665,0.148436,0.278559,0.306514,36.832439,208.406585,2.34243
8,granitecode8b,0.998599,0.538569,0.210241,0.416304,0.31253,0.14853,0.2823,0.310525,36.483262,206.731569,2.322672


In [10]:
# Per-task summary: success rate over all rows, plus the mean of every
# remaining column over the successful rows only.
col = "task"
id_columns = ["model", "task", "from_lang", "to_lang", "success"]
score_columns = [name for name in metrics.columns if name not in id_columns]
succeeded = metrics[metrics["success"] > 0]
m1_df = succeeded.groupby(col)[score_columns].mean().reset_index()
m2_df = metrics.groupby(col)["success"].mean().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# Order by success rate times mean validity; the helper column is dropped again.
m_df = (
    m_df.assign(rank=m_df["valid"] * m_df["success"])
    .sort_values(["rank"], ascending=False)
    .drop("rank", axis=1)
)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)
m_df

Unnamed: 0,task,success,valid,bleu,meteor,rouge1,rouge2,rougeL,rougeLsum,chrF,ter,wer
10,str_append,0.987179,0.821892,0.199438,0.439443,0.356025,0.196198,0.350127,0.353778,33.860568,118.42757,1.295794
7,int_cmp,0.983516,0.776536,0.275115,0.490284,0.446959,0.25944,0.388768,0.445621,42.42352,80.444837,0.847721
12,str_concat,0.990842,0.763401,0.279441,0.498657,0.423192,0.267051,0.403531,0.422484,41.706496,123.141065,1.289946
9,logic_ops,0.989011,0.746296,0.41043,0.658449,0.475428,0.288963,0.464148,0.474328,50.660842,95.345417,0.970324
13,str_interp,0.987179,0.74397,0.232533,0.502801,0.373045,0.204816,0.334063,0.368533,39.007435,133.444204,1.450887
15,str_prepend,0.985348,0.719331,0.144869,0.407348,0.312302,0.128368,0.304415,0.311644,26.604372,153.224296,1.685766
6,int_arith,0.957875,0.717017,0.248728,0.415308,0.336585,0.150587,0.29027,0.329955,38.423281,127.302119,1.321682
5,file_size,0.983516,0.646182,0.220027,0.455854,0.270298,0.118525,0.246261,0.265489,40.712398,263.983224,3.041595
4,file_rename,0.983516,0.620112,0.295953,0.517925,0.22314,0.105164,0.21401,0.222505,48.743157,348.901532,3.846216
2,file_create,0.990842,0.602588,0.210616,0.409765,0.253615,0.11447,0.233715,0.253128,39.194178,191.5013,2.068454


In [11]:
# Per-source-language summary: success rate over all rows, plus the mean of
# every remaining column over the successful rows only.
col = "from_lang"
id_columns = ["model", "task", "from_lang", "to_lang", "success"]
score_columns = [name for name in metrics.columns if name not in id_columns]
succeeded = metrics[metrics["success"] > 0]
m1_df = succeeded.groupby(col)[score_columns].mean().reset_index()
m2_df = metrics.groupby(col)["success"].mean().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# Order by success rate times mean validity; the helper column is dropped again.
m_df = (
    m_df.assign(rank=m_df["valid"] * m_df["success"])
    .sort_values(["rank"], ascending=False)
    .drop("rank", axis=1)
)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)
m_df

Unnamed: 0,from_lang,success,valid,bleu,meteor,rouge1,rouge2,rougeL,rougeLsum,chrF,ter,wer
1,cpp,0.984917,0.720521,0.227823,0.433075,0.309594,0.154437,0.278159,0.307517,38.080204,221.000836,2.487923
0,c,0.971342,0.714286,0.208423,0.42283,0.295589,0.14237,0.264153,0.29375,37.213368,250.037757,2.830993
2,go,0.972851,0.689147,0.216506,0.43313,0.30793,0.14849,0.275233,0.305122,37.891185,186.745349,2.127374
6,rs,0.9819,0.65361,0.202328,0.405724,0.294465,0.139418,0.262953,0.292871,36.602273,210.97326,2.399558
3,java,0.982655,0.639294,0.203712,0.404514,0.301629,0.137063,0.269345,0.298803,35.197834,186.074165,2.033364
5,py,0.978884,0.585516,0.218384,0.423622,0.312428,0.145041,0.277657,0.309664,37.066486,209.734857,2.314078
4,js,0.982655,0.57406,0.229242,0.431747,0.314966,0.156847,0.281508,0.312613,38.150575,222.431416,2.484335


In [12]:
# Per-target-language summary: success rate over all rows, plus the mean of
# every remaining column over the successful rows only.
col = "to_lang"
id_columns = ["model", "task", "from_lang", "to_lang", "success"]
score_columns = [name for name in metrics.columns if name not in id_columns]
succeeded = metrics[metrics["success"] > 0]
m1_df = succeeded.groupby(col)[score_columns].mean().reset_index()
m2_df = metrics.groupby(col)["success"].mean().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# Order by success rate times mean validity; the helper column is dropped again.
m_df = (
    m_df.assign(rank=m_df["valid"] * m_df["success"])
    .sort_values(["rank"], ascending=False)
    .drop("rank", axis=1)
)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)
m_df

Unnamed: 0,to_lang,success,valid,bleu,meteor,rouge1,rouge2,rougeL,rougeLsum,chrF,ter,wer
4,js,0.975113,0.898685,0.156153,0.37731,0.228731,0.087631,0.202344,0.227357,28.984546,218.567293,2.586429
5,py,0.984917,0.867534,0.144128,0.368417,0.229745,0.086449,0.20532,0.227595,32.48323,232.368575,2.851833
3,java,0.978884,0.697997,0.243718,0.455919,0.322688,0.173338,0.296772,0.321602,43.426323,296.738773,3.545309
6,rs,0.978884,0.671032,0.227672,0.428829,0.314457,0.132978,0.278444,0.311622,36.011025,150.523592,1.608785
2,go,0.987934,0.50687,0.216933,0.416319,0.33228,0.168139,0.298013,0.331269,38.870701,288.54067,2.883884
0,c,0.977376,0.469907,0.247358,0.433455,0.317286,0.159998,0.277203,0.314799,38.919148,145.464924,1.540031
1,cpp,0.972097,0.46315,0.271181,0.474902,0.392029,0.215627,0.351449,0.386682,41.516679,153.331873,1.646073


In [13]:
# Per (model, task) summary: success rate over all rows, plus the mean of
# every remaining column over the successful rows only.
col = ["model", "task"]
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[[x for x in metrics.columns if x not in ["model", "task", "from_lang", "to_lang", "success"]]]
    .mean()
    .reset_index()
)
m2_df = (
    metrics.groupby(col)
    .success.mean()
    .reset_index()
)
m_df = m2_df.merge(m1_df, how="left", on=col)
# Order by success rate times mean validity; the helper column is dropped again.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False)
m_df = m_df.drop("rank", axis=1)
# Build the file stem outside the f-string: reusing the enclosing double
# quote inside a replacement field is a SyntaxError before Python 3.12.
file_stem = "_".join(col)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{file_stem}_metrics.csv"), index=False)
m_df

Unnamed: 0,model,task,success,valid,bleu,meteor,rouge1,rouge2,rougeL,rougeLsum,chrF,ter,wer
213,yicoder9b,logic_ops,1.00000,1.000000,0.442493,0.673301,0.504908,0.332264,0.491952,0.503774,53.928554,97.411407,0.996248
214,yicoder9b,str_append,1.00000,1.000000,0.248547,0.474624,0.406289,0.247218,0.399316,0.403310,37.600978,103.157334,1.136932
61,codeqwen,str_append,1.00000,0.976190,0.208495,0.459449,0.375608,0.220934,0.370839,0.373210,35.017510,119.610296,1.321707
211,yicoder9b,int_cmp,1.00000,0.976190,0.263434,0.490502,0.447406,0.260802,0.381363,0.445771,42.527962,85.272332,0.901623
7,codegeex4,int_cmp,1.00000,0.976190,0.295002,0.520859,0.459643,0.272816,0.396834,0.457730,44.191166,81.466588,0.855530
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,stablecode,file_exists,1.00000,0.047619,0.181242,0.401812,0.194493,0.094395,0.168766,0.193552,36.367371,740.279077,10.597922
170,stablecode,bit_ops,0.97619,0.048780,0.225271,0.384865,0.270239,0.116365,0.224164,0.265450,31.810983,144.984246,1.477037
178,stablecode,int_factors,1.00000,0.047619,0.149513,0.312972,0.300082,0.100890,0.239409,0.298392,32.902936,136.197330,1.419097
184,stablecode,str_match,1.00000,0.047619,0.103582,0.265567,0.178524,0.044528,0.135386,0.176534,27.840106,161.421559,1.851936


In [14]:
# Per (from_lang, to_lang) summary: success rate over all rows, plus the mean
# of every remaining column over the successful rows only.
col = ["from_lang", "to_lang"]
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[[x for x in metrics.columns if x not in ["model", "task", "from_lang", "to_lang", "success"]]]
    .mean()
    .reset_index()
)
m2_df = (
    metrics.groupby(col)
    .success.mean()
    .reset_index()
)
m_df = m2_df.merge(m1_df, how="left", on=col)
# Order by success rate times mean validity; the helper column is dropped again.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False)
m_df = m_df.drop("rank", axis=1)
# Build the file stem outside the f-string: reusing the enclosing double
# quote inside a replacement field is a SyntaxError before Python 3.12.
file_stem = "_".join(col)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{file_stem}_metrics.csv"), index=False)
m_df

Unnamed: 0,from_lang,to_lang,success,valid,bleu,meteor,rouge1,rouge2,rougeL,rougeLsum,chrF,ter,wer
21,java,js,0.986425,0.908257,0.15658,0.359789,0.256794,0.106855,0.230314,0.25529,30.05566,202.997408,2.416978
9,cpp,js,0.977376,0.916667,0.177515,0.415809,0.237381,0.097034,0.216237,0.23518,31.615555,192.112533,2.285182
34,py,js,0.9819,0.889401,0.152798,0.381038,0.232029,0.083596,0.201983,0.231337,27.095766,227.554964,2.697936
10,cpp,py,0.99095,0.881279,0.151102,0.377062,0.232731,0.085739,0.200747,0.231169,33.027364,229.253918,2.810157
41,rs,py,0.99095,0.881279,0.115956,0.322971,0.210327,0.068688,0.188073,0.209188,29.426013,234.342706,2.907982
40,rs,js,0.972851,0.893023,0.149716,0.358994,0.20682,0.072519,0.18097,0.205889,28.453009,230.319733,2.782189
15,go,js,0.963801,0.901408,0.161129,0.397655,0.236578,0.09065,0.210047,0.23517,29.70675,189.037974,2.129372
3,c,js,0.968326,0.883178,0.139071,0.350671,0.202271,0.074827,0.174002,0.200763,26.968263,269.60051,3.208289
22,java,py,0.977376,0.875,0.139528,0.364674,0.229716,0.079519,0.212972,0.22634,31.478532,223.180011,2.626098
28,js,py,0.986425,0.862385,0.164392,0.397793,0.238914,0.098264,0.214581,0.237649,33.533558,238.918323,2.993423
