In [1]:
from transformers import AutoTokenizer
from constants import *
import pandas as pd
import evaluate
import os

In [2]:
# Locations under the results folder that the cells below read from and write to.
RESULTS_FOLDER = "results"
REFERENCE_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "reference_code")
ACCEPTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "accepted_code")
REJECTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "rejected_code")

# Access token passed to AutoTokenizer.from_pretrained further down. It is read
# from a local ".token" file inside a context manager so the handle is closed
# immediately instead of lingering until garbage collection.
with open(".token", "r") as token_file:
    HF_TOKEN = token_file.read().strip()

# BLEU metric object used to score generated code against the reference code.
BLEU = evaluate.load("bleu")

In [3]:
# Load every line of the compilation log; the context manager closes the file
# handle as soon as the lines have been read.
with open(os.path.join(RESULTS_FOLDER, "compilation_log.txt"), "r") as log_file:
    lines = log_file.readlines()

In [4]:
# Peek at the first ten log lines to confirm the format that the parsing loop
# further down relies on ("<timestamp> - <status>: <generated file>[: <error>]").
lines[:10]

["2024-09-06 16:55:50 - Compilation and syntax check failed: deepseekcoder7b-bit_ops-py-c.c: Command '['gcc', '/tmp/deepseekcoder7b-bit_ops-py-c.c', '-o', '/tmp/deepseekcoder7b-bit_ops-py-c.c.out']' returned non-zero exit status 1.\n",
 '2024-09-06 16:55:50 - Compilation successful: codellama7b-int_factors-rs-c.c\n',
 "2024-09-06 16:55:51 - Compilation and syntax check failed: magicoder7b-int_factors-c-cpp.cpp: Command '['g++', '/tmp/magicoder7b-int_factors-c-cpp.cpp', '-o', '/tmp/magicoder7b-int_factors-c-cpp.cpp.out']' returned non-zero exit status 1.\n",
 '2024-09-06 16:55:51 - Compilation successful: codellama7b-int_cmp-cpp-rs.rs\n',
 '2024-09-06 16:55:51 - Syntax check successful: codegeex4-bit_ops-py-js.js\n',
 '2024-09-06 16:55:51 - Syntax check successful: deepseekcoder7b-bit_ops-c-js.js\n',
 "2024-09-06 16:55:52 - Compilation and syntax check failed: granitecode3b-int_factors-java-cpp.cpp: Command '['g++', '/tmp/granitecode3b-int_factors-java-cpp.cpp', '-o', '/tmp/granitecode3

In [5]:
def check_row_exists(df, m, t, fl, tl):
    """Tell whether `df` already contains a row for the given combination.

    Args:
        df: DataFrame with "model", "task", "from_lang" and "to_lang" columns.
        m: value to match against the "model" column.
        t: value to match against the "task" column.
        fl: value to match against the "from_lang" column.
        tl: value to match against the "to_lang" column.

    Returns:
        A truthy value when at least one row matches all four values at once,
        a falsy value otherwise (including when `df` has no rows).
    """
    same_model = df["model"] == m
    same_task = df["task"] == t
    same_source = df["from_lang"] == fl
    same_target = df["to_lang"] == tl
    # A row counts only if every one of the four columns matches.
    all_match = same_model & same_task & same_source & same_target
    return all_match.any()

In [6]:
# Build the per-translation metrics table, resuming from results/metrics.csv
# when a previous run left one behind.
#
# Rows collected during THIS run are accumulated in the parallel lists below
# (one entry per new row) and merged into `metrics` at the end of the cell.
models = []
tasks = []
from_langs = []
to_langs = []
successes = []
valids = []
bleus = []

METRICS_PATH = os.path.join(RESULTS_FOLDER, "metrics.csv")


def _new_rows():
    """Return the rows collected so far in this run as a DataFrame."""
    return pd.DataFrame(
        {
            "model": models,
            "task": tasks,
            "from_lang": from_langs,
            "to_lang": to_langs,
            "success": successes,
            "valid": valids,
            "bleu": bleus,
        }
    )


def _read_text(path):
    """Read a text file, strip surrounding whitespace, and close the handle."""
    with open(path, "r") as handle:
        return handle.read().strip()


# Rows already computed by an earlier run are skipped below via
# check_row_exists; without a saved file we start from an empty frame that has
# the same columns.
if os.path.exists(METRICS_PATH):
    metrics = pd.read_csv(METRICS_PATH)
else:
    metrics = _new_rows()

# Tokenizers are cached per model name: the log interleaves models, so loading
# one tokenizer per log line would repeat the same expensive load many times.
tokenizers = {}

cnt = 0
for line in lines:
    if not line.strip():
        continue  # tolerate blank lines instead of failing on the index below
    valid = 1 if "successful" in line else 0
    # The timestamp contributes the first two ":" separators, so the generated
    # file name is the fourth ":"-separated field of the log line.
    generated_file = line.strip().split(":")[3].strip()
    # File stem layout: <model>-<task>-<from_lang>-<to_lang>
    m, t, fl, tl = generated_file.split(".")[0].split("-")[:4]
    if check_row_exists(metrics, m, t, fl, tl):
        continue
    if m not in tokenizers:
        # HUGGINGFACE_TAGS is not defined in this notebook; presumably it comes
        # from `from constants import *` -- confirm.
        tokenizers[m] = AutoTokenizer.from_pretrained(
            HUGGINGFACE_TAGS[m], token=HF_TOKEN, trust_remote_code=True
        )
    tokenizer = tokenizers[m]
    ref_code = _read_text(os.path.join(REFERENCE_CODE_FOLDER, f"{t}.{tl}"))
    gen_code = _read_text(
        os.path.join(ACCEPTED_CODE_FOLDER, f"{m}-{t}-{fl}-{tl}.{tl}")
    )
    print("accepted", m, t, fl, tl, "start")
    bleu = BLEU.compute(
        predictions=[gen_code], references=[[ref_code]], tokenizer=tokenizer.tokenize
    )
    # Record the row only once every value is known, so the parallel lists can
    # never end up with different lengths if a step above raises.
    cnt += 1
    models.append(m)
    tasks.append(t)
    from_langs.append(fl)
    to_langs.append(tl)
    valids.append(valid)
    bleus.append(bleu["bleu"])
    successes.append(1)
    print("accepted", m, t, fl, tl, "done")
    if cnt % 10 == 0:
        # Checkpoint every 10 new rows. The previously loaded rows are written
        # together with the new ones; saving only the new rows would overwrite
        # metrics.csv and discard the results of earlier runs.
        pd.concat([metrics, _new_rows()], ignore_index=True).to_csv(
            METRICS_PATH, index=False
        )

# Every file in the rejected folder becomes an all-zero row. The listing is
# sorted so the row order does not depend on the file system.
for filename in sorted(os.listdir(REJECTED_CODE_FOLDER)):
    if filename.startswith("."):
        continue  # skip hidden files
    m, t, fl, tl = filename.split(".")[0].split("-")[:4]
    if check_row_exists(metrics, m, t, fl, tl):
        continue
    models.append(m)
    tasks.append(t)
    from_langs.append(fl)
    to_langs.append(tl)
    successes.append(0)
    valids.append(0)
    bleus.append(0)
    print("rejected", m, t, fl, tl, "done")

# ignore_index gives the combined table a clean 0..n-1 index instead of
# repeating the index labels of the two inputs.
metrics = pd.concat([metrics, _new_rows()], ignore_index=True)

In [7]:
# Preview the first rows of the combined metrics table.
metrics.head()

Unnamed: 0,model,task,from_lang,to_lang,success,valid,bleu
0,deepseekcoder7b,bit_ops,py,c,1.0,0.0,0.082299
1,codellama7b,int_factors,rs,c,1.0,1.0,0.004715
2,magicoder7b,int_factors,c,cpp,1.0,0.0,0.28329
3,codellama7b,int_cmp,cpp,rs,1.0,1.0,0.407748
4,codegeex4,bit_ops,py,js,1.0,1.0,0.012696


In [8]:
# Print a column/dtype summary, then persist the combined table so later runs
# can resume from it. info() prints its report itself, and a non-final
# statement is never echoed, so nothing needs to be assigned to `_` (which
# would also rebind IPython's last-output variable).
metrics.info()
metrics.to_csv(os.path.join(RESULTS_FOLDER, "metrics.csv"), index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8754 entries, 0 to 8753
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   model      8754 non-null   object 
 1   task       8754 non-null   object 
 2   from_lang  8754 non-null   object 
 3   to_lang    8754 non-null   object 
 4   success    8754 non-null   float64
 5   valid      8754 non-null   float64
 6   bleu       8754 non-null   float64
dtypes: float64(3), object(4)
memory usage: 478.9+ KB


In [9]:
# per model metrics
col = "model"

# Validity and BLEU are averaged over rows with success > 0 only.
m1_df = (
    metrics.loc[metrics["success"] > 0]
    .groupby(col)
    .agg(mean_valid=("valid", "mean"), mean_bleu=("bleu", "mean"))
    .reset_index()
)
# The success rate is averaged over every row of the group.
m2_df = metrics.groupby(col).agg(mean_success=("success", "mean")).reset_index()

# Order groups by mean_valid * mean_success (ties broken by mean_bleu), best
# first; the helper column is dropped again before reporting.
m_df = (
    pd.merge(m1_df, m2_df, how="left", on=col)
    .assign(rank=lambda frame: frame["mean_valid"] * frame["mean_success"])
    .sort_values(["rank", "mean_bleu"], ascending=False)
    .drop(columns="rank")
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

              model  mean_valid  mean_bleu  mean_success
12        yicoder9b    0.866947   0.220156      1.000000
0         codegeex4    0.841737   0.153945      1.000000
9       magicoder7b    0.781206   0.199551      0.998599
3          codeqwen    0.754902   0.182459      1.000000
4   deepseekcoder7b    0.721212   0.190979      0.982143
6    dolphincoder7b    0.701556   0.168724      0.990196
11        yicoder2b    0.663380   0.187130      0.990237
1       codegemma7b    0.656863   0.202072      1.000000
2       codellama7b    0.611833   0.189402      0.970588
8     granitecode8b    0.538569   0.162156      0.998599
5   deepseekcoderv2    0.607744   0.154792      0.815934
7     granitecode3b    0.493653   0.166008      0.991608
10       stablecode    0.230661   0.166756      0.995798


In [10]:
# per task metrics
col = "task"

# Validity and BLEU are averaged over rows with success > 0 only.
m1_df = (
    metrics.loc[metrics["success"] > 0]
    .groupby(col)
    .agg(mean_valid=("valid", "mean"), mean_bleu=("bleu", "mean"))
    .reset_index()
)
# The success rate is averaged over every row of the group.
m2_df = metrics.groupby(col).agg(mean_success=("success", "mean")).reset_index()

# Order groups by mean_valid * mean_success (ties broken by mean_bleu), best
# first; the helper column is dropped again before reporting.
m_df = (
    pd.merge(m1_df, m2_df, how="left", on=col)
    .assign(rank=lambda frame: frame["mean_valid"] * frame["mean_success"])
    .sort_values(["rank", "mean_bleu"], ascending=False)
    .drop(columns="rank")
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

             task  mean_valid  mean_bleu  mean_success
10     str_append    0.804781   0.200246      0.986248
7         int_cmp    0.776536   0.262073      0.983516
12     str_concat    0.755511   0.265137      0.990079
13     str_interp    0.744467   0.217697      0.986111
9       logic_ops    0.732932   0.324935      0.988095
15    str_prepend    0.702213   0.101771      0.982213
6       int_arith    0.717017   0.192141      0.956124
5       file_size    0.635815   0.174800      0.986111
4     file_rename    0.614000   0.231122      0.990099
3     file_exists    0.606426   0.143890      0.988095
2     file_create    0.598802   0.171282      0.992079
14      str_match    0.583162   0.091603      0.964356
1        dir_make    0.570565   0.155899      0.982178
8     int_factors    0.567164   0.138560      0.981685
0         bit_ops    0.546139   0.158474      0.972527
11        str_cmp    0.540541   0.090310      0.954365
16  str_substring    0.511202   0.131050      0.962745


In [11]:
# per from_lang metrics
col = "from_lang"

# Validity and BLEU are averaged over rows with success > 0 only.
m1_df = (
    metrics.loc[metrics["success"] > 0]
    .groupby(col)
    .agg(mean_valid=("valid", "mean"), mean_bleu=("bleu", "mean"))
    .reset_index()
)
# The success rate is averaged over every row of the group.
m2_df = metrics.groupby(col).agg(mean_success=("success", "mean")).reset_index()

# Order groups by mean_valid * mean_success (ties broken by mean_bleu), best
# first; the helper column is dropped again before reporting.
m_df = (
    pd.merge(m1_df, m2_df, how="left", on=col)
    .assign(rank=lambda frame: frame["mean_valid"] * frame["mean_success"])
    .sort_values(["rank", "mean_bleu"], ascending=False)
    .drop(columns="rank")
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

  from_lang  mean_valid  mean_bleu  mean_success
1       cpp    0.716029   0.190982      0.984776
0         c    0.715581   0.172424      0.971955
2        go    0.688578   0.181087      0.974380
6        rs    0.645240   0.171472      0.979283
3      java    0.627737   0.172558      0.984038
5        py    0.575634   0.182346      0.977618
4        js    0.568052   0.189018      0.981600


In [12]:
# per to_lang metrics
col = "to_lang"

# Validity and BLEU are averaged over rows with success > 0 only.
m1_df = (
    metrics.loc[metrics["success"] > 0]
    .groupby(col)
    .agg(mean_valid=("valid", "mean"), mean_bleu=("bleu", "mean"))
    .reset_index()
)
# The success rate is averaged over every row of the group.
m2_df = metrics.groupby(col).agg(mean_success=("success", "mean")).reset_index()

# Order groups by mean_valid * mean_success (ties broken by mean_bleu), best
# first; the helper column is dropped again before reporting.
m_df = (
    pd.merge(m1_df, m2_df, how="left", on=col)
    .assign(rank=lambda frame: frame["mean_valid"] * frame["mean_success"])
    .sort_values(["rank", "mean_bleu"], ascending=False)
    .drop(columns="rank")
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

  to_lang  mean_valid  mean_bleu  mean_success
4      js    0.895731   0.112481      0.975180
5      py    0.860976   0.118030      0.983213
3    java    0.698608   0.193464      0.977582
6      rs    0.666939   0.167987      0.978383
2      go    0.493917   0.213701      0.987190
1     cpp    0.461979   0.236809      0.973726
0       c    0.459150   0.217237      0.978417


In [13]:
# per model and task specific metrics
col = ["model", "task"]

# Validity and BLEU are averaged over rows with success > 0 only.
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[["valid", "bleu"]]
    .mean()
    .reset_index()
    .rename(columns={"valid": "mean_valid", "bleu": "mean_bleu"})
)
# The success rate is averaged over every row of the group.
m2_df = (
    metrics.groupby(col)
    .success.mean()
    .reset_index()
    .rename(columns={"success": "mean_success"})
)
m_df = m1_df.merge(m2_df, how="left", on=col)
# Temporary sort key: groups are ordered by mean_valid * mean_success, with
# mean_bleu as the tie-breaker, best first.
m_df["rank"] = m_df["mean_valid"] * m_df["mean_success"]
m_df = m_df.sort_values(["rank", "mean_bleu"], ascending=False)
m_df = m_df.drop("rank", axis=1)
print(m_df.head(10))
# The file name is built by plain concatenation: reusing double quotes inside
# an f-string replacement field is a SyntaxError on Python versions before 3.12.
m_df.to_csv(
    os.path.join(RESULTS_FOLDER, "_".join(col) + "_metrics.csv"), index=False
)

           model        task  mean_valid  mean_bleu  mean_success
200    yicoder9b   logic_ops    1.000000   0.385792           1.0
201    yicoder9b  str_append    1.000000   0.284128           1.0
203    yicoder9b  str_concat    0.976190   0.313491           1.0
198    yicoder9b     int_cmp    0.976190   0.285022           1.0
7      codegeex4     int_cmp    0.976190   0.232491           1.0
61      codeqwen  str_append    0.976190   0.219866           1.0
149  magicoder7b   logic_ops    0.952381   0.369936           1.0
9      codegeex4   logic_ops    0.952381   0.331193           1.0
24   codegemma7b     int_cmp    0.952381   0.299689           1.0
63      codeqwen  str_concat    0.952381   0.293744           1.0


In [14]:
# per from_lang and to_lang specific metrics
col = ["from_lang", "to_lang"]

# Validity and BLEU are averaged over rows with success > 0 only.
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[["valid", "bleu"]]
    .mean()
    .reset_index()
    .rename(columns={"valid": "mean_valid", "bleu": "mean_bleu"})
)
# The success rate is averaged over every row of the group.
m2_df = (
    metrics.groupby(col)
    .success.mean()
    .reset_index()
    .rename(columns={"success": "mean_success"})
)
m_df = m1_df.merge(m2_df, how="left", on=col)
# Temporary sort key: groups are ordered by mean_valid * mean_success, with
# mean_bleu as the tie-breaker, best first.
m_df["rank"] = m_df["mean_valid"] * m_df["mean_success"]
m_df = m_df.sort_values(["rank", "mean_bleu"], ascending=False)
m_df = m_df.drop("rank", axis=1)
print(m_df.head(10))
# The file name is built by plain concatenation: reusing double quotes inside
# an f-string replacement field is a SyntaxError on Python versions before 3.12.
m_df.to_csv(
    os.path.join(RESULTS_FOLDER, "_".join(col) + "_metrics.csv"), index=False
)

   from_lang to_lang  mean_valid  mean_bleu  mean_success
9        cpp      js    0.911765   0.125750      0.980769
21      java      js    0.902439   0.123055      0.985577
15        go      js    0.905473   0.113780      0.966346
10       cpp      py    0.878641   0.125636      0.990385
34        py      js    0.882353   0.114234      0.980769
40        rs      js    0.886700   0.096808      0.971292
41        rs      py    0.873786   0.090171      0.985646
3          c      js    0.885572   0.100980      0.966346
28        js      py    0.858537   0.137138      0.985577
22      java      py    0.858537   0.112728      0.976190
