In [1]:
from transformers import AutoTokenizer
from constants import *
import pandas as pd
import evaluate
import os

In [2]:
# Notebook configuration: result locations, Hugging Face token and BLEU metric.
RESULTS_FOLDER = "results"
ACCEPTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "accepted_code")
REJECTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "rejected_code")

# The token is read from a local ".token" file rather than hard-coded; the
# context manager closes the handle instead of leaving it to the garbage collector.
with open(".token", "r") as token_file:
    HF_TOKEN = token_file.read().strip()

# Loaded once and reused for every BLEU computation in the notebook.
BLEU = evaluate.load("bleu")

In [3]:
# Read the whole compilation log, one entry per list element (newlines kept).
# The context manager closes the file handle once the log is in memory.
with open(os.path.join(RESULTS_FOLDER, "compilation_log.txt"), "r") as log_file:
    lines = log_file.readlines()

In [4]:
# Peek at the first ten log entries to check the line format that the
# parsing code further down splits on (":" then "." then "-").
lines[:10]

["2024-09-06 16:55:50 - Compilation and syntax check failed: deepseekcoder7b-bit_ops-py-c.c: Command '['gcc', '/tmp/deepseekcoder7b-bit_ops-py-c.c', '-o', '/tmp/deepseekcoder7b-bit_ops-py-c.c.out']' returned non-zero exit status 1.\n",
 '2024-09-06 16:55:50 - Compilation successful: codellama7b-int_factors-rs-c.c\n',
 "2024-09-06 16:55:51 - Compilation and syntax check failed: magicoder7b-int_factors-c-cpp.cpp: Command '['g++', '/tmp/magicoder7b-int_factors-c-cpp.cpp', '-o', '/tmp/magicoder7b-int_factors-c-cpp.cpp.out']' returned non-zero exit status 1.\n",
 '2024-09-06 16:55:51 - Compilation successful: codellama7b-int_cmp-cpp-rs.rs\n',
 '2024-09-06 16:55:51 - Syntax check successful: codegeex4-bit_ops-py-js.js\n',
 '2024-09-06 16:55:51 - Syntax check successful: deepseekcoder7b-bit_ops-c-js.js\n',
 "2024-09-06 16:55:52 - Compilation and syntax check failed: granitecode3b-int_factors-java-cpp.cpp: Command '['g++', '/tmp/granitecode3b-int_factors-java-cpp.cpp', '-o', '/tmp/granitecode3

In [5]:
def check_row_exists(df, m, t, fl, tl):
    """Tell whether ``df`` already contains a row for the given translation.

    Args:
        df: DataFrame with "model", "task", "from_lang" and "to_lang" columns.
        m: Value to match in the "model" column.
        t: Value to match in the "task" column.
        fl: Value to match in the "from_lang" column.
        tl: Value to match in the "to_lang" column.

    Returns:
        A truthy value when at least one row matches all four values at once,
        a falsy value otherwise (including when ``df`` has no rows).
    """
    same_model = df["model"] == m
    same_task = df["task"] == t
    same_source = df["from_lang"] == fl
    same_target = df["to_lang"] == tl
    all_match = same_model & same_task & same_source & same_target
    return all_match.any()

In [6]:
# Build the per-translation metrics table.
#
# One row per (model, task, from_lang, to_lang) translation:
#   success - 1 for every entry of the compilation log (its code is read from
#             ACCEPTED_CODE_FOLDER), 0 for every file in REJECTED_CODE_FOLDER
#   valid   - 1 when the log entry contains "successful", otherwise 0
#   bleu    - BLEU of the generated code against REFERENCE_CODE, tokenized with
#             the model's own tokenizer; 0 for rejected files
#
# Rows already present in results/metrics.csv are kept as they are; only
# translations that are not recorded yet are computed and appended.
METRIC_COLUMNS = ["model", "task", "from_lang", "to_lang", "success", "valid", "bleu"]
metrics_path = os.path.join(RESULTS_FOLDER, "metrics.csv")

if os.path.exists(metrics_path):
    metrics = pd.read_csv(metrics_path)
else:
    metrics = pd.DataFrame(columns=METRIC_COLUMNS)

new_rows = []  # one dict per new translation; turned into a frame once at the end
seen_keys = set()  # translations added during this run
tokenizers = {}  # model name -> tokenizer, so each tokenizer is loaded only once


def _is_new(key):
    """Return True the first time ``key`` is seen and is not yet in ``metrics``.

    ``key`` is a (model, task, from_lang, to_lang) tuple. Besides the saved
    table, keys added earlier in this run are also checked, so a translation
    that appears twice (e.g. repeated log entries) yields a single row.
    """
    if key in seen_keys or check_row_exists(metrics, *key):
        return False
    seen_keys.add(key)
    return True


for entry in lines:
    if not entry.strip():
        # Blank lines carry no file name; skip them instead of failing below.
        continue
    valid = int("successful" in entry)
    # "<date> HH:MM:SS - <message>: <file name>[: <error>]" -> the file name is
    # the fourth ":"-separated field because the timestamp contains two colons.
    file_name = [part.strip() for part in entry.strip().split(":")][3]
    key = tuple(file_name.split(".")[0].split("-")[:4])
    m, t, fl, tl = key
    if not _is_new(key):
        continue
    if m not in tokenizers:
        tokenizers[m] = AutoTokenizer.from_pretrained(
            HUGGINGFACE_TAGS[m], token=HF_TOKEN, trust_remote_code=True
        )
    ref_code = REFERENCE_CODE[tl.upper()][t].strip()
    code_path = os.path.join(ACCEPTED_CODE_FOLDER, f"{m}-{t}-{fl}-{tl}.{tl}")
    with open(code_path, "r") as code_file:
        gen_code = code_file.read().strip()
    bleu = BLEU.compute(
        predictions=[gen_code],
        references=[[ref_code]],
        tokenizer=tokenizers[m].tokenize,
    )
    new_rows.append(
        {
            "model": m,
            "task": t,
            "from_lang": fl,
            "to_lang": tl,
            "success": 1,
            "valid": valid,
            "bleu": bleu["bleu"],
        }
    )

# sorted() makes the row order independent of the file system's listing order.
for filename in sorted(os.listdir(REJECTED_CODE_FOLDER)):
    if filename.startswith("."):
        continue
    key = tuple(filename.split(".")[0].split("-")[:4])
    m, t, fl, tl = key
    if not _is_new(key):
        continue
    new_rows.append(
        {
            "model": m,
            "task": t,
            "from_lang": fl,
            "to_lang": tl,
            "success": 0,
            "valid": 0,
            "bleu": 0,
        }
    )

new_metrics = pd.DataFrame(new_rows, columns=METRIC_COLUMNS)
# Empty frames are left out of the concat (their object dtypes would otherwise
# leak into the result), and ignore_index gives the combined table a clean
# 0..n-1 index instead of repeated labels.
frames = [frame for frame in (metrics, new_metrics) if not frame.empty]
metrics = pd.concat(frames, ignore_index=True) if frames else new_metrics

tokenizer_config.json:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

In [7]:
# Show the first rows of the combined metrics table as a quick sanity check.
metrics.head()

Unnamed: 0,model,task,from_lang,to_lang,success,valid,bleu
0,deepseekcoder7b,bit_ops,py,c,1.0,0.0,0.063062
1,codellama7b,int_factors,rs,c,1.0,1.0,0.003572
2,magicoder7b,int_factors,c,cpp,1.0,0.0,0.240128
3,codellama7b,int_cmp,cpp,rs,1.0,1.0,0.3544
4,codegeex4,bit_ops,py,js,1.0,1.0,0.0


In [8]:
# Print the column/dtype summary, then write the combined table back to
# results/metrics.csv without the index column.
metrics.info()
metrics_csv = os.path.join(RESULTS_FOLDER, "metrics.csv")
metrics.to_csv(metrics_csv, index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 1849 entries, 0 to 167
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   model      1849 non-null   object 
 1   task       1849 non-null   object 
 2   from_lang  1849 non-null   object 
 3   to_lang    1849 non-null   object 
 4   success    1849 non-null   float64
 5   valid      1849 non-null   float64
 6   bleu       1849 non-null   float64
dtypes: float64(3), object(4)
memory usage: 115.6+ KB


In [9]:
# per model metrics
# mean_valid and mean_bleu are averaged over successful rows only (success > 0);
# mean_success is averaged over every row of the group.
col = "model"
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[["valid", "bleu"]]
    .mean()
    .reset_index()
    .rename(columns={"valid": "mean_valid", "bleu": "mean_bleu"})
)
m2_df = (
    metrics.groupby(col)["success"]
    .mean()
    .reset_index()
    .rename(columns={"success": "mean_success"})
)
# Merge onto m2_df (how="right") because it holds every group. A group with no
# successful rows is absent from m1_df and would silently drop out of the
# report with a left merge; it now appears with NaN mean_valid / mean_bleu and
# is sorted last.
m_df = m1_df.merge(m2_df, how="right", on=col).sort_values(
    "mean_valid", ascending=False
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

              model  mean_valid  mean_bleu  mean_success
0         codegeex4    0.869048   0.133468      1.000000
1       codegemma7b    0.809524   0.195494      1.000000
9       magicoder7b    0.784431   0.179164      0.994048
4   deepseekcoder7b    0.721212   0.164208      0.982143
6    dolphincoder7b    0.690909   0.133625      0.982143
3          codeqwen    0.684524   0.155836      1.000000
5   deepseekcoderv2    0.677419   0.165617      0.733728
2       codellama7b    0.638037   0.180533      0.970238
8     granitecode8b    0.541667   0.151745      1.000000
7     granitecode3b    0.488095   0.152780      1.000000
10       stablecode    0.113772   0.155536      0.994048


In [10]:
# per task metrics
# mean_valid and mean_bleu are averaged over successful rows only (success > 0);
# mean_success is averaged over every row of the group.
col = "task"
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[["valid", "bleu"]]
    .mean()
    .reset_index()
    .rename(columns={"valid": "mean_valid", "bleu": "mean_bleu"})
)
m2_df = (
    metrics.groupby(col)["success"]
    .mean()
    .reset_index()
    .rename(columns={"success": "mean_success"})
)
# Merge onto m2_df (how="right") because it holds every group. A group with no
# successful rows is absent from m1_df and would silently drop out of the
# report with a left merge; it now appears with NaN mean_valid / mean_bleu and
# is sorted last.
m_df = m1_df.merge(m2_df, how="right", on=col).sort_values(
    "mean_valid", ascending=False
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

          task  mean_valid  mean_bleu  mean_success
2      int_cmp    0.759382   0.238881      0.978402
1    int_arith    0.706150   0.150045      0.950216
3  int_factors    0.544248   0.109373      0.978355
0      bit_ops    0.539150   0.143372      0.967532


In [11]:
# per from_lang metrics
# mean_valid and mean_bleu are averaged over successful rows only (success > 0);
# mean_success is averaged over every row of the group.
col = "from_lang"
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[["valid", "bleu"]]
    .mean()
    .reset_index()
    .rename(columns={"valid": "mean_valid", "bleu": "mean_bleu"})
)
m2_df = (
    metrics.groupby(col)["success"]
    .mean()
    .reset_index()
    .rename(columns={"success": "mean_success"})
)
# Merge onto m2_df (how="right") because it holds every group. A group with no
# successful rows is absent from m1_df and would silently drop out of the
# report with a left merge; it now appears with NaN mean_valid / mean_bleu and
# is sorted last.
m_df = m1_df.merge(m2_df, how="right", on=col).sort_values(
    "mean_valid", ascending=False
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

  from_lang  mean_valid  mean_bleu  mean_success
1       cpp    0.694981   0.178425      0.977358
0         c    0.687747   0.174694      0.958333
3      java    0.651341   0.150157      0.988636
6        rs    0.647287   0.165705      0.977273
2        go    0.646825   0.123982      0.954545
4        js    0.606178   0.198089      0.981061
5        py    0.522088   0.131348      0.943182


In [12]:
# per to_lang metrics
# mean_valid and mean_bleu are averaged over successful rows only (success > 0);
# mean_success is averaged over every row of the group.
col = "to_lang"
m1_df = (
    metrics[metrics["success"] > 0]
    .groupby(col)[["valid", "bleu"]]
    .mean()
    .reset_index()
    .rename(columns={"valid": "mean_valid", "bleu": "mean_bleu"})
)
m2_df = (
    metrics.groupby(col)["success"]
    .mean()
    .reset_index()
    .rename(columns={"success": "mean_success"})
)
# Merge onto m2_df (how="right") because it holds every group. A group with no
# successful rows is absent from m1_df and would silently drop out of the
# report with a left merge; it now appears with NaN mean_valid / mean_bleu and
# is sorted last.
m_df = m1_df.merge(m2_df, how="right", on=col).sort_values(
    "mean_valid", ascending=False
)
print(m_df)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics.csv"), index=False)

  to_lang  mean_valid  mean_bleu  mean_success
4      js    0.858824   0.091553      0.965909
5      py    0.829457   0.068437      0.977273
3    java    0.682353   0.198048      0.965909
6      rs    0.664062   0.199355      0.969697
1     cpp    0.498008   0.207890      0.950758
0       c    0.494163   0.222929      0.969811
2      go    0.432432   0.137427      0.981061
