In [1]:
from transformers import AutoTokenizer
from constants import *
import pandas as pd
import os

In [2]:
# Locations of the follow-up experiment artifacts, relative to the working directory.
RESULTS_FOLDER = "results"
ACCEPTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "accepted_followup")
REJECTED_CODE_FOLDER = os.path.join(RESULTS_FOLDER, "rejected_followup")

# Read the token from the local ".token" file (presumably a Hugging Face access
# token, going by the name -- confirm). The context manager closes the handle
# as soon as the content is read instead of leaving it open.
with open(".token", "r") as token_file:
    HF_TOKEN = token_file.read().strip()

In [3]:
# Load every line of the follow-up compilation log (newlines are kept);
# the `with` block closes the file once it has been read.
with open(os.path.join(RESULTS_FOLDER, "compilation_log_followup.txt"), "r") as log_file:
    lines = log_file.readlines()

In [4]:
# Show the first ten raw log lines to check the line format.
lines[:10]

['2025-01-20 12:20:16 - Compilation successful: deepseekcoder7b-bit_ops-py-c.c\n',
 '2025-01-20 12:20:16 - Compilation and Syntax check failed: qwen2.5coder2b-file_exists-cpp-java.java\n',
 '2025-01-20 12:20:16 - Compilation failed: magicoder7b-file_size-java-rs.rs\n',
 '2025-01-20 12:20:17 - Compilation and Syntax check failed: magicoder7b-int_factors-c-cpp.cpp\n',
 '2025-01-20 12:20:17 - Compilation successful: codegemma7b-dir_make-cpp-c.c\n',
 '2025-01-20 12:20:17 - Compilation and Syntax check failed: yicoder2b-bit_ops-js-rs.rs\n',
 '2025-01-20 12:20:17 - Compilation and Syntax check failed: granitecode3b-str_match-cpp-c.c\n',
 '2025-01-20 12:20:17 - Compilation and Syntax check failed: stablecode-file_size-c-rs.rs\n',
 '2025-01-20 12:20:18 - Compilation and Syntax check failed: yicoder9b-bit_ops-py-go.go\n',
 '2025-01-20 12:20:18 - Compilation and Syntax check failed: yicoder2b-int_arith-go-rs.rs\n']

In [5]:
def check_row_exists(df, m, t, fl, tl):
    """Tell whether `df` already holds a row for the given key.

    Args:
        df: DataFrame with "model", "task", "from_lang" and "to_lang" columns.
        m: value compared against the "model" column.
        t: value compared against the "task" column.
        fl: value compared against the "from_lang" column.
        tl: value compared against the "to_lang" column.

    Returns:
        A truthy value when at least one row matches all four values,
        a falsy one otherwise (including for an empty frame).
    """
    same_model = df["model"].eq(m)
    same_task = df["task"].eq(t)
    same_source = df["from_lang"].eq(fl)
    same_target = df["to_lang"].eq(tl)
    matching_rows = same_model & same_task & same_source & same_target
    return matching_rows.any()

In [6]:
# Short language code -> full language name.
LANG_MAP = dict(
    c="c",
    cpp="cpp",
    go="go",
    java="java",
    js="javascript",
    py="python",
    rs="rust",
)

In [7]:
# Build the metrics table from the compilation log and the rejected-code
# folder, resuming from a previously saved CSV when one exists.
#
# Each artifact is identified by the key (model, task, from_lang, to_lang)
# parsed from its file name. A key is recorded at most once: keys already in
# the saved CSV are skipped, and so are keys already collected during this run,
# so a fresh run and a resumed run produce the same rows.
METRICS_CSV = os.path.join(RESULTS_FOLDER, "metrics_followup.csv")

# Column buffers for the rows collected during this run.
models = []
tasks = []
from_langs = []
to_langs = []
successes = []
valids = []

if os.path.exists(METRICS_CSV):
    metrics = pd.read_csv(METRICS_CSV)
else:
    metrics = pd.DataFrame(
        {
            "model": models,
            "task": tasks,
            "from_lang": from_langs,
            "to_lang": to_langs,
            "success": successes,
            "valid": valids,
        }
    )


def _split_artifact_name(name):
    """Parse `<model>-<task>-<from_lang>-<to_lang>.<ext>` into a 4-tuple key.

    Returns None when the name has fewer than four "-"-separated fields.
    """
    # splitext leaves a name without any dot intact instead of chopping its
    # last character, which slicing at rfind(".") == -1 would do.
    stem = os.path.splitext(name)[0]
    fields = stem.split("-")
    if len(fields) < 4:
        return None
    return tuple(fields[:4])


def _with_new_rows():
    """Return the loaded metrics plus the rows collected so far in this run."""
    new_rows = pd.DataFrame(
        {
            "model": models,
            "task": tasks,
            "from_lang": from_langs,
            "to_lang": to_langs,
            "success": successes,
            "valid": valids,
        }
    )
    # Concatenating with an empty frame adds nothing, so return the non-empty
    # side directly.
    if metrics.empty:
        return new_rows
    if new_rows.empty:
        return metrics
    # ignore_index gives the combined frame a clean 0..n-1 index instead of
    # repeating the labels of both inputs.
    return pd.concat([metrics, new_rows], ignore_index=True)


def _record(key, success, valid):
    """Append one row for `key` to the column buffers and mark the key as seen."""
    seen_keys.add(key)
    model_name, task_name, source_lang, target_lang = key
    models.append(model_name)
    tasks.append(task_name)
    from_langs.append(source_lang)
    to_langs.append(target_lang)
    successes.append(success)
    valids.append(valid)


# Keys already present in the saved metrics; set membership replaces a full
# DataFrame scan per candidate row.
seen_keys = set(
    zip(metrics["model"], metrics["task"], metrics["from_lang"], metrics["to_lang"])
)

# Pass 1: every artifact mentioned in the compilation log counts as a
# "success"; it is "valid" only when the log status says "successful".
cnt = 0
for line in lines:
    line = line.strip()
    if not line:
        continue  # tolerate blank lines (e.g. a trailing newline in the log)
    # The artifact name follows the last ":"; everything before it is the
    # timestamp and the status text.
    status, separator, artifact = line.rpartition(":")
    key = _split_artifact_name(artifact.strip()) if separator else None
    if key is None:
        print("skipped malformed log line:", line)
        continue
    if key in seen_keys:
        continue
    valid = 1 if "successful" in status else 0
    _record(key, 1, valid)
    cnt += 1
    print("done")
    if cnt % 10 == 0:
        # Periodic checkpoint so partial progress is on disk.
        _with_new_rows().to_csv(METRICS_CSV, index=False)

# Pass 2: artifacts in the rejected folder count as neither success nor valid.
for filename in os.listdir(REJECTED_CODE_FOLDER):
    if filename.startswith("."):
        continue  # hidden files
    key = _split_artifact_name(filename)
    if key is None:
        print("skipped malformed file name:", filename)
        continue
    if key in seen_keys:
        continue
    _record(key, 0, 0)
    m, t, fl, tl = key
    print("rejected", m, t, fl, tl, "done")

metrics = _with_new_rows()

In [8]:
# Preview the first rows of the combined metrics table.
metrics.head()

Unnamed: 0,model,task,from_lang,to_lang,success,valid
0,deepseekcoder7b,bit_ops,py,c,1.0,1.0
1,qwen2.5coder2b,file_exists,cpp,java,1.0,0.0
2,magicoder7b,file_size,java,rs,1.0,0.0
3,magicoder7b,int_factors,c,cpp,1.0,0.0
4,codegemma7b,dir_make,cpp,c,1.0,1.0


In [9]:
# Print the frame summary (row count, columns, dtypes); the return value of
# info() is bound to `_` rather than left as a bare expression.
_ = metrics.info()
# Write the combined metrics table to the results folder, without the index column.
metrics.to_csv(os.path.join(RESULTS_FOLDER, f"metrics_followup.csv"), index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3963 entries, 0 to 3962
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   model      3963 non-null   object 
 1   task       3963 non-null   object 
 2   from_lang  3963 non-null   object 
 3   to_lang    3963 non-null   object 
 4   success    3963 non-null   float64
 5   valid      3963 non-null   float64
dtypes: float64(2), object(4)
memory usage: 185.9+ KB


In [10]:
# Per-model metrics: total the "success" and "valid" flags for each model,
# scale both totals by a fixed denominator, and save the table.
col = "model"
excluded = {"model", "task", "from_lang", "to_lang", "success"}
summed_columns = [name for name in metrics.columns if name not in excluded]
m1_df = metrics.groupby(col)[summed_columns].sum().reset_index()
m2_df = metrics.groupby(col)["success"].sum().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# NOTE(review): 714.0 is a hard-coded denominator, presumably the expected
# number of rows per model -- confirm against the experiment setup.
for rate_column in ("success", "valid"):
    m_df[rate_column] = m_df[rate_column] / 714.0
# Order by the product of the two rates, highest first; the helper column is
# dropped again before saving.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False).drop("rank", axis=1)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics_followup.csv"), index=False)
m_df

Unnamed: 0,model,success,valid
11,qwen2.5coder7b,0.310924,0.177871
10,qwen2.5coder2b,0.397759,0.138655
2,codellama7b,0.401961,0.12465
7,granitecode3b,0.519608,0.092437
4,deepseekcoder7b,0.308123,0.141457
8,granitecode8b,0.491597,0.088235
6,dolphincoder7b,0.35014,0.117647
9,magicoder7b,0.288515,0.119048
1,codegemma7b,0.385154,0.082633
3,codeqwen,0.268908,0.103641


In [11]:
# Per-task metrics: total the "success" and "valid" flags for each task,
# scale both totals by a fixed denominator, and save the table.
col = "task"
excluded = {"model", "task", "from_lang", "to_lang", "success"}
summed_columns = [name for name in metrics.columns if name not in excluded]
m1_df = metrics.groupby(col)[summed_columns].sum().reset_index()
m2_df = metrics.groupby(col)["success"].sum().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# NOTE(review): 630.0 is a hard-coded denominator, presumably the expected
# number of rows per task -- confirm against the experiment setup.
for rate_column in ("success", "valid"):
    m_df[rate_column] = m_df[rate_column] / 630.0
# Order by the product of the two rates, highest first; the helper column is
# dropped again before saving.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False).drop("rank", axis=1)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics_followup.csv"), index=False)
m_df

Unnamed: 0,task,success,valid
9,logic_ops,0.420635,0.188889
1,dir_make,0.468254,0.136508
0,bit_ops,0.484127,0.12381
8,int_factors,0.471429,0.11746
16,str_substring,0.45873,0.114286
11,str_cmp,0.430159,0.119048
2,file_create,0.412698,0.104762
4,file_rename,0.373016,0.092063
3,file_exists,0.390476,0.085714
14,str_match,0.380952,0.084127


In [12]:
# Per-source-language metrics: total the "success" and "valid" flags for each
# from_lang, scale both totals by a fixed denominator, and save the table.
col = "from_lang"
excluded = {"model", "task", "from_lang", "to_lang", "success"}
summed_columns = [name for name in metrics.columns if name not in excluded]
m1_df = metrics.groupby(col)[summed_columns].sum().reset_index()
m2_df = metrics.groupby(col)["success"].sum().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# NOTE(review): 1530.0 is a hard-coded denominator, presumably the expected
# number of rows per source language -- confirm against the experiment setup.
for rate_column in ("success", "valid"):
    m_df[rate_column] = m_df[rate_column] / 1530.0
# Order by the product of the two rates, highest first; the helper column is
# dropped again before saving.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False).drop("rank", axis=1)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics_followup.csv"), index=False)
m_df

Unnamed: 0,from_lang,success,valid
4,js,0.436601,0.12549
5,py,0.418301,0.118954
3,java,0.393464,0.105229
2,go,0.322222,0.093464
0,c,0.302614,0.094118
1,cpp,0.298693,0.091503
6,rs,0.313072,0.084967


In [13]:
# Per-target-language metrics: total the "success" and "valid" flags for each
# to_lang, scale both totals by a fixed denominator, and save the table.
col = "to_lang"
excluded = {"model", "task", "from_lang", "to_lang", "success"}
summed_columns = [name for name in metrics.columns if name not in excluded]
m1_df = metrics.groupby(col)[summed_columns].sum().reset_index()
m2_df = metrics.groupby(col)["success"].sum().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# NOTE(review): 1530.0 is a hard-coded denominator, presumably the expected
# number of rows per target language -- confirm against the experiment setup.
for rate_column in ("success", "valid"):
    m_df[rate_column] = m_df[rate_column] / 1530.0
# Order by the product of the two rates, highest first; the helper column is
# dropped again before saving.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False).drop("rank", axis=1)
m_df.to_csv(os.path.join(RESULTS_FOLDER, f"{col}_metrics_followup.csv"), index=False)
m_df

Unnamed: 0,to_lang,success,valid
6,rs,0.564706,0.201961
0,c,0.494118,0.150327
2,go,0.468627,0.15817
1,cpp,0.522222,0.09085
3,java,0.251634,0.071242
4,js,0.080392,0.023529
5,py,0.103268,0.017647


In [14]:
# Per-(model, task) metrics: total the "success" and "valid" flags for each
# pair, scale both totals by a fixed denominator, and save the table.
col = ["model", "task"]
excluded = {"model", "task", "from_lang", "to_lang", "success"}
summed_columns = [name for name in metrics.columns if name not in excluded]
m1_df = metrics.groupby(col)[summed_columns].sum().reset_index()
m2_df = metrics.groupby(col)["success"].sum().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# NOTE(review): 42.0 is a hard-coded denominator, presumably the expected
# number of rows per (model, task) pair -- confirm against the experiment setup.
for rate_column in ("success", "valid"):
    m_df[rate_column] = m_df[rate_column] / 42.0
# Order by the product of the two rates, highest first; the helper column is
# dropped again before saving.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False).drop("rank", axis=1)
# Build the file name outside an f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
output_name = "_".join(col) + "_metrics_followup.csv"
m_df.to_csv(os.path.join(RESULTS_FOLDER, output_name), index=False)
m_df

Unnamed: 0,model,task,success,valid
68,deepseekcoder7b,bit_ops,0.595238,0.285714
160,magicoder7b,logic_ops,0.500000,0.333333
109,dolphincoder7b,logic_ops,0.523810,0.309524
152,magicoder7b,dir_make,0.523810,0.309524
126,granitecode3b,logic_ops,0.642857,0.214286
...,...,...,...,...
217,stablecode,str_prepend,0.357143,0.000000
218,stablecode,str_substring,0.928571,0.000000
98,deepseekcoderv2,str_match,0.119048,0.000000
205,stablecode,file_exists,0.952381,0.000000


In [15]:
# Per-(from_lang, to_lang) metrics: total the "success" and "valid" flags for
# each language pair, scale both totals by a fixed denominator, and save the table.
col = ["from_lang", "to_lang"]
excluded = {"model", "task", "from_lang", "to_lang", "success"}
summed_columns = [name for name in metrics.columns if name not in excluded]
m1_df = metrics.groupby(col)[summed_columns].sum().reset_index()
m2_df = metrics.groupby(col)["success"].sum().reset_index()
m_df = m2_df.merge(m1_df, how="left", on=col)
# NOTE(review): 255.0 is a hard-coded denominator, presumably the expected
# number of rows per language pair -- confirm against the experiment setup.
for rate_column in ("success", "valid"):
    m_df[rate_column] = m_df[rate_column] / 255.0
# Order by the product of the two rates, highest first; the helper column is
# dropped again before saving.
m_df["rank"] = m_df["valid"] * m_df["success"]
m_df = m_df.sort_values(["rank"], ascending=False).drop("rank", axis=1)
# Build the file name outside an f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
output_name = "_".join(col) + "_metrics_followup.csv"
m_df.to_csv(os.path.join(RESULTS_FOLDER, output_name), index=False)
m_df

Unnamed: 0,from_lang,to_lang,success,valid
23,java,rs,0.611765,0.227451
29,js,rs,0.615686,0.219608
17,go,rs,0.541176,0.239216
35,py,rs,0.607843,0.203922
30,py,c,0.517647,0.203922
24,js,c,0.568627,0.160784
26,js,go,0.47451,0.188235
38,rs,go,0.466667,0.176471
1,c,go,0.462745,0.176471
5,c,rs,0.494118,0.164706
