# Obtaining results of the models 

In [None]:
import os 
import sys
import random
import numpy as np
import pandas as pd
from datasets import Dataset
from tqdm.notebook import tqdm
from warnings import warn

In [None]:
# Put the parent and grandparent directories on the import path
# (presumably so project packages can be imported from this notebook).
sys.path.extend(["../", "../../"])

## General setup

In [None]:
# Dataset identifier (as used in result file names) -> name shown in the tables.
change_dataset_names = {
    "falconcode_skill": "falconcode_easy",
    "falconcode_lab": "falconcode_hard",
}

In [None]:
def get_full_paths(config_names, datasets=("falconcode_skill", "falconcode_lab", "singapore"),
                   base_path="TO_SPECIFY", exp_dir="sft/results/"):
    """Yield the expected result-file path for every (config, dataset) pair.

    Paths are produced config by config, and within a config in the order of
    ``datasets``. Each one has the form
    ``<base_path>/<config_name>/<exp_dir>/<dataset>_results.json``.

    Args:
        config_names: iterable of experiment directory names.
        datasets: iterable of dataset identifiers (immutable default, so the
            default value cannot be modified between calls).
        base_path: root directory containing the experiment directories.
        exp_dir: sub-directory of an experiment that holds the result files.

    Yields:
        str: one path per (config, dataset) pair; existence is not checked.
    """
    for config_name in config_names:
        for dataset in datasets:
            filename = f"{dataset}_results.json"
            yield os.path.join(base_path, config_name, exp_dir, filename)

In [None]:
def create_result_table(all_paths, scoring, scoring_name, selected_pass=(1, 5, 10)):
    """Collect per-experiment scores into one model x (dataset, k) table.

    Args:
        all_paths: iterable of result-file paths laid out as
            ``.../<model>_falcon/<dir>/<dir>/<dataset>_results.json``. Paths
            that do not exist are skipped with a warning.
        scoring: callable taking a path and returning a mapping (or Series)
            whose keys look like ``"<scoring_name>@<k>"``.
        scoring_name: metric prefix, e.g. ``"pass"`` or ``"rouge"``.
        selected_pass: values of k to keep in the table.

    Returns:
        pd.DataFrame indexed by model name, with two-level columns
        ``(dataset, scoring_name)``. Values are multiplied by 100 and rounded
        to 3 decimals; missing (model, dataset, k) cells are filled with 0.

    Raises:
        ValueError: if none of the given paths exists.
    """
    prefix = f"{scoring_name}@"
    table_frame = []
    for path in all_paths:
        if not os.path.exists(path):
            warn(f"experiment in path {path} did not finnish or failed")
            continue

        # Split on the platform separator so paths built with os.path.join
        # are decomposed correctly on every OS.
        parts = os.path.normpath(path).split(os.sep)
        ds_name = parts[-1].replace("_results.json", "")
        model_name = parts[-4].replace("_falcon", "")
        frame = pd.Series(scoring(path)).rename(model_name).to_frame().T
        frame["dataset"] = change_dataset_names.get(ds_name, ds_name)
        table_frame.append(frame)

    if not table_frame:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError("no result file found among the given paths; nothing to tabulate")

    results_table = pd.concat(table_frame, axis=0)
    # Rename by label instead of assigning a filtered list positionally: the
    # positional form breaks (or mislabels columns) as soon as one column does
    # not carry the metric prefix or "dataset" is not the last column.
    results_table = results_table.rename(
        columns=lambda c: c.replace(prefix, "k = ")
        if isinstance(c, str) and c.startswith(prefix) else c
    )
    wanted_k = set(selected_pass)
    selected_columns = [c for c in results_table.columns
                        if isinstance(c, str) and c.startswith("k = ")
                        and int(c.replace("k = ", "")) in wanted_k]
    results_table = results_table[selected_columns + ["dataset"]]

    # Pivot to one row per model with (dataset, k) column pairs.
    results_table = (results_table
                     .reset_index()
                     .groupby(['index', 'dataset'])[selected_columns]
                     .aggregate('first').unstack()
                     .swaplevel(axis=1).sort_values(by="dataset", axis=1))
    results_table = results_table.fillna(0)
    results_table.columns.names = ['dataset', scoring_name]
    # Report as percentages.
    results_table *= 100
    results_table = results_table.round(3)

    return results_table

In [None]:
# Experiment/model identifier -> model family name shown in the tables.
change_model_names = {
    "codegen-2B-mono": "codegen-mono",
    "codegen-2B-multi": "codegen-multi",
    "codegen-350M-multi": "codegen-multi",
    "santacoder": "santacoder",
    "codegen-350M-mono": "codegen-mono",
    "tiny_starcoder": "starcoder",
    "starcoder-1B": "starcoder",
    "starcoder-3B": "starcoder",
    "codegen2-3_7B": "codegen2",
}

In [None]:
# Model identifier -> human-readable size label ("M" / "B" suffixed).
sizes = {
    "codegen-2B-mono": "2B",
    "codegen-2B-multi": "2B",
    "codegen-350M-multi": "350M",
    "codegen-350M-mono": "350M",
    "tiny_starcoder": "164M",
    "starcoder-3B": "3B",
    "codegen2-3_7B": "3.7B",
}

In [None]:
# Experiment directories to evaluate, and the result files expected for them.
experiments_names = [
    "tiny_starcoder_falcon",
    "codegen-350M-mono_falcon",
    "starcoder-1B_falcon",
    "codegen-2B-mono_falcon",
    "starcoder-3B_falcon",
    "codegen2-3_7B_falcon",
]
all_paths = [*get_full_paths(experiments_names)]

In [None]:
def replace(series):
    """Convert size labels such as ``"350M"`` or ``"3.7B"`` to digit strings.

    The numeric part is multiplied by the unit of its suffix (K = 1e3,
    M = 1e6, B = 1e9; no suffix = 1), so fractional sizes like ``"3.7B"``
    become a plain integer string that ``astype(int)`` can parse, and sizes
    with different suffixes compare correctly.

    Args:
        series: iterable of size labels.

    Returns:
        pd.Series of integer-valued strings with a fresh default index, in
        the same order as ``series``.

    Raises:
        ValueError: if the numeric part of a label cannot be parsed.
    """
    multipliers = {"K": 10**3, "M": 10**6, "B": 10**9}

    def to_count(label):
        text = str(label).strip()
        suffix = text[-1:].upper()
        if suffix in multipliers:
            return str(round(float(text[:-1]) * multipliers[suffix]))
        return str(round(float(text)))

    return pd.Series([to_count(s) for s in series])

### Pass_at_k results

In [None]:
from src.utils.files import json2data

def get_pass(path):
    """Load the results file at ``path`` and return its ``"pass_at_k"`` entry."""
    results = json2data(path)
    return results["pass_at_k"]

In [None]:
# Build the pass@k table from every available result file.
results_table = create_result_table(all_paths, get_pass, "pass")
# Pair each model with its size label; models missing from `sizes` fall back to "1B".
results_table.index = pd.MultiIndex.from_tuples([(i, sizes.get(i, "1B")) for i in results_table.index])
results_table.index.names = ["model", "size"]
# Move the "size" index level into a regular column so rows can be sorted by it.
results_table = results_table.reset_index(level=1, drop=False)
# Map model identifiers to their display names; unknown identifiers are kept as-is.
results_table.index = [change_model_names.get(i, i) for i in results_table.index]
# Sort rows by ascending size; `replace` rewrites the labels so `astype(int)` can parse them.
results_table = results_table.sort_values(by="size", key=lambda s: replace(s).astype(int), ascending=True)
# Last expression of the cell: displays the table.
results_table

In [None]:
# Caption and index label used for the LaTeX export of the pass@k table.
caption = "Pass@k for our two scenarios"
results_table.index.name = "model"
# Set both column level names to empty strings (presumably to keep them out of the header).
results_table.columns.names = ["", ""]
# Print the LaTeX source: floats with two decimals, centred multicolumn headers.
print(results_table.to_latex(multirow=True, multicolumn=True, longtable=False, 
                             float_format="%.2f", escape=True, multicolumn_format='c', 
                             bold_rows=False, 
                             caption=caption, label="tab: pass_at", position="htbp!"))

### rouge_at_k results

In [None]:
from itertools import combinations
from src.utils.distance import rougelcsum_dist

def compute_buggy_rouge_at_k(prob_df):
    """Compute rouge@k for one group of generations, for every k up to the group size.

    Each generation is scored against the first ``source_code`` entry of the
    group with ``rougelcsum_dist(..., get_score=True)``. Generations whose
    ``generation_correct`` flag is falsy get the minimum score 0. rouge@k is
    the mean, over all size-k subsets of the generations, of the best score
    in the subset.

    Args:
        prob_df: DataFrame with the columns ``"source_code"``,
            ``"generation"`` and ``"generation_correct"``.

    Returns:
        pd.Series indexed by ``"rouge@1"`` ... ``"rouge@n"``, where n is the
        number of rows in ``prob_df``.
    """
    from math import comb

    buggy = prob_df["source_code"].iloc[0]
    # Incorrect generations are scored 0 anyway, so skip computing their rouge
    # (assumes rougelcsum_dist has no side effects -- TODO confirm).
    rouges = [rougelcsum_dist(buggy, gen, get_score=True) if correct else 0
              for gen, correct in zip(prob_df["generation"], prob_df["generation_correct"])]

    # Exact mean of the subset maxima without enumerating the 2**n subsets:
    # with scores sorted ascending, the score at position `rank` is the maximum
    # of exactly comb(rank, k - 1) subsets of size k (ties broken by position).
    ordered = sorted(rouges)
    n_gen = len(ordered)
    rouge_at_k = {}
    for k in range(1, n_gen + 1):
        n_subsets = comb(n_gen, k)
        # Divide the integer counts first so large binomials never become floats.
        rouge_at_k[f"rouge@{k}"] = sum(score * (comb(rank, k - 1) / n_subsets)
                                       for rank, score in enumerate(ordered))

    return pd.Series(rouge_at_k)
  
def get_rouge(path):
    """Return the rouge@k scores of a results file, averaged over the ``id`` groups.

    The ``"eval_ds"`` entry of the file is loaded into a DataFrame, grouped by
    its ``"id"`` column and scored group by group with
    ``compute_buggy_rouge_at_k``. If the ``"id"`` column is missing, a warning
    is emitted and an empty Series is returned.
    """
    eval_df = pd.DataFrame(json2data(path)["eval_ds"])
    if "id" in eval_df:
        per_group = eval_df.groupby("id").apply(compute_buggy_rouge_at_k)
        return per_group.mean(axis=0)

    warn(f"Path {path} does not have the column id, results invalid")
    return pd.Series()

In [None]:
# Build the rouge@k table from every available result file.
results_table = create_result_table(all_paths, get_rouge, "rouge")
# Pair each model with its size label; models missing from `sizes` fall back to "1B".
results_table.index = pd.MultiIndex.from_tuples([(i, sizes.get(i, "1B")) for i in results_table.index])
results_table.index.names = ["model", "size"]
# Move the "size" index level into a regular column so rows can be sorted by it.
results_table = results_table.reset_index(level=1, drop=False)
# Map model identifiers to their display names; unknown identifiers are kept as-is.
results_table.index = [change_model_names.get(i, i) for i in results_table.index]
# Sort rows by ascending size; `replace` rewrites the labels so `astype(int)` can parse them.
results_table = results_table.sort_values(by="size", key=lambda s: replace(s).astype(int), ascending=True)
# Last expression of the cell: displays the table.
results_table

In [None]:
# Caption and index label used for the LaTeX export of the rouge table.
caption = "Rouge results"
# NOTE(review): the pass@k export names the index "model"; here it is left empty -- confirm this is intended.
results_table.index.name = ""
# Set both column level names to empty strings (presumably to keep them out of the header).
results_table.columns.names = ["", ""]
# Print the LaTeX source: floats with two decimals, centred multicolumn headers.
print(results_table.to_latex(multirow=True, multicolumn=True, longtable=False, 
                             float_format="%.2f", escape=True, multicolumn_format='c', 
                             bold_rows=False, 
                             caption=caption, label="tab: rouge_at", position="htbp!"))

## Manual analysis of model generations

In [None]:
# Show the list of result-file paths to pick one from.
all_paths

In [None]:
# Load the "eval_ds" records of the second result file into a DataFrame and display it.
data = pd.DataFrame(json2data(all_paths[1])["eval_ds"])
data

In [None]:
# Count how many rows have each value of `generation_correct`.
data.generation_correct.value_counts()

In [None]:
# Keep only the rows whose `generation_correct` flag is true (overwrites `data`).
data = data[data.generation_correct]

In [None]:
import random
# Draw a random row position in `data` and display it.
i = random.choice(range(len(data)))
i

In [None]:
# Print the `source_code` of the randomly chosen row.
print(data.source_code.iloc[i])

In [None]:
# Print the `generation` of the same randomly chosen row.
print(data.generation.iloc[i])

In [None]:
# Print the `source_code` of the fourth row flagged `generation_correct`.
# NOTE(review): the companion cell printing `generation` uses position 0, not 3 -- confirm the rows are meant to differ.
print(data[data.generation_correct]["source_code"].iloc[3])

In [None]:
# Print the `generation` of the first row flagged `generation_correct`.
print(data[data.generation_correct]["generation"].iloc[0])

In [None]:
# List the columns available in `data`.
data.columns

In [None]:
# Show the distinct values of the `type` column.
data.type.unique()