### Process Val Run Outputs

##### 1. Create scoring sheet

- Load the finished val run result files (columns: ``video_id``, ``chunk_number``, ``output`` and ``error?``)

- Read in data with labels and merge results from val runs

- Arrange data in a convenient way for (for now: manual) scoring in excel sheet

- save to csv

In [1]:
# make sure that most recent progress files have been copied from google drive to repo!
# load test run output data (all csv files in directory)

test_run_path = "../data/inference_results/val_runs"

# get all csv files in directory
import os
val_csv_names = [f for f in os.listdir(test_run_path) if f.endswith('.csv')]
for f in val_csv_names:
    print(f)

val_llama3_ft_v3_q8_0_llamacpp_guided.csv
val_llama3_ft_v3_q8_0_llamacpp_unguided.csv
val_llama3_ft_v4_q8_0_llamacpp_guided.csv
val_llama3_ft_v4_q8_0_llamacpp_unguided.csv
val_llama3_q8_0_llamacpp_guided.csv
val_llama3_q8_0_llamacpp_unguided.csv
val_mistral_q8_0_llamacpp_guided.csv
val_mistral_q8_0_llamacpp_unguided.csv


In [2]:
# load labels and each csv file into a df
import pandas as pd
import json

# load labels (from labeling excel sheet)
df = pd.read_excel("../data/transcript_chunks/labeling/labeling_sheet.xlsx", sheet_name="VAL_labeling", skiprows=1)
df = df.drop(columns=["prompt", "done?"])
df = df.iloc[:, :df.columns.get_loc("label")+1] # drop all columns after 'label' column
print(df.shape)
# add special column(s)
df["n_extracted"] = df["label"].apply(lambda x: len(json.loads(x)))

dtypes = {
    'video_id': str,
    'chunk_number': int,
    'output': str,
    'error?': bool}

name_mapping = {}
for i, f in enumerate(val_csv_names):
    name_mapping[i] = f.replace(".csv", "").replace("val_", "")

    # load and prepare new df
    new_df = pd.read_csv(f"{test_run_path}/{f}", sep=";", dtype=dtypes)
    new_df = new_df.rename(columns={"output": f"{i}_output", "error?": f"{i}_error?"})

    # merge new df
    df = pd.merge(df, new_df, on=["video_id", "chunk_number"], suffixes=("", f"_{i}"))

    # add scoring columns
    # 1. columns which can be automatically computed
    df[f"{i}_n_extracted"] = df[f"{i}_output"].apply(lambda x: len(json.loads(x)) if not pd.isnull(x) else 0)
    df[f"{i}_correct_empty"] = (df["n_extracted"] == 0) & (df[f"{i}_n_extracted"] == 0)
    
    # 2. columns which need manual input
    # number of correct names extracted (can set to 0 already if n_extracted in the output or labels is 0)
    # DO WE INCLUDE NEUTRALS in this?
    df[f"{i}_correct_names"] = df.apply(lambda x: 0 if (x["n_extracted"] == 0) | (x[f"{i}_n_extracted"] == 0) else None, axis=1)
    # of correct names, how many asset types are correct?
    df[f"{i}_correct_asset_types"] = df.apply(lambda x: 0 if x[f"{i}_correct_names"] == 0 else None, axis=1)
    # of correct names, how many sentiments are correct?
    df[f"{i}_correct_sentiments"] = df.apply(lambda x: 0 if x[f"{i}_correct_names"] == 0 else None, axis=1)
    # additional neutrals extracted (-> should not be penalized?)
    df[f"{i}_extra_neutrals"] = df.apply(lambda x: 0 if sum([1 for s in (json.loads(x[f"{i}_output"]) if not x[f"{i}_error?"] else []) if s["sentiment"] == "neutral"]) == 0 else None, axis=1)


print(df.columns)

(150, 8)
Index(['video_id', 'chunk_number', 'yt_video_type', 'uploader_id', 'title',
       'first_three_tags', 'chunk_text', 'label', 'n_extracted', '0_output',
       '0_error?', '0_n_extracted', '0_correct_empty', '0_correct_names',
       '0_correct_asset_types', '0_correct_sentiments', '0_extra_neutrals',
       '1_output', '1_error?', '1_n_extracted', '1_correct_empty',
       '1_correct_names', '1_correct_asset_types', '1_correct_sentiments',
       '1_extra_neutrals', '2_output', '2_error?', '2_n_extracted',
       '2_correct_empty', '2_correct_names', '2_correct_asset_types',
       '2_correct_sentiments', '2_extra_neutrals', '3_output', '3_error?',
       '3_n_extracted', '3_correct_empty', '3_correct_names',
       '3_correct_asset_types', '3_correct_sentiments', '3_extra_neutrals',
       '4_output', '4_error?', '4_n_extracted', '4_correct_empty',
       '4_correct_names', '4_correct_asset_types', '4_correct_sentiments',
       '4_extra_neutrals', '5_output', '5_error?', '5

In [3]:
# save to csv (for loading into excel)
df.to_csv("../data/inference_results/val_runs_scoring/scoring_data_prepped.csv", index=False)



In [4]:
name_mapping

{0: 'llama3_ft_v3_q8_0_llamacpp_guided',
 1: 'llama3_ft_v3_q8_0_llamacpp_unguided',
 2: 'llama3_ft_v4_q8_0_llamacpp_guided',
 3: 'llama3_ft_v4_q8_0_llamacpp_unguided',
 4: 'llama3_q8_0_llamacpp_guided',
 5: 'llama3_q8_0_llamacpp_unguided',
 6: 'mistral_q8_0_llamacpp_guided',
 7: 'mistral_q8_0_llamacpp_unguided'}

In [5]:
# save name mapping to csv
name_mapping_df = pd.DataFrame.from_dict(name_mapping, orient="index", columns=["run_name"])
name_mapping_df["id"] = name_mapping_df.index
name_mapping_df.to_csv("../data/inference_results/val_runs_scoring/name_mapping.csv", index=False)

##### 2. Get Scores 

We want to examine model performance based on two types of statistics: 

##### I. Average of per-example scores
This score should lie in a range from 0 to 1 for each example. We would like to consider two aspects of model performance: 
   - reward correct extractions (while considering not only the asset name but also asset type and sentiment)
   - penalize mistakes (defined as extractions with wrong asset name and non-neutral sentiment).
  
We therefore build our metric out of two components:

   - reward component: for each extracted asset, award +1 point if the name is correct. Another +1 each for correct type and sentiment. (If the asset name is wrong, no more points for type and sentiment can be awarded). Normalize by dividing by the maximum number of achievable points. Cases with $n_{\text{labels}} = 0$ are handled separately to avoid division by zero (and to make perfect scores possible for these examples as well). The formula for the reward (between 0 and 1) is given by:
  
$$c_{\text{reward}} = 
\begin{cases} 
0 & \text{for } n_{\text{labels}} = 0 \\
\frac{n_{\text{correct names}} + n_{\text{correct types}} + n_{\text{correct sentiments}}}{3 \cdot n_{\text{labels}}} & \text{else. (}\Rightarrow n_{\text{labels}} > 0 \text{)}\\
\end{cases}$$

   - penalty component: since there is no maximum possible number of mistakes, we take the number of labeled assets for the example as the limit at which the maximum penalty is applied. The formula for the penalty (between 0 and 1) therefore is given by: 

$$\text{penalty} = 
\begin{cases} 
0 & \text{for } n_{\text{labels}} = n_{\text{mistakes}} = 0 \\
1 & \text{for } n_{\text{mistakes}} > n_{\text{labels}} \\
\frac{n_{\text{mistakes}}}{n_{\text{labels}}} & \text{else. (} n_{\text{mistakes}} \leq n_{\text{labels}} \text{ and } n_{\text{labels}} > 0 \text{)}\\

\end{cases}$$

   To combine the two components into a single score between 0 and 1, we transform the penalty to a reward by subtracting it from 1 and then take the average of the two components. (Here we could also weight reward and penalty differently according to our preferences.) We also have to consider the case of $n_{labels} = 0$ (and errors from unguided generation) separately to avoid division by zero. 

$$\text{score} = a \cdot \text{reward} + (1 - a) \cdot (1 - \text{penalty})$$

This approach will put less weight on the few outliers with a very high number of mentioned assets (e.g. 20+) which highly influence the overall totals. 
Meanwhile, examples with no extractions to be made ($n_labels = 0$) have a big influence, since a 100% score is possible for them and they appear frequently in our data. To get as clear a picture as possible, we will compute the final scores for the validation run WITH and WITHOUT empty examples.

##### II. Totals over validation dataset

   - Error rate (for unguided generation)
   - correct asset names
   - correct asset types (if name is correct)
   - correct sentiments (if name is correct)
   - mistakes in asset names


Looking at the totals as a scoring approach will put more importance on the outliers with a high number of assets mentioned in the example. 

In [1]:
# define scoring function
def get_score(row, id, reward_weight=0.5):
    n_labels = row["n_extracted"] # ground truth
    n_extracted = row[f"{id}_n_extracted"] # model output
    n_correct_names = row[f"{id}_correct_names"]
    n_correct_asset_types = row[f"{id}_correct_asset_types"]
    n_correct_sentiments = row[f"{id}_correct_sentiments"]
    n_extra_neutrals = row[f"{id}_extra_neutrals"]
    n_mistakes = row[f"{id}_n_mistakes"] # not in scoring sheet, compute before calling this function!
    
    # compute reward component
    if n_labels == 0 and n_mistakes > 0:
        reward = 0
    elif n_labels == 0 and n_mistakes == 0: # empty labels and correct (no) extractions ->, what should the reward be here? I think 0 is better to avoid rewarding very conservative models too much on our dataset
        reward = 0
    else:
        reward = (n_correct_names + n_correct_asset_types + n_correct_sentiments) / (n_labels * 3)
    
    # compute penalty component
    if n_labels == 0 and n_mistakes == 0:
        penalty = 0
    elif n_mistakes > 0:
        penalty = 1
    else:
        penalty = n_mistakes / n_labels
    
    # compute final score
    score = reward_weight * reward + (1 - reward_weight) * (1 - penalty)
    return score

# helper function to compute n_mistakes column
def get_n_mistakes(row, id):
    # mistake is defined as wrong asset name and non-neutral sentiment
    n_mistakes = row[f"{id}_n_extracted"] - row[f"{id}_correct_names"] - row[f"{id}_extra_neutrals"]

    if n_mistakes < 0: # should not happen but manual scoring sheet could theoretically contain errors
        print(f"Warning: n_mistakes < 0 for video_id {row['video_id']}, chunk_number {row['chunk_number']}, run {id}! (n_mistakes set from {n_mistakes} to 0.)")
        n_mistakes = 0
    return n_mistakes

In [45]:
# read in scoring sheet and name mapping
import pandas as pd
df = pd.read_excel("../data/inference_results/val_runs_scoring/scoring_sheet.xlsx", sheet_name="scoring_data_prepped")
name_mapping = pd.read_csv("../data/inference_results/val_runs_scoring/name_mapping.csv")

In [46]:
# for each run, compute per-example scores
for id, run_name in zip(name_mapping["id"], name_mapping["run_name"]):
    df[f"{id}_n_mistakes"] = df.apply(get_n_mistakes, axis=1, id=id)
    df[f"{id}_score_0.33"] = df.apply(get_score, axis=1, id=id, reward_weight=0.33)
    df[f"{id}_score_0.5"] = df.apply(get_score, axis=1, id=id, reward_weight=0.5)
    df[f"{id}_score_0.67"] = df.apply(get_score, axis=1, id=id, reward_weight=0.67)

# get aggregated data for each run (two versions: with and without zero labels (i.e. '[]' examples))
agg_data = {}
agg_data_without_zero_labels = {}

for dict, data in zip([agg_data, agg_data_without_zero_labels], [df.copy(), df[df["n_extracted"] > 0].copy()]):
    for id, run_name in zip(name_mapping["id"], name_mapping["run_name"]):

        dict[run_name] = {
            # for convenience/later use
            "run_id": id,
            "run_name": run_name,
            "guided_model": True if "_guided" in run_name else False,
            "n_examples": data.shape[0],
            # means
            "mean_score_0.33": data[f"{id}_score_0.33"].mean(),
            "mean_score_0.5": data[f"{id}_score_0.5"].mean(),
            "mean_score_0.67": data[f"{id}_score_0.67"].mean(),
            "invalid_output_rate": data[f"{id}_error?"].mean(),
            # totals
            "n_labels": data["n_extracted"].sum(), # ground truth
            "n_correct_names": data[f"{id}_correct_names"].sum(),
            "n_correct_asset_types": data[f"{id}_correct_asset_types"].sum(),
            "n_correct_sentiments": data[f"{id}_correct_sentiments"].sum(),
            "n_mistakes": data[f"{id}_n_mistakes"].sum(),
            "n_extra_neutrals": data[f"{id}_extra_neutrals"].sum(),
        }

# convert dicts to dfs
agg_data_df = pd.DataFrame.from_dict(agg_data, orient="index").reset_index(drop=True)
agg_data_without_zero_labels_df = pd.DataFrame.from_dict(agg_data_without_zero_labels, orient="index").reset_index(drop=True)

# save to csv
agg_data_df.to_csv("../data/inference_results/val_runs_scoring/scoring_results.csv", sep=";", index=False)
agg_data_without_zero_labels_df.to_csv("../data/inference_results/val_runs_scoring/scoring_results_without_zero_labels.csv", sep=";", index=False)



In [47]:
agg_data_df

Unnamed: 0,run_id,run_name,guided_model,n_examples,mean_score_0.33,mean_score_0.5,mean_score_0.67,invalid_output_rate,n_labels,n_correct_names,n_correct_asset_types,n_correct_sentiments,n_mistakes,n_extra_neutrals
0,0,llama3_ft_v3_q8_0_llamacpp_guided,True,150,0.700544,0.556582,0.41262,0.0,158,61,61,60,3,1
1,1,llama3_ft_v3_q8_0_llamacpp_unguided,False,150,0.702011,0.558804,0.415598,0.0,158,63,63,62,4,0
2,2,llama3_ft_v4_q8_0_llamacpp_guided,True,150,0.681362,0.565297,0.449231,0.0,158,97,95,85,17,6
3,3,llama3_ft_v4_q8_0_llamacpp_unguided,False,150,0.676896,0.561963,0.447031,0.0,158,97,95,85,18,5
4,4,llama3_q8_0_llamacpp_guided,True,150,0.542754,0.475486,0.408218,0.0,158,117,115,100,75,83
5,5,llama3_q8_0_llamacpp_unguided,False,150,0.551834,0.465203,0.378572,0.133333,158,85,84,71,47,69
6,6,mistral_q8_0_llamacpp_guided,True,150,0.674301,0.537426,0.400551,0.0,158,78,77,57,13,30
7,7,mistral_q8_0_llamacpp_unguided,False,150,0.666227,0.514889,0.363551,0.086667,158,28,28,22,7,7


In [48]:
agg_data_without_zero_labels_df

Unnamed: 0,run_id,run_name,guided_model,n_examples,mean_score_0.33,mean_score_0.5,mean_score_0.67,invalid_output_rate,n_labels,n_correct_names,n_correct_asset_types,n_correct_sentiments,n_mistakes,n_extra_neutrals
0,0,llama3_ft_v3_q8_0_llamacpp_guided,True,54,0.767252,0.666432,0.565611,0.0,158,61,61,60,2,0
1,1,llama3_ft_v3_q8_0_llamacpp_unguided,False,54,0.758919,0.663345,0.567771,0.0,158,63,63,62,4,0
2,2,llama3_ft_v4_q8_0_llamacpp_guided,True,54,0.788414,0.746194,0.703974,0.0,158,97,95,85,7,1
3,3,llama3_ft_v4_q8_0_llamacpp_unguided,False,54,0.788414,0.746194,0.703974,0.0,158,97,95,85,7,1
4,4,llama3_q8_0_llamacpp_guided,True,54,0.688762,0.709684,0.730607,0.0,158,117,115,100,23,20
5,5,llama3_q8_0_llamacpp_unguided,False,54,0.676762,0.653343,0.629924,0.203704,158,85,84,71,16,11
6,6,mistral_q8_0_llamacpp_guided,True,54,0.706762,0.622479,0.538197,0.0,158,78,77,57,11,18
7,7,mistral_q8_0_llamacpp_unguided,False,54,0.684333,0.559877,0.43542,0.222222,158,28,28,22,5,7
