# Processing results from different VLMs

In [5]:
#imports
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Local VLMs for scenario prediction (from frames)

In [6]:
# Go through all files named 'results_<model>.csv', load each into a pandas DataFrame,
# then build a second table in which every video's prediction is the mode of its frames.
def _mode_or_nan(values):
    """Return the first mode of ``values``, or NaN when ``Series.mode()`` is empty."""
    modes = values.mode()  # computed once; mode() returns a Series with a fresh 0..k index
    return modes.iloc[0] if not modes.empty else np.nan


def load_results(directory):
    """Load every ``results_<model>.csv`` file found directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive). Only names that start with ``results_``
        and end with ``.csv`` (case-insensitive) are read; everything else is skipped.

    Returns
    -------
    results : pandas.DataFrame
        All rows of all matching files stacked together, with a leading ``model``
        column holding the file name stripped of ``results_`` and ``.csv``.
    full_results : pandas.DataFrame
        One row per (file, ``video_name``): every other column is reduced to its
        mode within the video (the first value returned by ``Series.mode()``,
        NaN when there is no mode).

    Both frames are empty when no matching file exists. Each CSV must contain a
    ``video_name`` column, otherwise the ``groupby`` raises ``KeyError``.
    """
    prefix, suffix = 'results_', '.csv'
    per_frame = []   # one DataFrame per file, raw rows
    per_video = []   # one DataFrame per file, aggregated by video

    # os.listdir returns names in arbitrary order; sort so the output is reproducible.
    for filename in sorted(os.listdir(directory)):
        # Require the suffix as well: the model name is obtained by slicing it off,
        # and non-CSV files sharing the prefix must not be parsed as predictions.
        if not (filename.startswith(prefix) and filename.lower().endswith(suffix)):
            continue

        df = pd.read_csv(os.path.join(directory, filename))
        # Tag rows with the model name and move that column to the front.
        df['model'] = filename[len(prefix):-len(suffix)]
        df = df[['model'] + [col for col in df.columns if col != 'model']]

        # Collapse the rows of each video into a single row using the mode.
        per_video.append(df.groupby('video_name').agg(_mode_or_nan).reset_index())
        per_frame.append(df)

    if not per_frame:
        # pd.concat rejects an empty list; keep the "nothing found" result as empty frames.
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once at the end instead of growing the frames inside the loop.
    results = pd.concat(per_frame, ignore_index=True)
    full_results = pd.concat(per_video, ignore_index=True)
    return results, full_results

In [7]:
# Collect the per-model result files that sit next to this notebook.
path = '.'
results, full_results = load_results(path)

# Persist both tables: the raw per-frame predictions first,
# then the per-video table produced by the mode aggregation.
results.to_csv('all_predictions.csv', index=False)
full_results.to_csv('all_predictions_grouped.csv', index=False)

# Count how many distinct videos each model produced predictions for.
unique_videos_per_model = (
    results
    .groupby('model')['video_name']
    .nunique()
    .reset_index()
)
print(unique_videos_per_model)

           model  video_name
0         gemma3          30
1     gemma3_27b          30
2  llama32vision           9
3          llava          30
4         qwen25          30


In [8]:
# For each model, report how many videos were predicted "Poorly" and how many "Well".
# Matching is a case-insensitive substring search so small variations in the wording
# still count; missing predictions (NaN) count as neither.
for model in full_results['model'].unique():
    print(f"Model: {model}")
    belongs_to_model = full_results['model'] == model
    model_results = full_results[belongs_to_model]
    outcome_text = model_results['outcome_prediction'].str
    poorly_count = outcome_text.contains('Poorly', case=False, na=False).sum()
    well_count = outcome_text.contains('Well', case=False, na=False).sum()
    print(f"  Poorly: {poorly_count}")
    print(f"  Well: {well_count}")
    

Model: llama32vision
  Poorly: 9
  Well: 0
Model: gemma3
  Poorly: 30
  Well: 0
Model: qwen25
  Poorly: 14
  Well: 16
Model: gemma3_27b
  Poorly: 30
  Well: 0
Model: llava
  Poorly: 25
  Well: 5
