# Processing results from different VLMs

In [1]:
#imports
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


## Local VLMs for scenario prediction (from frames)

In [2]:
#go through all files that start with 'results_', then turn it into a pandas dataframe, 
# #then put everything into a single df where the prediction if the mode of the frames
def load_results(directory):
    results = pd.DataFrame()
    full_results = pd.DataFrame()
    for filename in os.listdir(directory):
        if filename.startswith('results_'):
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath)
            #add column in the beginning with the filename without 'results_' and '.csv'
            df['model'] = filename[8:-4]
            #make it first column
            df = df[['model'] + [col for col in df.columns if col != 'model']]

            #group by video_name and take the mode of the predictions
            df_grouped = df.groupby('video_name').agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan).reset_index()
            #add the grouped dataframe to the full results
            full_results = pd.concat([full_results, df_grouped], ignore_index=True)

            results = pd.concat([results, df], ignore_index=True)

    #group by filename, by video, and take the mode of the predictions

    return results, full_results

In [3]:
path = '../results/'
results, full_results = load_results(path)

#remove NaN values from the results
print("Removing NaN values from the results...")
print(results.shape)
#drop rows if 'outcome_prediction' contains word "Error"
results = results[~results['outcome_prediction'].str.contains("Error", na=False)]
print(results.shape)
print(full_results.shape)
full_results = full_results[~full_results['outcome_prediction'].str.contains("Error", na=False)]
print("Full results after removing NaN values:")
print(full_results.shape)

#save the results to a csv file
results.to_csv('all_predictions.csv', index=False)
#save the full results to a csv file
full_results.to_csv('all_predictions_grouped.csv', index=False)

#see how many unique videos are for each model
unique_videos_per_model = results.groupby('model')['video_name'].nunique().reset_index()
print(unique_videos_per_model)

Removing NaN values from the results...
(4712, 4)
(3534, 4)
(240, 4)
Full results after removing NaN values:
(180, 4)
           model  video_name
0         gemma3          30
1     gemma3_27b          30
2  llama32vision          30
3          llava          30
4    llavallama3          30
5         qwen25          30


In [4]:
#now, per model, print how many videos are "Poorly" and how many are "Well", accounting for small variations in the writing
for model in full_results['model'].unique():
    print(f"Model: {model}")
    model_results = full_results[full_results['model'] == model]
    poorly_count = model_results['outcome_prediction'].str.contains('Poorly', case=False, na=False).sum()
    well_count = model_results['outcome_prediction'].str.contains('Well', case=False, na=False).sum()
    print(f"  Poorly: {poorly_count}")
    print(f"  Well: {well_count}")

    

Model: llama32vision
  Poorly: 30
  Well: 0
Model: llavallama3
  Poorly: 5
  Well: 25
Model: gemma3
  Poorly: 30
  Well: 0
Model: qwen25
  Poorly: 14
  Well: 16
Model: gemma3_27b
  Poorly: 30
  Well: 0
Model: llava
  Poorly: 25
  Well: 5


In [5]:
#for outcome_prediction, replace poorly with 1 and well with 0
#if it contains "poorly", make new list with 1, if it contains "well", make new list with 0
full_results['outcome_prediction_numeric'] = full_results['outcome_prediction'].apply(
    lambda x: 1 if 'poorly' in str(x).lower() else (0 if 'well' in str(x).lower() else np.nan)
)
#save the full results with numeric outcome prediction to a csv file
full_results.to_csv('all_predictions_grouped.csv', index=False)


full_results


Unnamed: 0,video_name,model,frame,outcome_prediction,outcome_prediction_numeric
30,11_final.mp4,llama32vision,0,Poorly.,1
31,12_final.mp4,llama32vision,0,Poorly.,1
32,14_final.mp4,llama32vision,0,Poorly.,1
33,15_final.mp4,llama32vision,0,Poorly.,1
34,19_final.mp4,llama32vision,0,Poorly.,1
...,...,...,...,...,...
205,59_final.mp4,llava,0,Well,0
206,60_final.mp4,llava,0,Poorly,1
207,6_final.mp4,llava,0,Poorly,1
208,7_final.mp4,llava,0,Poorly,1


In [6]:
#now, get the groundtruth and see if they got it right
#open csv with columns Video,Question Mapping,Average Class of Human Predicion,True Outcome,

gt_df = pd.read_csv('../../dataset_scenarios/analyze_predictions.csv')

#map each video to its true outcome
gt_df = gt_df[['Video', 'True Outcome']].rename(columns={'Video': 'video_name', 'True Outcome': 'true_outcome'})
#video are name without the .mp4 extension, so we need to add it to each one
gt_df['video_name'] = gt_df['video_name'].apply(lambda x: x + '.mp4' if not x.endswith('.mp4') else x)
#print type of true_outcome
gt_df['true_outcome'] = gt_df['true_outcome'].astype(int)
#dictionary to map video names to true outcomes
gt_dict = dict(zip(gt_df['video_name'], gt_df['true_outcome']))

#add the "true_outcome" column to the full_results dataframe
full_results['true_outcome'] = full_results['video_name'].map(gt_dict)

full_results


FileNotFoundError: [Errno 2] No such file or directory: '../../dataset_scenarios/analyze_predictions.csv'

In [None]:

#models and metrics df
prediction_performance_df = pd.DataFrame(columns=['model', 'accuracy', 'precision', 'recall', 'f1_score'])


#now, go video by video and see if the model got it right
correct_predictions = []
for model in full_results['model'].unique():
    model_results = full_results[full_results['model'] == model]
    y_pred = model_results['outcome_prediction_numeric']
    y_true = []
    for index, row in model_results.iterrows():
        video_name = row['video_name']
        true_outcome = gt_dict.get(video_name, np.nan)
        y_true.append(true_outcome)

    
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    #print(y_pred)
    #print(y_true)

    #get accuracy, precision, recall, f1 score
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"Model: {model}")
    print(f"  Accuracy: {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")
    print(f"  F1 Score: {f1:.2f}")

    df_pred = pd.DataFrame({
        'model': [model],
        'accuracy': [accuracy],
        'precision': [precision],
        'recall': [recall],
        'f1_score': [f1]
    })
    prediction_performance_df = pd.concat([prediction_performance_df, df_pred], ignore_index=True)

#save the prediction performance to a csv file

prediction_performance_df.to_csv('prediction_performance.csv', index=False)
#save full results with true outcome to a csv file
full_results.to_csv('all_predictions_grouped_with_true_outcome.csv', index=False)

prediction_performance_df


Model: llama32vision
  Accuracy: 0.43
  Precision: 0.43
  Recall: 1.00
  F1 Score: 0.60
Model: llavallama3
  Accuracy: 0.53
  Precision: 0.40
  Recall: 0.15
  F1 Score: 0.22
Model: gemma3
  Accuracy: 0.43
  Precision: 0.43
  Recall: 1.00
  F1 Score: 0.60
Model: qwen25
  Accuracy: 0.50
  Precision: 0.43
  Recall: 0.46
  F1 Score: 0.44
Model: gemma3_27b
  Accuracy: 0.43
  Precision: 0.43
  Recall: 1.00
  F1 Score: 0.60
Model: llava
  Accuracy: 0.33
  Precision: 0.36
  Recall: 0.69
  F1 Score: 0.47


  prediction_performance_df = pd.concat([prediction_performance_df, df_pred], ignore_index=True)


Unnamed: 0,model,accuracy,precision,recall,f1_score
0,llama32vision,0.433333,0.433333,1.0,0.604651
1,llavallama3,0.533333,0.4,0.153846,0.222222
2,gemma3,0.433333,0.433333,1.0,0.604651
3,qwen25,0.5,0.428571,0.461538,0.444444
4,gemma3_27b,0.433333,0.433333,1.0,0.604651
5,llava,0.333333,0.36,0.692308,0.473684
