# CSV Evaluator

### Environmental Setup

Import necessary modules.

In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Define helper functions

In [2]:
def calculate_scores(y_true, y_pred, round_digits=3):
    precision = round(precision_score(y_true, y_pred, zero_division=0), round_digits)
    recall    = round(recall_score(y_true, y_pred, zero_division=0), round_digits)
    f1        = round(f1_score(y_true, y_pred, zero_division=0), round_digits)
    accuracy  = round(accuracy_score(y_true, y_pred), round_digits)
    
    return precision, recall, f1, accuracy

# Custom sorting function
def custom_sort(df):
    # Assign a priority column for sorting
    df['priority'] = df.apply(
        lambda row: 0 if row['Country'] == row['Race'] else 1 
                    if row['Race'] != 'total' else 2, axis=1
    )
    # Sort by Country, then by priority, and reset index
    df = df.sort_values(by=['Country', 'priority']).drop(columns='priority').reset_index(drop=True)
    return df

def consolidate_one_country_name(country_name):
    if country_name.startswith('American'):
        return 'US'
    elif country_name.startswith('Britain'):
        return 'UK'

def highlight_row(row):
    if row['Country'] == row['Race']:
        return ['background-color: #690150'] * len(row) # data for original images are red
    
    if row['Race'] == 'Synthesized Total':
        return ['background-color: #016950'] * len(row) # data for total images are green
    
    return [''] * len(row)

### Cultural Identification Metrics

In [3]:
def cultural_identification_metrics(file_path, race_verbose=True, misclassified_verbose=False):
    # 1. Load the data
    data = pd.read_csv(file_path)
    
    # 2. Create 'Match' column. Match if `original_country` is mentioned in the `response`
    data['Match_country'] = data.apply(
        lambda row: 1 if any(term.lower().strip() in row['response'].lower() for term in row['original_country'].split(',')) else 0, 
        axis=1
    )
    
    # 3. Normalize the text of both `original_country` to ensure consistent grouping (e.g., remove duplicates, ordering effect, etc.)
    data['original_country'] = data['original_country'].apply(lambda x: ', '.join(sorted(set(term.strip() for term in x.split(',')))))        

    data['original_country'] = data['original_country'].apply(consolidate_one_country_name) 
    
    # Get unique combinations of country and race
    unique_combinations = data.groupby(['original_country', 'synthesized_race']).size().reset_index()
    unique_combinations.columns = ['original_country', 'synthesized_race', 'count']
    
    unique_countries = data['original_country'].unique()
    
    # Initialize results
    country_results = []
    misclassified_images = []
        
    # Add a row for the 'total'
    for country in unique_countries:
        # if there is only one synthesized_race for a country in unique_combinations, then skip adding 'total' row
        if len(unique_combinations[unique_combinations['original_country'] == country]) > 1:
            new_row = {'original_country': country, 'synthesized_race': 'Synthesized Total', 'count': 0}
            unique_combinations = pd.concat([unique_combinations, pd.DataFrame([new_row])], ignore_index=True)
    

    # Compute metrics for each country-race combination
    for index, row in unique_combinations.iterrows():
        country = row['original_country']
        race = row['synthesized_race']
        
        if race == 'Synthesized Total':
            subset = data[(data['original_country'] == country) & (data['original_country'] != data['synthesized_race'])]
        else:
            subset = data[(data['original_country'] == country) & (data['synthesized_race'] == race)]
        
        y_true = [1] * len(subset)  # Ground truth: should mention the country
        y_pred = subset['Match_country'].tolist()

        precision, recall, f1, accuracy = calculate_scores(y_true, y_pred)                

        # Identify misclassified samples
        if race != 'total':
            misclassified_indices = [i + (row['count'] * index) for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true != pred]  
            misclassified_images.extend(data.iloc[misclassified_indices]['image_file_name'].tolist())
                
        country_results.append({
            'Country': country,
            'Race': race,
            # 'Precision': precision,
            # 'Recall': recall,
            # 'F1 Score': f1,
            'Accuracy': accuracy,
            'Correct Samples': sum(y_pred),    # sum of 1s
            'Total Samples': len(subset)      # total rows in this group
        })

    if misclassified_verbose:
            print(f"Misclassified images: {misclassified_images}")
            misclassified_data = data[data['image_file_name'].isin(misclassified_images)]
            misclassified_indices = misclassified_data.index.tolist()
            print(f"Misclassified indices: {misclassified_indices}")
            print(f"Misclassified data: ")
            for response in misclassified_data['response']:
                print(response)
   

    # Convert results to a DataFrame and sort
    country_results_df = custom_sort(pd.DataFrame(country_results))
    
    # # Apply highlighting and ensuring truncation
    country_results_df = country_results_df.style.apply(highlight_row, axis=1).format(precision=2)
    
    # only select rows with the races being 'total'
    if not race_verbose:
        country_results_df = country_results_df[country_results_df['Race'] == 'Synthesized Total']
                

    return country_results_df

You can provide a csv file with response data for
- original images of a single country/ multiple countries (eg. `images/Korea_Original_Food_Results.csv`)
- synthesized images of a single country/ multiple countries (eg. `responses/internVL/internVL_master_synthesized_food.csv`)
- original + synthesized images of a single country/multiple countries (eg. `responses/gpt4o/gpt4o_master_clothes.csv`)

Rows for original iamges are highlighted red, and rows for total are highlighted green.

In [5]:
file_path = '../responses/glm4v/US_Food_Results.csv'
results_df = cultural_identification_metrics(file_path, race_verbose=True, misclassified_verbose=False)
display(results_df)

Unnamed: 0,Country,Race,Accuracy,Correct Samples,Total Samples
0,US,US,0.91,30,33
1,US,African,0.97,32,33
2,US,Caucasian,0.94,31,33
3,US,EastAsian,0.91,30,33
4,US,SouthAsian,0.88,29,33
5,US,Synthesized Total,0.92,122,132


## Label Identification Metrics

In [6]:
import os
from zhipuai import ZhipuAI
from dotenv import load_dotenv

load_dotenv()
GLM_MODEL_API_KEY = os.getenv('GLM_MODEL_API_KEY')
client = ZhipuAI(api_key=GLM_MODEL_API_KEY)

# Define the function
def heuristics_match_label(row):
    terms = [term.lower().strip() for term in row['label'].split(',')]
    if any(term in row['response'].lower() for term in terms):
        #     print(f"Filename: {row['image_file_name']} Matched: {terms} in {row['response']}")
            return 1
    
    return 0

def ai_match_label(row):
        system = '''
        You'll be provided with a label and a response by a multi-modal LLM that identifies the name of the food in a picture. 
        Your task is to determine whether the food name contained in the response can be considered as correct given the ground-truth label. 
        under label. Consider it as correct ('Yes') if you think both refers to the same food semantically either in native language or in English.
        Otherwise, 'No'. They don't need to match exactly, but if the answer is too generic, don't accept either. Answer only 'Yes' or 'No'.
        '''
        prompt = f'System: {system} | Response: {row['response']} | Label: {row['label']}'

        answer = client.chat.completions.create(
                model='glm-4-plus',
                messages=[
                {
                        "role": "user",
                        "content": [
                        {
                                "type": "text",
                                "text": prompt,                                
                        },
                        ],
                }
                ],
                max_tokens=10,
        )
        
        if answer.choices[0].message.content.lower() == 'yes':
                # print(f'File: {row['image_file_name']} | Label: {row['label']} |  Response: {row['response']}')
                return 1
        
        return 0

In [7]:
def food_identification_metrics(file_path, race, master_label_df=None, misclassified_verbose=False):        
        data = pd.read_csv(file_path)
        assert race in ['original', 'African', 'EastAsian', 'SouthAsian', 'Caucasian'], "Invalid race value. Choose from 'original', 'Asian', 'Black', 'White', 'Indian', 'Caucasian'"
        
        # drop rows whose race doesn't match with race parameter
        if race == 'original':
                data = data[~data['synthesized_race'].isin(['African', 'EastAsian', 'SouthAsian', 'Caucasian'])]
        else:
                data = data[data['synthesized_race'] == race]

        try:
                data['Match_label'] = data.apply(ai_match_label,axis=1)
                data['label'] = data['label'].apply(lambda x: ', '.join(sorted(set(term.strip() for term in x.split(',')))))
        except KeyError:
                print(f"Label column not found in the given response csv file. Ensure that the file is for food data and there is a column named 'label'.")
                return None

        label_results = []
        misclassified_images = []
        unique_labels = data['label'].unique()

        for lbl in unique_labels:
                label_data = data[data['label'] == lbl]
                y_true = [1] * len(label_data)  
                y_pred = label_data['Match_label'].tolist()

                precision, recall, f1, accuracy = calculate_scores(y_true, y_pred)
                
                # Identify misclassified samples
                misclassified_indices = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true != pred]
                
                # get the file image names of misclassified samples
                misclassified_images.extend(label_data.iloc[misclassified_indices]['image_file_name'].tolist())
                
                label_results.append({
                'Label': lbl,
                'Total Samples': len(label_data),
                race: accuracy,
                'Correct Samples': sum(y_pred),
                })
        
        total_misclassified_images.extend(misclassified_images)
        
        if misclassified_verbose:
                misclassified_data = data[data['image_file_name'].isin(misclassified_images)]
                misclassified_indices = misclassified_data.index.tolist()
                print(f"Misclassified indices: {misclassified_indices}")
                print(f"Misclassified data:\n {misclassified_data}")
        # Convert label results to a DataFrame
        label_results_df = pd.DataFrame(label_results)

        # Calculate totals using pandas methods
        totals = pd.Series({
                'Label': 'total',
                'Total Samples': label_results_df['Total Samples'].sum(),
                race: round(label_results_df['Correct Samples'].sum() / label_results_df['Total Samples'].sum(), 3),
                'Correct Samples': label_results_df['Correct Samples'].sum(),
        })

        # Append totals row to the DataFrame
        label_results_df = pd.concat([label_results_df, pd.DataFrame([totals])], ignore_index=True)
        # drop correct samples column
        label_results_df = label_results_df.drop(columns='Correct Samples')
        # if there is master label df to be combined with label_results_df, combine based on the 'Label' and  'Total samples' columns
        if master_label_df is not None:
                label_results_df = pd.merge(master_label_df, label_results_df, on=['Label', 'Total Samples'], how='outer')
        
        return label_results_df

### Evaluate

In [9]:
model = 'internVL'
category = 'US_Food'
file_path = f'../responses/{model}/{category}_Results.csv'


master_label_df = None
total_misclassified_images = []
races = ['original', 'African', 'EastAsian', 'SouthAsian', 'Caucasian']

for race in races: 
    label_df = food_identification_metrics(file_path, race, master_label_df=master_label_df, misclassified_verbose=False)
    master_label_df = label_df

# write total_misclassified_images to a txt file
with open(f'misclassified_food/{model}_{category}.txt', 'w') as f:
    for item in total_misclassified_images:
        f.write("%s\n" % item)

master_label_df

AssertionError: Invalid race value. Choose from 'original', 'Asian', 'Black', 'White', 'Indian', 'Caucasian'

### Testing

In [15]:
# Function to create a request for each image
def evaluate_response(client, prompt):
    answer = client.chat.completions.create(
        model='glm-4-plus',
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,                                
                    },
                ],
            }
        ],
        max_tokens=100,
    )
    return answer

In [None]:
client = ZhipuAI(api_key=GLM_MODEL_API_KEY)
system = '''
You'll be provided with a label and a response by a multi-modal LLM that identifies the name of the food in a picture. 
Your task is to determine whether the food name contained in the response can be considered as correct given the ground-truth label. 
under label. Consider it as correct ('Yes') if you think both refers to the same food semantically either in native language or in English.
Otherwise, 'No'. They don't need to match exactly. Answer only 'Yes' or 'No'.
'''
response = "The food in the photo is called ""Mee Goreng"" and it is mostly associated with Malaysia and Indonesia."
label = 'Nan Gyi Thoke or Noodle salad,Khao Soi Thoke'
prompt = f'System: {system} | Response: {response} | Label: {label}'
answer = evaluate_response(client, prompt)
answer.choices[0].message.content

'No'