# CSV Evaluator

### Environmental Setup

Import necessary modules.

In [3]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Define helper functions

In [7]:
def calculate_scores(y_true, y_pred, round_digits=2):
    precision = round(precision_score(y_true, y_pred, zero_division=0), round_digits)
    recall    = round(recall_score(y_true, y_pred, zero_division=0), round_digits)
    f1        = round(f1_score(y_true, y_pred, zero_division=0), round_digits)
    accuracy  = round(accuracy_score(y_true, y_pred), round_digits)
    
    return precision, recall, f1, accuracy

# Custom sorting function
def custom_sort(df):
    # Assign a priority column for sorting
    df['priority'] = df.apply(
        lambda row: 0 if row['Country'] == row['Race'] else 1 
                    if row['Race'] != 'total' else 2, axis=1
    )
    # Sort by Country, then by priority, and reset index
    df = df.sort_values(by=['Country', 'priority']).drop(columns='priority').reset_index(drop=True)
    return df

def highlight_row(row):
    if row['Country'] == row['Race']:
        return ['background-color: #690150'] * len(row) # data for original images are red
    
    if row['Race'] == 'total':
        return ['background-color: #016950'] * len(row) # data for total images are green
    
    return [''] * len(row)

### Cultural Identification Metrics

In [120]:
def cultural_identification_metrics(file_path, race_verbose=True, misclassified_verbose=False):
    # 1. Load the data
    data = pd.read_csv(file_path)
    
    # 2. Create 'Match' column. Match if `original_country` is mentioned in the `response`
    data['Match_country'] = data.apply(
        lambda row: 1 if any(term.lower().strip() in row['response'].lower() for term in row['original_country'].split(',')) else 0, 
        axis=1
    )
    
    # 3. Normalize the text of both `original_country` to ensure consistent grouping (e.g., remove duplicates, ordering effect, etc.)
    data['original_country'] = data['original_country'].apply(lambda x: ', '.join(sorted(set(term.strip() for term in x.split(',')))))        

    # Get unique combinations of country and race
    unique_combinations = data.groupby(['original_country', 'synthesized_race']).size().reset_index()
    unique_combinations.columns = ['original_country', 'synthesized_race', 'count']
    
    unique_countries = data['original_country'].unique()
    
    # Initialize results
    country_results = []
    misclassified_images = []
        
    # Add a row for the 'total'
    for country in unique_countries:
        # if there is only one synthesized_race for a country in unique_combinations, then skip adding 'total' row
        if len(unique_combinations[unique_combinations['original_country'] == country]) > 1:
            new_row = {'original_country': country, 'synthesized_race': 'total', 'count': 0}
            unique_combinations = pd.concat([unique_combinations, pd.DataFrame([new_row])], ignore_index=True)
    
    # Compute metrics for each country-race combination
    for index, row in unique_combinations.iterrows():
        country = row['original_country']
        race = row['synthesized_race']
        
        if race == 'total':
            subset = data[(data['original_country'] == country) & (data['original_country'] != data['synthesized_race'])]
        else:
            subset = data[(data['original_country'] == country) & (data['synthesized_race'] == race)]
        
        y_true = [1] * len(subset)  # Ground truth: should mention the country
        y_pred = subset['Match_country'].tolist()

        precision, recall, f1, accuracy = calculate_scores(y_true, y_pred)                

        # Identify misclassified samples
        if race != 'total':
            misclassified_indices = [i + (row['count'] * index) for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true != pred]  
            misclassified_images.extend(data.iloc[misclassified_indices]['image_file_name'].tolist())
                
        country_results.append({
            'Country': country,
            'Race': race,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'Accuracy': accuracy,
            'Correct Samples': sum(y_pred),    # sum of 1s
            'Total Samples': len(subset)      # total rows in this group
        })

    if misclassified_verbose:
            print(f"Misclassified images: {misclassified_images}")
            misclassified_data = data[data['image_file_name'].isin(misclassified_images)]
            misclassified_indices = misclassified_data.index.tolist()
            print(f"Misclassified indices: {misclassified_indices}")
            print(f"Misclassified data:\n {misclassified_data}")
                
    # Convert results to a DataFrame and sort
    country_results_df = custom_sort(pd.DataFrame(country_results))
    
    # # Apply highlighting and ensuring truncation
    country_results_df = country_results_df.style.apply(highlight_row, axis=1).format(precision=2)
    
    # only select rows with the races being 'total'
    if not race_verbose:
        country_results_df = country_results_df[country_results_df['Race'] == 'total']
                

    return country_results_df

You can provide a csv file with response data for
- original images of a single country/ multiple countries (eg. `images/Korea_Original_Food_Results.csv`)
- synthesized images of a single country/ multiple countries (eg. `responses/internVL/internVL_master_synthesized_food.csv`)
- original + synthesized images of a single country/multiple countries (eg. `responses/gpt4o/gpt4o_master_clothes.csv`)

Rows for original iamges are highlighted red, and rows for total are highlighted green.

In [121]:
# file_path = '../responses/gpt4o/gpt4o_master_clothes.csv'
file_path = '../responses/internVL/internVL_master_synthesized_food.csv'
# file_path = '../images/Korean_Food/Korea_Synthesized_Food_Results.csv'
results_df = cultural_identification_metrics(file_path, race_verbose=True, misclassified_verbose=False)
results_df

Unnamed: 0,Country,Race,Precision,Recall,F1 Score,Accuracy,Correct Samples,Total Samples
0,Korea,Asian,1.0,0.85,0.92,0.85,17,20
1,Korea,Black,1.0,0.8,0.89,0.8,16,20
2,Korea,Indian,1.0,0.75,0.86,0.75,15,20
3,Korea,White,1.0,0.8,0.89,0.8,16,20
4,Korea,total,1.0,0.8,0.89,0.8,64,80
5,Myanmar,Asian,1.0,0.05,0.1,0.05,1,20
6,Myanmar,Black,1.0,0.05,0.1,0.05,1,20
7,Myanmar,Indian,1.0,0.05,0.1,0.05,1,20
8,Myanmar,White,1.0,0.05,0.1,0.05,1,20
9,Myanmar,total,1.0,0.05,0.1,0.05,4,80


### Label Identification Metrics

In [122]:
def food_identification_metrics(file_path, race='original', misclassified_verbose=False):        
        data = pd.read_csv(file_path)
        assert race in ['original', 'Asian', 'Black', 'White', 'Indian', 'Caucasian'], "Invalid race value. Choose from 'original', 'Asian', 'Black', 'White', 'Indian', 'Caucasian'"
        
        # drop rows whose race doesn't match with race parameter
        if race == 'original':
                data = data[data['synthesized_race'] == data['original_country']]
        else:
                data = data[data['synthesized_race'] == race]

        try:
                data['Match_label'] = data.apply(
                                lambda row: 1 if any(term.lower().strip() in row['response'].lower() for term in row['label'].split(',')) else 0, 
                                axis=1
                        )

                data['label'] = data['label'].apply(lambda x: ', '.join(sorted(set(term.strip() for term in x.split(',')))))
        except KeyError:
                print(f"Label column not found in the given response csv file. Ensure that the file is for food data and there is a column named 'label'.")
                return None

        label_results = []
        misclassified_images = []
        unique_labels = data['label'].unique()

        for lbl in unique_labels:
                label_data = data[data['label'] == lbl]
                y_true = [1] * len(label_data)  # ground truth: should mention the label
                y_pred = label_data['Match_label'].tolist()

                precision, recall, f1, accuracy = calculate_scores(y_true, y_pred)

                # Identify misclassified samples
                misclassified_indices = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true != pred]
                
                # get the file image names of misclassified samples
                misclassified_images.extend(label_data.iloc[misclassified_indices]['image_file_name'].tolist())
                
                label_results.append({
                'Label': lbl,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'Accuracy': accuracy,
                'Correct Samples': sum(y_pred),
                'Total Samples': len(label_data)
                })
        
        if misclassified_verbose:
                misclassified_data = data[data['image_file_name'].isin(misclassified_images)]
                misclassified_indices = misclassified_data.index.tolist()
                print(f"Misclassified indices: {misclassified_indices}")
                print(f"Misclassified data:\n {misclassified_data}")
        # Convert label results to a DataFrame
        label_results_df = pd.DataFrame(label_results)

        # Calculate totals using pandas methods
        totals = pd.Series({
                'Label': 'total',
                'Precision': (label_results_df['Correct Samples'].sum() / label_results_df['Total Samples'].sum()),
                'Recall': (label_results_df['Correct Samples'].sum() / label_results_df['Total Samples'].sum()),
                'F1 Score': (label_results_df['Correct Samples'].sum() / label_results_df['Total Samples'].sum()),  # Same as precision/recall in this case
                'Accuracy': (label_results_df['Correct Samples'].sum() / label_results_df['Total Samples'].sum()),
                'Correct Samples': label_results_df['Correct Samples'].sum(),
                'Total Samples': label_results_df['Total Samples'].sum()
        })

        # Append totals row to the DataFrame
        label_results_df = pd.concat([label_results_df, pd.DataFrame([totals])], ignore_index=True)
        
        return label_results_df

In [123]:
file_path = '../responses/gpt4o/Myanmar_Food_Results.csv'
label_df = food_identification_metrics(file_path, race='original', misclassified_verbose=True)
label_df

Misclassified indices: [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 19]
Misclassified data:
    original_country synthesized_race      image_file_name  \
1           Myanmar          Myanmar   Myanmar_food_1.png   
2           Myanmar          Myanmar   Myanmar_food_2.png   
4           Myanmar          Myanmar   Myanmar_food_4.png   
5           Myanmar          Myanmar   Myanmar_food_5.png   
6           Myanmar          Myanmar   Myanmar_food_6.png   
7           Myanmar          Myanmar   Myanmar_food_7.png   
8           Myanmar          Myanmar   Myanmar_food_8.png   
9           Myanmar          Myanmar   Myanmar_food_9.png   
10          Myanmar          Myanmar  Myanmar_food_10.png   
11          Myanmar          Myanmar  Myanmar_food_11.png   
12          Myanmar          Myanmar  Myanmar_food_12.png   
16          Myanmar          Myanmar  Myanmar_food_16.png   
17          Myanmar          Myanmar  Myanmar_food_17.png   
19          Myanmar          Myanmar  Myanmar_food_19

Unnamed: 0,Label,Precision,Recall,F1 Score,Accuracy,Correct Samples,Total Samples
0,"Burmese cuisine, Burmese curry, Myanmar cuisin...",1.0,0.5,0.67,0.5,1,2
1,"Laphet Thoke, Tea leaf salad",1.0,0.33,0.5,0.33,1,3
2,"Mohinga, fish broth, fish soup, fish-based bro...",1.0,0.38,0.55,0.38,3,8
3,"Dote Htoe, Wat Thar Dote Htoe, pork offal skew...",0.0,0.0,0.0,0.0,0,4
4,"Ohn No Khao Swe, coconut curry, coconut curry ...",1.0,0.5,0.67,0.5,1,2
5,"Khao Soi Thoke, Nan Gyi Thoke, Noodle salad",0.0,0.0,0.0,0.0,0,1
6,total,0.3,0.3,0.3,0.3,6,20
