In [16]:
import random
import pandas as pd
import os  
from tqdm import tqdm
import numpy as np
import shutil

In [9]:
def save_csv(identity_selection, save_path, file_path):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    identity_selection.to_csv(save_path + file_path, index=False)

Get all data

In [None]:
import pandas as pd
import os
def get_all_data_csv(data_path):
    '''get csv of all data, race, id, image_name'''
    coloumns = ['image_name', 'id', 'race']
    print(data_path)

    df = pd.DataFrame(columns=coloumns)
    all_image = []
    race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
    label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

    for race in race_to_label.keys():
        race_folder = os.path.join(data_path+'/', race)
        print(race_folder)
        race_label = race_to_label[race]
        for id_folder in os.listdir(race_folder):
            if not 'm.' in id_folder:
                continue
            all_id_image = []

            id_path = os.path.join(race_folder, id_folder)
            id = id_folder.split("/")[0]

            for image in os.listdir(id_path):
                all_id_image.append(image)
            for image in all_id_image:
                all_image.append([image, id, race_label])
        
    df = pd.DataFrame(all_image, columns=coloumns)

    return df

In [None]:
all_data_csv = get_all_data_csv("./race_per_7000")

Get balanced data

In [None]:
def get_balanced_per_race(race_df, image_per_id_limit=60, total_img=30000, id_per_race=500):
    '''
    get a new df with 60 image per id
    draw id_per_race uniformly 
    draw image_per_id uniformly
    '''
    # Group by ID and count the number of images per ID
    data_grouped = race_df.groupby("id").count()
    
    # Filter IDs with at least 'image_per_id_limit' images
    data_grouped = data_grouped[data_grouped["image_name"] >= image_per_id_limit]
    
    # Get a list of IDs that meet the image count criteria
    eligible_ids = data_grouped.index.tolist()
    
    # Shuffle the list of eligible IDs
    random.shuffle(eligible_ids)
    
    # Take the first 'id_per_race' IDs to ensure uniform distribution
    selected_ids = eligible_ids[:id_per_race]
    
    # Filter the DataFrame to include only rows with selected IDs
    filtered_id_df = race_df[race_df["id"].isin(selected_ids)]
    
    # Randomly sample images from each selected ID to meet the 'total_img' requirement
    num_selected_images = 0
    selected_rows = []
    
    for id in selected_ids:
        id_df = filtered_id_df[filtered_id_df["id"] == id]
        num_images_for_id = min(image_per_id_limit, total_img - num_selected_images)
        
        # Randomly sample 'num_images_for_id' images for the current ID
        sampled_rows = id_df.sample(n=num_images_for_id, random_state=42)
        selected_rows.extend(sampled_rows.values)
        num_selected_images += num_images_for_id
        
        if num_selected_images >= total_img:
            break
    
    # Create a new DataFrame from the selected rows
    new_df = pd.DataFrame(selected_rows, columns=['image_name', 'id', 'race'])
    
    return new_df

In [None]:
def get_balanced(data, image_per_id_limit=60, total_image=120000, id_per_race=500): 
    race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
    label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

    balanced_data = pd.DataFrame()

    for race in race_to_label.keys():
        race_label = race_to_label[race]
        cur_race_data = data[data['race']==race_label]
        race_balanced_df = get_balanced_per_race(cur_race_data, image_per_id_limit, total_image, id_per_race)
        balanced_data = pd.concat([balanced_data, race_balanced_df])
    
    return balanced_data

In [None]:
data = pd.read_csv("all_data.csv")

In [None]:
balanced_data = get_balanced(data, 60, 120000, 500)

In [None]:
print(len(balanced_data))
print(balanced_data.head())
print(balanced_data.groupby('id').count()['image_name'])
print(balanced_data.groupby('race').count()['image_name'])

In [None]:
save_csv(balanced_data, "./", "balanced_data.csv")

Filter out balanced data and get disjoint_balanced_data

In [None]:
balanced_data = pd.read_csv("./balanced_data.csv")
data = pd.read_csv("./all_data.csv")
#filter out everything in balanced_data
filtered_data = data[~data['id'].isin(balanced_data['id'])]

#filter out id with less than 20 images
filtered_data = filtered_data.groupby('id').filter(lambda x: len(x) >= 20)

#check number of id per race
id_per_race = filtered_data.groupby('race')['id'].nunique()

In [None]:
save_csv(filtered_data, "./", "disjoint_balanced_data.csv")

Sample biased datasets for each race

In [None]:
import random

def sample_by_race(race_data, id_per_race, image_per_id):
    total_img = id_per_race * image_per_id

    # Group by ID and count the number of images per ID
    data_grouped = race_data.groupby("id").count()
    
    # Get a list of IDs that meet the image count criteria
    eligible_ids = data_grouped.index.tolist()
    
    # Shuffle the list of eligible IDs
    random.shuffle(eligible_ids)
    
    # Take the first 'id_per_race' IDs to ensure uniform distribution
    selected_ids = eligible_ids[:id_per_race]
    
    # Filter the DataFrame to include only rows with selected IDs
    filtered_id_df = race_data[race_data["id"].isin(selected_ids)]
    
    # Randomly sample images from each selected ID to meet the 'total_img' requirement
    num_selected_images = 0
    selected_rows = []
    
    for id in selected_ids:
        id_df = filtered_id_df[filtered_id_df["id"] == id]
        num_images_for_id = min(image_per_id, total_img - num_selected_images)
        
        # Randomly sample 'num_images_for_id' images for the current ID
        sampled_rows = id_df.sample(n=num_images_for_id, random_state=42)
        selected_rows.extend(sampled_rows.values)
        num_selected_images += num_images_for_id
        
        if num_selected_images >= total_img:
            break
    
    # Create a new DataFrame from the selected rows
    new_df = pd.DataFrame(selected_rows, columns=['image_name', 'id', 'race'])

    return new_df

def generate_unbalanced_datasets(filtered_csv, output_dir, majority_num_ids=3600, minority_num_ids=780, image_per_id=20):
    race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
    label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}
    
    filtered_data = pd.read_csv(filtered_csv)
    coloumns = ['image_name', 'id', 'race']

    for race, label in race_to_label.items():
        cur_label = label
        cur_biased_data = pd.DataFrame(columns=coloumns)

        for iter_label, _ in label_to_race.items():
            if iter_label == cur_label:
                majority_race = filtered_data[filtered_data['race'] == cur_label]
                majority_sample = sample_by_race(majority_race, majority_num_ids, image_per_id)
                cur_biased_data = pd.concat([cur_biased_data, majority_sample])
            else:
                minority_sample = filtered_data[filtered_data['race'] == iter_label]
                minority_sample = sample_by_race(minority_sample, minority_num_ids, image_per_id)
                cur_biased_data = pd.concat([cur_biased_data, minority_sample])

        # Save the unbalanced data as CSV.
        cur_biased_data.to_csv(f'{output_dir}/unbalanced_{race}.csv', index=False)

In [None]:
generate_unbalanced_datasets("./disjoint_balanced_data.csv", "./", majority_num_ids=3600, minority_num_ids=780, image_per_id=20)

Check data leakage

In [None]:

african_unbalnced = pd.read_csv("./unbalanced_African.csv")
asian_unbalnced = pd.read_csv("./unbalanced_Asian.csv")
caucasian_unbalnced = pd.read_csv("./unbalanced_Caucasian.csv")
indian_unbalnced = pd.read_csv("./unbalanced_Indian.csv")

balanced_data = pd.read_csv("./balanced_data.csv")

In [None]:
print(african_unbalnced.groupby('id').count()['image_name'].value_counts())
print(asian_unbalnced.groupby('id').count()['image_name'].value_counts())
print("Caucasian unbalanced", caucasian_unbalnced.groupby('id').count()['image_name'].value_counts())
print("Indian unbalanced", indian_unbalnced.groupby('id').count()['image_name'].value_counts())

In [None]:
african_unbalnced.groupby('race')['id'].nunique()

In [None]:
#check identity leakage
african_unbalnced_id = african_unbalnced['id'].tolist()
asian_unbalnced_id = asian_unbalnced['id'].tolist()
caucasian_unbalnced_id = caucasian_unbalnced['id'].tolist()
indian_unbalnced_id = indian_unbalnced['id'].tolist()

balanced_data_id = balanced_data['id'].tolist()

print("number of identity overlap between african_unbalnced and balanced_data: ", len(set(african_unbalnced_id).intersection(balanced_data_id)))
print("number of identity overlap between asian_unbalnced and balanced_data: ", len(set(asian_unbalnced_id).intersection(balanced_data_id)))
print("number of identity overlap between caucasian_unbalnced and balanced_data: ", len(set(caucasian_unbalnced_id).intersection(balanced_data_id)))
print("number of identity overlap between indian_unbalnced and balanced_data: ", len(set(indian_unbalnced_id).intersection(balanced_data_id)))

Get Train and Test in Balanced


In [13]:
def mark_train_test(data, percentage_test_per_id=0.2):
    image_per_id = balanced_data.groupby('id').count()['image_name'][0]
    num_test_per_id = (image_per_id * percentage_test_per_id).astype(int)

    result_df = data.copy()
    result_df['split'] = 'train'
    unique_ids = data['id'].unique()

    for id in unique_ids:
        test_indices = data[data['id'] == id].head(num_test_per_id).index
        result_df.loc[test_indices, 'split'] = 'test'
    return result_df

In [14]:
#ignor the warning, it works fine 

balanced_data = pd.read_csv("./balanced_data.csv")
african_balanced = balanced_data[balanced_data['race'] == 3]
african_balanced_split = mark_train_test(african_balanced, 0.2)

save_csv(african_balanced_split, "./", "african_balanced_split.csv")

  image_per_id = balanced_data.groupby('id').count()['image_name'][0]


Save train and test image

In [17]:
def save_train_image(identity_selection, data_path, save_path, label_to_race):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    
    # Make a copy of the identity_selection
    identity_selection = identity_selection.copy()
    if 'split' in identity_selection.columns:
        identity_selection = identity_selection[identity_selection['split'] == 'train']

    for _, row in tqdm(identity_selection.iterrows(), total=len(identity_selection), desc="Processing Images"):
        image_name = row['image_name']
        id = row['id']
        race = label_to_race[row['race']]

        source_path = os.path.join(data_path, race, id, image_name)
        identity_save_path = os.path.join(save_path, id)

        # Create a subdirectory for the identity if it doesn't exist
        if not os.path.exists(identity_save_path):
            os.makedirs(identity_save_path)

        destination_path = os.path.join(identity_save_path, image_name)
        
        # Check if the image already exists
        if os.path.exists(destination_path):
            continue

        # Copy the image without converting
        shutil.copy(source_path, destination_path)

    print('Finished copying training images to folders in JPG format')

In [25]:
def save_test_image(identity_selection, data_path, save_path, label_to_race):
    if not os.path.exists(save_path):
        os.makedirs(save_path)    

    identity_selection = identity_selection.copy()
    
    if 'split' in identity_selection.columns:
        identity_selection = identity_selection[identity_selection['split'] == 'test']

    for _, row in tqdm(identity_selection.iterrows(), total=len(identity_selection), desc="Processing Images"):
        image_name = row['image_name']
        id = row['id']
        race = label_to_race[row['race']]

        source_path = os.path.join(data_path, race, id, image_name)

        # Save to each race folder
        race_path = os.path.join(save_path, race)
        if not os.path.exists(race_path):
            os.makedirs(race_path)

        identity_save_path = os.path.join(race_path, id)

        # Create a subdirectory for the identity if it doesn't exist
        if not os.path.exists(identity_save_path):
            os.makedirs(identity_save_path)

        destination_path = os.path.join(identity_save_path, image_name)
        
        # Check if the image already exists
        if os.path.exists(destination_path):
            continue

        # Copy the image without converting
        shutil.copy(source_path, destination_path)

    print('Finished copying images to folders in JPG format')

In [26]:
#just saving one race for now
data = pd.read_csv("./balanced_data_split.csv")
african_balanced_split = data[data['race']==3]
caucasian_balanced_split = data[data['race']==0]

african_and_cau = pd.concat([african_balanced_split, caucasian_balanced_split])


data_folder_path = "./race_per_7000"
save_path = "./"


race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

save_train_image(african_and_cau, data_folder_path, save_path+'/train', label_to_race)
save_test_image(african_and_cau, data_folder_path, save_path+'/test', label_to_race)

Processing Images: 100%|██████████| 48000/48000 [00:03<00:00, 14421.37it/s]


Finished copying training images to folders in JPG format


Processing Images: 100%|██████████| 12000/12000 [00:00<00:00, 14139.97it/s]

Finished copying images to folders in JPG format





#just getting mvp mini data

In [28]:
african_unbalanced = pd.read_csv("./unbalanced_African.csv")

#take first 50 id from african and first 10 caucasian 
first_50_african_uniq_id = african_unbalanced[african_unbalanced['race']==3]['id'].unique()[:50]
african_filter = african_unbalanced[african_unbalanced['id'].isin(first_50_african_uniq_id)]

first_10_cau_uniq_id = african_unbalanced[african_unbalanced['race']==0]['id'].unique()[:10]
cau_filter = african_unbalanced[african_unbalanced['id'].isin(first_10_cau_uniq_id)]

#combine
data = pd.concat([african_filter, cau_filter])


In [29]:

#get dataset
#just saving one race for now

data_folder_path = "./race_per_7000"
save_path = "./"


race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

save_train_image(data, data_folder_path, save_path+'/con_train_mini', label_to_race)
# save_test_image(data, data_folder_path, save_path+'/test', label_to_race)

KeyError: 'split'