In [19]:
import os
from PIL import Image
import pandas as pd

In [5]:
def save_csv(identity_selection, save_path, file_path):
    """Write a DataFrame as CSV (without the index) inside ``save_path``.

    Args:
        identity_selection: DataFrame to serialise.
        save_path: Destination directory; created, with parents, if missing.
        file_path: File name relative to ``save_path``.
    """
    os.makedirs(save_path, exist_ok=True)
    # Join with a real path separator: plain string concatenation produced
    # names such as "traindata.csv" whenever save_path had no trailing slash.
    # Leading separators are stripped so a file_path of "/data.csv" still lands
    # inside save_path instead of at the filesystem root.
    destination = os.path.join(save_path, file_path.lstrip("/\\"))
    identity_selection.to_csv(destination, index=False)

In [6]:
import random
def get_data_uniformly(data, num_image=10):
    """Return ``num_image`` distinct elements drawn at random from ``data``.

    Thin wrapper around ``random.sample``; ``data`` must be a sequence holding
    at least ``num_image`` elements.
    """
    return random.sample(data, num_image)

In [7]:
def get_balanced_data(folder_path, race, race_label, image_per_id=10):
    """Build a per-identity balanced image listing for one race folder.

    Scans the ``race`` sub-folder of ``folder_path`` for identity folders
    (names containing ``'m.'``), skips identities holding fewer than
    ``image_per_id`` entries, and randomly keeps exactly ``image_per_id``
    file names for each remaining identity.

    Args:
        folder_path: Root directory holding one sub-folder per race.
        race: Name of the race sub-folder to scan.
        race_label: Value stored in the ``race`` column of every returned row.
        image_per_id: Number of images kept per identity; also the minimum an
            identity needs in order to be included.

    Returns:
        DataFrame with columns ``image_name``, ``id`` and ``race``.
    """
    columns = ['image_name', 'id', 'race']
    race_folder = os.path.join(folder_path, race)
    rows = []

    # Sorted listings make the draw repeatable under a fixed random.seed();
    # os.listdir() order is otherwise arbitrary.
    for identity in sorted(os.listdir(race_folder)):
        if 'm.' not in identity:
            continue
        id_path = os.path.join(race_folder, identity)
        # A stray file whose name contains 'm.' would make os.listdir() raise.
        if not os.path.isdir(id_path):
            continue

        # One listing serves both the size check and the sampling.
        # NOTE(review): every directory entry counts as an image, so hidden
        # files (if any) are counted and can be sampled — confirm the folders
        # contain images only.
        image_names = sorted(os.listdir(id_path))
        if len(image_names) < image_per_id:
            continue

        for image_name in get_data_uniformly(image_names, image_per_id):
            rows.append([image_name, identity, race_label])

    return pd.DataFrame(rows, columns=columns)

In [8]:
def pick_identity_uniformly(data, num_id=3000):
    """Keep every row belonging to ``num_id`` randomly chosen identities.

    The distinct values of the ``id`` column are sampled without replacement
    via ``random.sample``; the surviving rows are returned with a fresh
    0..n-1 index.
    """
    candidate_ids = list(data['id'].unique())
    chosen_ids = random.sample(candidate_ids, num_id)
    belongs_to_chosen = data['id'].isin(chosen_ids)
    return data[belongs_to_chosen].reset_index(drop=True)

In [9]:
def get_id_select(folder_path, race, race_label, image_per_id=10, num_id=3000):
    """Select ``num_id`` random identities of one race with ``image_per_id`` images each.

    Chains ``get_balanced_data`` (per-identity image sampling) with
    ``pick_identity_uniformly`` (identity sampling) and returns the resulting
    DataFrame.
    """
    print("Selecting IDs from", race)
    return pick_identity_uniformly(
        get_balanced_data(folder_path, race, race_label, image_per_id),
        num_id,
    )

In [10]:
def mark_train_test(identity_selection, num_test_per_id=2):
    """Return a copy of the selection with a ``split`` column added.

    For every identity, its first ``num_test_per_id`` rows (in row order) are
    labelled ``'test'`` and all remaining rows ``'train'``. The input frame is
    not modified.

    Args:
        identity_selection: DataFrame with an ``id`` column.
        num_test_per_id: Number of leading rows per identity marked as test.

    Returns:
        Copy of ``identity_selection`` with the extra ``split`` column.
    """
    result_df = identity_selection.copy()
    # Rank every row within its identity in a single pass instead of filtering
    # the whole frame once per id. The labels are assigned by position, so rows
    # are marked correctly even when the index contains duplicate labels.
    rank_within_id = identity_selection.groupby('id', sort=False).cumcount()
    result_df['split'] = [
        'test' if rank < num_test_per_id else 'train'
        for rank in rank_within_id
    ]
    return result_df

In [11]:
from tqdm import tqdm 

def save_train_image(identity_selection, data_path, save_path, label_to_race):
    """Convert the ``train`` rows of a selection to PNG, one folder per identity.

    For every row whose ``split`` is ``'train'`` the source image at
    data_path/race/id/image_name is re-encoded as PNG and written to
    save_path/id/. Destination files that already exist are skipped, so an
    interrupted run can be resumed.

    Args:
        identity_selection: DataFrame with ``image_name``, ``id``, ``race``
            and ``split`` columns.
        data_path: Root of the source dataset (one sub-folder per race).
        save_path: Output root; created if missing.
        label_to_race: Mapping from the ``race`` label to the race folder name.
    """
    os.makedirs(save_path, exist_ok=True)

    # Boolean filtering already yields a new frame; the caller's data is untouched.
    train_rows = identity_selection[identity_selection['split'] == 'train']

    for _, row in tqdm(train_rows.iterrows(), total=len(train_rows), desc="Processing Images"):
        image_name = row['image_name']
        identity = row['id']
        race = label_to_race[row['race']]

        source_path = os.path.join(data_path, race, identity, image_name)
        identity_save_path = os.path.join(save_path, identity)
        os.makedirs(identity_save_path, exist_ok=True)

        # Swap the actual extension instead of str.replace('.jpg', '.png'),
        # which left '.jpeg' / '.JPG' names unchanged (PNG data under a JPEG
        # name) and also rewrote any '.jpg' occurring in the middle of a name.
        stem, _extension = os.path.splitext(image_name)
        destination_path = os.path.join(identity_save_path, stem + '.png')

        # Already converted on a previous run.
        if os.path.exists(destination_path):
            continue

        with Image.open(source_path) as img:
            img.save(destination_path, "PNG")

    print('Finished copying and converting images to folder')

In [17]:
def save_test_image(identity_selection, data_path, save_path, label_to_race):
    """Convert the ``test`` rows of a selection to PNG, grouped by race and identity.

    For every row whose ``split`` is ``'test'`` the source image at
    data_path/race/id/image_name is re-encoded as PNG and written to
    save_path/race/id/. Destination files that already exist are skipped, so
    an interrupted run can be resumed.

    Args:
        identity_selection: DataFrame with ``image_name``, ``id``, ``race``
            and ``split`` columns.
        data_path: Root of the source dataset (one sub-folder per race).
        save_path: Output root; created if missing.
        label_to_race: Mapping from the ``race`` label to the race folder name.
    """
    os.makedirs(save_path, exist_ok=True)

    # Boolean filtering already yields a new frame; the caller's data is untouched.
    test_rows = identity_selection[identity_selection['split'] == 'test']

    for _, row in tqdm(test_rows.iterrows(), total=len(test_rows), desc="Processing Images"):
        image_name = row['image_name']
        identity = row['id']
        race = label_to_race[row['race']]

        source_path = os.path.join(data_path, race, identity, image_name)
        # Test images are stored per race, then per identity; makedirs creates
        # both levels in one call.
        identity_save_path = os.path.join(save_path, race, identity)
        os.makedirs(identity_save_path, exist_ok=True)

        # Swap the actual extension instead of str.replace('.jpg', '.png'),
        # which left '.jpeg' / '.JPG' names unchanged (PNG data under a JPEG
        # name) and also rewrote any '.jpg' occurring in the middle of a name.
        stem, _extension = os.path.splitext(image_name)
        destination_path = os.path.join(identity_save_path, stem + '.png')

        # Already converted on a previous run.
        if os.path.exists(destination_path):
            continue

        with Image.open(source_path) as img:
            img.save(destination_path, "PNG")

    print('Finished copying and converting images to folder')
    

In [12]:
def save_id_csv(data_folder_path, race_to_label, save_path, image_per_id=10, num_id=3000):
    """Build the train/test identity selection for every race and save it as ``data.csv``.

    For each race in ``race_to_label`` this selects identities with
    ``get_id_select``, labels their rows with ``mark_train_test`` and finally
    writes all races together to ``data.csv`` inside ``save_path``.

    Args:
        data_folder_path: Root directory holding one sub-folder per race.
        race_to_label: Mapping from race folder name to the label stored in
            the ``race`` column.
        save_path: Directory that receives ``data.csv``.
        image_per_id: Images kept per identity.
        num_id: Identities kept per race.
    """
    # Collect the per-race frames and concatenate once; growing a DataFrame
    # with concat inside the loop re-copies all earlier rows on every pass.
    per_race_frames = []
    for race, race_label in race_to_label.items():
        id_select = get_id_select(data_folder_path, race, race_label, image_per_id, num_id)
        per_race_frames.append(mark_train_test(id_select))

    if per_race_frames:
        race_id_select = pd.concat(per_race_frames, ignore_index=True)
    else:
        race_id_select = pd.DataFrame()

    # Every reader in this notebook opens './data/data.csv', so the file is
    # written directly into save_path. os.path.join(save_path, '') guarantees a
    # trailing separator, which keeps the result correct even if save_csv
    # builds the final path by plain string concatenation.
    save_csv(race_id_select, os.path.join(save_path, ''), 'data.csv')

In [13]:
#TODO: move data to cur folder
# NOTE(review): absolute, machine-specific source path — adjust before running elsewhere.
data_folder_path = '/Users/nina/Desktop/ore_code/race_per_7000'
save_path = './data/'
image_per_id, num_id = 10, 3000

# Race folder name -> numeric label, and the inverse lookup derived from it.
race_to_label = dict(zip(['Caucasian', 'Indian', 'Asian', 'African'], range(4)))
label_to_race = {label: race for race, label in race_to_label.items()}

In [None]:
# Regenerate the selection only when the CSV is missing: save_id_csv samples
# identities at random, so re-running it would replace an existing split with
# a different one.
if not os.path.exists('./data/data.csv'):
    save_id_csv(data_folder_path, race_to_label, save_path, image_per_id, num_id)
race_id_select = pd.read_csv('./data/data.csv')

#Main Function

In [3]:
# Sanity check: no (image, identity) pair may appear in both the train and test split.
data = pd.read_csv('./data/data.csv')

# Build a per-row key by concatenating the file name and the identity.
for text_column in ('image_name', 'id'):
    data[text_column] = data[text_column].astype(str)
data['image_id'] = data['image_name'] + data['id']

# Keys present in each split.
train_image = data.loc[data['split'] == 'train', 'image_id']
test_image = data.loc[data['split'] == 'test', 'image_id']

# Keys shared by both splits; the printed count should be 0.
overlap = set(train_image) & set(test_image)
print(len(overlap))

0


In [16]:
#don't run unless making data folder
# save_train_image(race_id_select, data_folder_path, save_path+'/train', label_to_race)
# save_test_image(race_id_select, data_folder_path, save_path+'/test', label_to_race)

Processing Images: 100%|██████████| 48000/48000 [45:27<00:00, 17.60it/s] 

Finished copying and converting images to folder





NameError: name 'save_test_image' is not defined

In [None]:
'''
first get filtered_data csv with id > 10  --> csv
then get balanced data csv (include train and test) --> balanced_csv, creates the same csv from filtered, but adds a balanced column that shows if the id was selected for balanced
then get 4 unbalanced data csv based on balanced_csv
    - exclude all in balanced 
    - one function to get data based on distribution 
    - save each as csv (4 csv)
    - mark train and test in each biased csv 

just CSV, check overlap, then move data to folder
'''

In [12]:

# Split the training rows into one DataFrame per race label.
# NOTE(review): this rebinds the notebook-level `race_to_label` to a two-race
# mapping in which 'African' is 1, whereas the configuration cell maps
# 'African' to 3 — confirm which encoding ./data/data.csv actually uses.
race_to_label = {'Caucasian': 0, 'African': 1}
train_data = data[data['split'] == 'train']

# Race name -> training rows whose 'race' column equals that race's label.
race_dataframes = {
    race: train_data[train_data['race'] == label]
    for race, label in race_to_label.items()
}


In [14]:
# Training rows of the 'Caucasian' group and the number of distinct identities among them.
c_df = race_dataframes['Caucasian']
caucasian_ids = c_df['id']
unique_id_count = caucasian_ids.nunique()
print(f"Number of unique IDs: {unique_id_count}")


Number of unique IDs: 3000


In [15]:
# Training rows of the 'African' group and the number of distinct identities among them.
aa_df = race_dataframes['African']
# Count identities in aa_df itself, not in the Caucasian frame c_df.
unique_id_count = aa_df['id'].nunique()
print(f"Number of unique IDs: {unique_id_count}")


Number of unique IDs: 3000


In [16]:
# Number of training rows (images) in the 'African' frame.
aa_df.shape[0]

24000

In [60]:

def sample_data(race_data, id_counts, num_ids, num_images):
    """Sample ``num_ids`` identities and at most ``num_images`` of their rows.

    Every sampled identity that has rows in ``race_data`` is guaranteed at
    least one row in the result.

    Args:
        race_data: DataFrame with an ``id`` column.
        id_counts: Series indexed by identity holding a count per identity,
            e.g. ``race_data['id'].value_counts()``.
        num_ids: Number of identities to draw (only those with a count > 0
            are eligible).
        num_images: Upper bound on the number of rows returned.

    Returns:
        Rows of ``race_data``: all rows of the sampled identities when there
        are at most ``num_images`` of them, otherwise exactly ``num_images``
        randomly chosen rows that still cover every sampled identity.

    Raises:
        ValueError: If ``num_images`` is too small to give every sampled
            identity one row. pandas also raises ``ValueError`` when fewer
            than ``num_ids`` identities are available.
    """
    sample_ids = id_counts[id_counts > 0].sample(num_ids).index
    pool = race_data[race_data['id'].isin(sample_ids)]

    # Nothing to trim: keep every row of the sampled identities.
    if len(pool) <= num_images:
        return pool

    # Shuffle once. The first occurrence of each id in the shuffled order is a
    # uniformly random row of that id, which gives the per-identity guarantee
    # without the retry loop (a retry loop never terminates when num_images is
    # smaller than the number of identities).
    shuffled = pool.sample(frac=1)
    keep = (~shuffled['id'].duplicated()).to_numpy()
    guaranteed = int(keep.sum())
    if guaranteed > num_images:
        raise ValueError(
            f"num_images={num_images} is too small to keep one image for "
            f"each of the {guaranteed} sampled ids"
        )

    # Fill the remaining budget with the next rows in shuffled order. The mask
    # is positional, so duplicate index labels in race_data are harmless.
    spare_positions = (~keep).nonzero()[0][:num_images - guaranteed]
    keep[spare_positions] = True
    return shuffled[keep]

def generate_unbalanced_datasets(filtered_csv, balanced_data, output_dir,
                                 race_to_label=None,
                                 majority_ids=2000, majority_images=7000,
                                 minority_ids=1000, minority_images=3000):
    """Write one race-skewed training CSV per race.

    For every race in ``race_to_label`` the ``train`` rows of ``filtered_csv``
    are split into that race (majority) and all rows with any other label
    (minority); each part is sub-sampled with ``sample_data`` and the
    combination, sorted by ``id``, is saved as
    ``unbalanced_train_{race}.csv`` inside ``output_dir``.

    Args:
        filtered_csv: Path of a CSV with ``id``, ``race`` and ``split`` columns.
        balanced_data: Unused; kept so existing calls keep working.
        output_dir: Directory receiving the CSV files; created if missing.
        race_to_label: Mapping from race name to the label used in the
            ``race`` column. Defaults to ``{'Caucasian': 0, 'African': 1}``.
        majority_ids: Identities sampled from the majority race.
        majority_images: Maximum rows kept for the majority race.
        minority_ids: Identities sampled from the remaining rows.
        minority_images: Maximum rows kept for the remaining rows.
    """
    if race_to_label is None:
        race_to_label = {'Caucasian': 0, 'African': 1}

    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    data = pd.read_csv(filtered_csv)
    train_data = data[data['split'] == 'train']

    for race, label in race_to_label.items():
        is_majority = train_data['race'] == label
        majority_race = train_data[is_majority]
        # Everything that is not the majority label is pooled as "minority".
        minority_race = train_data[~is_majority]

        majority_sample = sample_data(
            majority_race, majority_race['id'].value_counts(),
            majority_ids, majority_images)
        minority_sample = sample_data(
            minority_race, minority_race['id'].value_counts(),
            minority_ids, minority_images)

        unbalanced_data = pd.concat([majority_sample, minority_sample]).sort_values('id')
        unbalanced_data.to_csv(
            os.path.join(output_dir, f'unbalanced_train_{race}.csv'), index=False)




In [61]:
# Build the per-race unbalanced training CSVs from the saved selection.
generate_unbalanced_datasets(
    filtered_csv='./data/data.csv',
    balanced_data=None,
    output_dir="./output",
)

In [64]:
import pandas as pd
import plotly.graph_objects as go

def plot_biased_distribution(csv_path):
    """Show a grouped bar chart of image and identity counts per race.

    Reads ``csv_path``, prints how many fully duplicated rows it contains,
    translates the numeric ``race`` codes (0 -> 'Caucasian', 1 -> 'African')
    and displays, for each race, the number of rows and the number of
    distinct ``id`` values.
    """
    data = pd.read_csv(csv_path)

    # Report exact duplicate rows before any column is rewritten.
    duplicates = data.duplicated()
    print(f"Number of duplicate rows: {duplicates.sum()}")

    # Replace race codes by readable names (adjust the mapping to the dataset).
    race_mapping = {0: 'Caucasian', 1: 'African'}
    data = data.assign(race=data['race'].map(race_mapping))

    # Rows per race.
    race_counts = (
        data['race']
        .value_counts()
        .rename_axis('Race')
        .reset_index(name='Number of Images')
    )

    # Distinct identities per race.
    id_counts = (
        data.groupby('race')['id']
        .nunique()
        .rename_axis('Race')
        .reset_index(name='Number of IDs')
    )

    # One bar series per metric, drawn side by side for each race.
    image_bars = go.Bar(name='Number of Images',
                        x=race_counts['Race'], y=race_counts['Number of Images'])
    id_bars = go.Bar(name='Number of IDs',
                     x=id_counts['Race'], y=id_counts['Number of IDs'])
    fig = go.Figure(data=[image_bars, id_bars])
    fig.update_layout(barmode='group', title='Distribution of Images and Unique IDs by Race')

    fig.show()




In [65]:
# Plot the image / identity distribution of the African-majority training CSV.
plot_biased_distribution(csv_path='output/unbalanced_train_African.csv')

Number of duplicate rows: 0
