<a href="https://colab.research.google.com/github/GlassesNoGlasses/TFProjects/blob/ethnicity_classifier/projects/classification/ethnicity/Ethnicity_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# Clone repo

!git clone https://github.com/GlassesNoGlasses/TFProjects.git

fatal: destination path 'TFProjects' already exists and is not an empty directory.


In [41]:
# Reclone repo

!rm -r TFProjects/
!git clone https://github.com/GlassesNoGlasses/TFProjects.git

Cloning into 'TFProjects'...
remote: Enumerating objects: 1770, done.[K
remote: Counting objects: 100% (1770/1770), done.[K
remote: Compressing objects: 100% (1492/1492), done.[K
remote: Total 1770 (delta 284), reused 1741 (delta 272), pack-reused 0[K
Receiving objects: 100% (1770/1770), 21.04 MiB | 28.50 MiB/s, done.
Resolving deltas: 100% (284/284), done.


In [42]:
# Add required imports

import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [43]:
# File locations/directories

import os

# List of all ethnicities
ethnicities_subdirs = ['Caucasian', 'Asian', 'African']

# paths to training/testing dirs
test_data_path = "/content/TFProjects/projects/classification/ethnicity/Test"
train_data_path = "/content/TFProjects/projects/classification/ethnicity/Train"

# Return list of all files in a dir given path
def GetFiles(path: str) -> list[str]:
  if path == None or len(path) == 0:
    return []

  return os.listdir(path)

In [None]:
for ethnicity in ethnicities_subdirs:
  print(ethnicity)
  print(GetFiles(train_data_path + "/" + ethnicity))

Caucasian
['George_P_Bush_0002.jpg', '63.JPG', 'b29.JPG', '64 - Copy.JPG', 'Tony_Blair_0020.jpg', 'Matt_Morris_0001.jpg', '77.JPG', 'Jeff_George_0001.jpg', '131.JPG', 'George_W_Bush_0038.jpg', 'Neil_Goldman_0001.jpg', 'Kent_Robinson_0001.jpg', 'Amy_Smart_0001.jpg', 'b46.JPG', 'William_Macy_0002.jpg', 'Tony_Blair_0114.jpg', '86.JPG', 'Jennifer_Lopez_0019.jpg', '62.JPG', 'Thomas_Franklin_0001.jpg', 'Baz_Luhrmann_0001.jpg', '73.JPG', 'Kenny_Brack_0001.jpg', '117.JPG', '167.JPG', '115.JPG', 'Angela_Mascia-Frye_0001.jpg', 'b45.JPG', 'Rob_Marshall_0001.jpg', 'Rob_Marshall_0002.jpg', 'Kenneth_Brill_0001.jpg', '155.JPG', '110.JPG', 'Tony_Blair_0102.jpg', 'Robert_Bullock_0002.jpg', 'person_0618.jpg', 'Raymond_Arthurs_0001.jpg', '119.JPG', '143.JPG', 'Paul_Bremer_0013.jpg', '109.JPG', '25.JPG', '933.JPG', 'Greg_Gilbert_0002.jpg', 'person_0011.jpg', '113.JPG', 'person_0344.jpg', 'person_0976.jpg', 'Britney_Spears_0011.jpg', '917.JPG', '149.JPG', 'James_May_0001.jpg', 'Brad_Johnson_0003.jpg', 'Sha

# Converting to NumPy:

We need the data to be in a processable form. Therefore, we shall convert each ethnicity into a NumPy array: [image_id, ethnicity_id].

After, we will convert NumPy arrays to tensors for TensorFlow.

In [44]:
# Convert ethnicities to numbers for easier processing

ethnicity_to_id = {'Unknown': 0, 'African': 1, 'Asian': 2, 'Caucasian': 3}
id_to_ethnicity = {0: 'Unknown', 1: 'African', 2: 'Asian', 3: 'Caucasian'}

# Label images with label. Return NumPy matrix (len(image_list, 2))
def LabelImages(image_list, label):
  label_list = np.full((len(image_list)), ethnicity_to_id[label])
  return np.stack((np.array(image_list), label_list), axis=1)


In [45]:
for ethnicity in ethnicities_subdirs:
  print(ethnicity)
  print(np.shape(LabelImages(GetFiles(train_data_path + "/" + ethnicity), ethnicity)))

Caucasian
(473, 2)
Asian
(662, 2)
African
(517, 2)


In [62]:
# Create giant dataset

def GetAllLabeledImages():
  # Set first ethnicity
  data = LabelImages(GetFiles(train_data_path + "/" + ethnicities_subdirs[0]), ethnicities_subdirs[0])

  for i in range(1, len(ethnicities_subdirs)):
    labeled_images = LabelImages(GetFiles(train_data_path + "/" + ethnicities_subdirs[i]),
                                ethnicities_subdirs[i])
    data = np.concatenate((data, labeled_images), axis=0)

  return data

In [65]:
data = GetAllLabeledImages()

print(data)
print(data.shape)

[['George_P_Bush_0002.jpg' '3']
 ['63.JPG' '3']
 ['b29.JPG' '3']
 ...
 ['Benedita_da_Silva_0001.jpg' '1']
 ['Charmaine_Crooks_0001.jpg' '1']
 ['Collis_Temple_III_0001.jpg' '1']]
(1652, 2)
