# Creating a new Dataset from Cifar 100

First I will store the `test` and `train` data in `pd.DataFrame` for ease of manipulation.

In [86]:
from datasets import load_dataset
import pandas as pd 

cifar = load_dataset('cifar100')
cifar_test = pd.DataFrame(cifar['test'])
cifar_train = pd.DataFrame(cifar['train'])

cifar_train
 

Unnamed: 0,img,fine_label,coarse_label
0,<PIL.PngImagePlugin.PngImageFile image mode=RG...,19,11
1,<PIL.PngImagePlugin.PngImageFile image mode=RG...,29,15
2,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0,4
3,<PIL.PngImagePlugin.PngImageFile image mode=RG...,11,14
4,<PIL.PngImagePlugin.PngImageFile image mode=RG...,1,1
...,...,...,...
49995,<PIL.PngImagePlugin.PngImageFile image mode=RG...,80,16
49996,<PIL.PngImagePlugin.PngImageFile image mode=RG...,7,7
49997,<PIL.PngImagePlugin.PngImageFile image mode=RG...,3,8
49998,<PIL.PngImagePlugin.PngImageFile image mode=RG...,7,7


I will then join the two `DataFrame` to form the new dataset.

In [87]:
new_dataset = pd.concat([cifar_train, cifar_test])
new_dataset

Unnamed: 0,img,fine_label,coarse_label
0,<PIL.PngImagePlugin.PngImageFile image mode=RG...,19,11
1,<PIL.PngImagePlugin.PngImageFile image mode=RG...,29,15
2,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0,4
3,<PIL.PngImagePlugin.PngImageFile image mode=RG...,11,14
4,<PIL.PngImagePlugin.PngImageFile image mode=RG...,1,1
...,...,...,...
9995,<PIL.PngImagePlugin.PngImageFile image mode=RG...,83,4
9996,<PIL.PngImagePlugin.PngImageFile image mode=RG...,14,7
9997,<PIL.PngImagePlugin.PngImageFile image mode=RG...,51,4
9998,<PIL.PngImagePlugin.PngImageFile image mode=RG...,42,8


Getting the `id2label` mapping of the `coarse_label` column. And creating the new labels with a mapping between the old and the new.

In [88]:
labels = cifar['train'].features['coarse_label'].names
id2label = dict({k: v for k, v in enumerate(labels)})

# Mapping between new and old labels
new_labels_mapping = {
  'aquatic_animals': ['aquatic_mammals', 'fish'],
  'household_furniture': ['household_furniture'],
  'small_objects': ['household_electrical_devices', 'food_containers'],
  'insects': ['insects', 'non-insect_invertebrates'],
  'land_animals': ['large_carnivores', 'large_omnivores_and_herbivores', 'medium_mammals', 'small_mammals', 'reptiles'],
  'people': ['people'],
  'outdoors': ['large_man-made_outdoor_things', 'large_natural_outdoor_scenes', 'trees', 'flowers'],
  'vehicles': ['vehicles_1', 'vehicles_2'],
  'food': ['fruit_and_vegetables']
}

new_labels = list(new_labels_mapping.keys())

Creating a `new_labels` row in which we'll have ids of the new labels based on the mapping between the old and the new labels. Also dropping the `coarse_label`.

In [89]:
def map_old_labels_to_new(old_label_id):
  old_label = id2label[old_label_id]
  new_label = None
  for key, value in new_labels_mapping.items():
    if old_label in value:
      new_label = key
  return new_label

new_dataset['label'] = new_dataset['coarse_label'].map(map_old_labels_to_new)

new_dataset = new_dataset.drop(columns=['coarse_label'])
new_dataset

Unnamed: 0,img,fine_label,label
0,<PIL.PngImagePlugin.PngImageFile image mode=RG...,19,land_animals
1,<PIL.PngImagePlugin.PngImageFile image mode=RG...,29,land_animals
2,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0,food
3,<PIL.PngImagePlugin.PngImageFile image mode=RG...,11,people
4,<PIL.PngImagePlugin.PngImageFile image mode=RG...,1,aquatic_animals
...,...,...,...
9995,<PIL.PngImagePlugin.PngImageFile image mode=RG...,83,food
9996,<PIL.PngImagePlugin.PngImageFile image mode=RG...,14,insects
9997,<PIL.PngImagePlugin.PngImageFile image mode=RG...,51,food
9998,<PIL.PngImagePlugin.PngImageFile image mode=RG...,42,land_animals


Now we check the number of rows per label so we can balance the numbers between the labels.

In [90]:
rows_per_label = new_dataset['label'].value_counts()
rows_per_label

label
land_animals           15000
outdoors               12000
aquatic_animals         6000
small_objects           6000
vehicles                6000
insects                 6000
food                    3000
people                  3000
household_furniture     3000
Name: count, dtype: int64

As we can conclude from the previous output, we'll need to randomly get rid of some of the `land_animals` and `outdoors` rows and we'll also need to add some images for `food`, `people`, `household_furniture` and `bird` (which doesn't have any entries). The purpose of doing so is to have a balanced dataset for all the labels. 

We will start by fetching the `people` dataset that I downloaded and cleaned (at some extent) from the this [kaggle](https://www.kaggle.com/datasets/ahmadahmadzada/images2000)

In [91]:
import os 
from PIL import Image

def load_and_resize_image(file_path):
  try:
    # open image
    img = Image.open(file_path)
    img = img.resize((32, 32))
    return img
  except Exception as e:
    print(f"Error processing image {file_path} : {e}")
    return

folder_path = './datasets/people/images/images'
  
files = os.listdir(folder_path)

# Filter only image files (you may need to adjust this condition based on your image file extensions)
image_files = [file for file in files if file.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif'))]

# Create a DataFrame to store image paths and resized images
data = {'img': [], 'label': []}

# Iterate over image files, load, resize, and save them
for image_file in image_files:
    image_path = os.path.join(folder_path, image_file)
    resized_image = load_and_resize_image(image_path)
    if resized_image:
        data['img'].append(resized_image)
        data['label'].append('people')

new_people_data = pd.DataFrame(data)
new_people_data

Unnamed: 0,img,label
0,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
1,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
2,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
3,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
4,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
...,...,...
1740,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
1741,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
1742,<PIL.Image.Image image mode=RGB size=32x32 at ...,people
1743,<PIL.Image.Image image mode=RGB size=32x32 at ...,people


In [92]:
new_dataset = pd.concat([new_dataset, new_people_data])
new_dataset['label'].value_counts()

label
land_animals           15000
outdoors               12000
aquatic_animals         6000
small_objects           6000
vehicles                6000
insects                 6000
people                  4745
food                    3000
household_furniture     3000
Name: count, dtype: int64

In [93]:
food_folders = [
  'Bread',
  'Dairy Product',
  'Dessert',
  'Egg',
  'Fried food',
  'Meat',
  'Noodles-Pasta',
  'Rice',
  'Soup'
]

food_dataset_path = './datasets/food/training'

data = {'img': [], 'label': []}
count = 0

for food_folder in food_folders:
  if count == 3000:
    break
  food_folder_path = os.path.join(food_dataset_path, food_folder)
  files = os.listdir(food_folder_path)
  images_files = [file for file in files if file.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif'))]
  for image_file in images_files:
    image_path = os.path.join(food_folder_path, image_file)
    resized_image = load_and_resize_image(image_path)
    if count == 3000:
      break
    if resized_image:
      count += 1
      data['img'].append(resized_image)
      data['label'].append('food')

food_df = pd.DataFrame(data)

new_dataset = pd.concat([new_dataset, food_df])

In [94]:
new_dataset['label'].value_counts()

label
land_animals           15000
outdoors               12000
food                    6000
aquatic_animals         6000
small_objects           6000
vehicles                6000
insects                 6000
people                  4745
household_furniture     3000
Name: count, dtype: int64

In [95]:
new_dataset = new_dataset.drop(new_dataset[new_dataset['label'] == 'outdoors'].sample(frac=.4).index)



In [96]:
def replace_label_by_id(label):
  return new_labels.index(label)

new_dataset['label'] = new_dataset['label'].map(replace_label_by_id)


In [97]:
new_dataset = new_dataset.drop(columns=['fine_label'])

new_dataset

Unnamed: 0,img,label
0,<PIL.PngImagePlugin.PngImageFile image mode=RG...,4
2,<PIL.PngImagePlugin.PngImageFile image mode=RG...,8
3,<PIL.PngImagePlugin.PngImageFile image mode=RG...,5
4,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0
6,<PIL.PngImagePlugin.PngImageFile image mode=RG...,7
...,...,...
2995,<PIL.Image.Image image mode=RGB size=32x32 at ...,8
2996,<PIL.Image.Image image mode=RGB size=32x32 at ...,8
2997,<PIL.Image.Image image mode=RGB size=32x32 at ...,8
2998,<PIL.Image.Image image mode=RGB size=32x32 at ...,8


In [119]:
from datasets import Dataset, Features, ClassLabel, Image
import io

def img_to_Image(img):
  feature = Image()
  return feature.encode_example(img)

features = Features({
  'label': ClassLabel(names=new_labels),
  'img': Image()
})

# convert images to Hf Image
new_dataset["img"] = new_dataset["img"].map(img_to_Image)


custom_cifar = Dataset.from_pandas(new_dataset)


CastError: Couldn't cast
img: struct<bytes: binary, path: null>
  child 0, bytes: binary
  child 1, path: null
label: int64
__index_level_0__: int64
-- schema metadata --
pandas: '{"index_columns": ["__index_level_0__"], "column_indexes": [{"na' + 541
to
{'label': ClassLabel(names=['aquatic_animals', 'household_furniture', 'small_objects', 'insects', 'land_animals', 'people', 'outdoors', 'vehicles', 'food'], id=None), 'img': Image(decode=True, id=None)}
because column names don't match

In [115]:
custom_cifar[0]


{'img': {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00 \x00\x00\x00 \x08\x02\x00\x00\x00\xfc\x18\xed\xa3\x00\x00\tjIDATx\x9c\x95\xd2\xc9\xb3\\U\x1d\x07\xf0s\xce=w\xee\xdb\xb7\xfb\xf6\x90\xd7\x9d7\xe5\xbd\x90\xe1%\x84D\x08jR\x91(\x08\xa8lt\xad\x1b\x8b\x85\xff\x8f\xc5V-\xab\xdc\xe1\x84\x82%\x82\xc1\x90\x07$\x81\x90\xe9%//\xe97\xf4<\xdd\xdb}\xe7\xe9\x0c.\x90\x85\x16\x0b\xf9\xae~U\xbf\xfa\xd6\xe7\xb7\xf8A\xce9\xf8\xba\xa1\xe9\xb0\xdd\xba~\xe3\xd6\xa5\x97^\xb5*\xd5\xff\xda\x00\x10Q\xea\x07\xcen\xeba\xb9\xa2\xb7\xdb\x8f1\x00\xe0k\x19\x8c\x12\x98\xcf\xfc\xf1\xee\x95\xb7\xfe\xe0\xfb\xc9O_\x7f\x1dp\xce\x18\x07\x08p\x00s\xc6\xfb\x83\xb63\xef\x0e:[\xbb\x8f\xa7\xae7C_\xd4\xe0\xff\x17\x00!\x82\x14R\x9f\xc7\x13\x9de\xf6`8\x1a\x8e\xa6\xe3)%\x04\x01\x08\x01\xe4\x9cc\x11\xe44\xae\x1c\xaa\x88\x1a\x18L\xfa\xf8\xab\xcf\x04\x80\xb3\x94\xcc\xa6\xb1\x1bpI/\x1en\x02\x88 g\x88\x11o\xd0\xd9\xbf\xff\xc9\xde\xc3m\x84$o\xd0\xfe\xe0\x9d\xdf\x97\x9bK\x17.^\x02\xb8h\xcf\xdd4\x18&\xc9\x98\x13\x7f\xec\xec\