In [3]:
import numpy as np 
import pandas as pd 
import os
import random
from sklearn.model_selection import train_test_split

# a kaggle directory path leading to the dataset and the train folder in it containing all the datasets for training, validation and testing.
data = '/kaggle/input/tiny-imagenet/tiny-imagenet-200/train'

Counting and choosing a random 100 classes, 10 of which is printed to understand how the classes are named. 
The classes are all name using their respective file names.

In [19]:
print("We have ", len(os.listdir(data)), " number of classes.")
classes = random.sample(os.listdir(data), 100) # picking random 100 classes
print(classes[:10]) # printing the first 10 classes for understanding

We have  200  number of classes.
['n01698640', 'n01944390', 'n02321529', 'n01629819', 'n03042490', 'n03599486', 'n02950826', 'n03706229', 'n01443537', 'n03837869']


Create a list to store image data, loop through files, call each image and lopp through each images to be appended onto the list.

In [5]:
# list to store image data
data_list = []

#loop through each class
for class_name in classes:
    class_dir = os.path.join(data, class_name, 'images') # call the images using the path and class names 
    img = random.sample(os.listdir(class_dir), 500) # take a random 500 images 

    # loop through each image
    for i in img: 
        data_list.append({'label': class_name})

The reason for creating a list holding label and class_name is so that it will be easier to be converted into a dataframe. Working with dataframes are more convenient as numerous methods are laid out for data processing using dataframes.

In [6]:
df = pd.DataFrame(data_list) # converting list into a datframe 

# Splitting the dataset into training df holding 30,000 samples and a temporary set holding the remaining samples.
# the random state is there to identify the randomly picked data points when running the model multiple times.
#stratify helps with maintaining balance when splitting dataset
train_df, temp_df = train_test_split(df, train_size=30000, random_state=42, stratify=df['label'])

# Splitting the temp dataset into validation df holding 10,000 samples and a test set holding the remaining 10,000 samples.
val_df, test_df = train_test_split(temp_df, train_size=10000, random_state=42, stratify=temp_df['label'])

To verify the total images computed.

In [7]:
print("Total images - ", len(df))
df.info()

Total images -  50000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   50000 non-null  object
dtypes: object(1)
memory usage: 390.8+ KB


To show the shape of the training dataset matches with the prompt.

In [8]:
print("Training dataframe shape - ", train_df.shape)
train_df.info()

Training dataframe shape -  (30000, 1)
<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 6644 to 4763
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   30000 non-null  object
dtypes: object(1)
memory usage: 468.8+ KB


To show the shape of the validation dataset matches with the prompt.

In [9]:
print("Validation dataframe shape - ", val_df.shape)
val_df.info()

Validation dataframe shape -  (10000, 1)
<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 32746 to 3866
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   10000 non-null  object
dtypes: object(1)
memory usage: 156.2+ KB


To show the shape of the validation dataset matches with the prompt.

In [10]:
print("Testing dataframe shape - ", test_df.shape)
test_df.info()

Testing dataframe shape -  (10000, 1)
<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 38296 to 17640
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   10000 non-null  object
dtypes: object(1)
memory usage: 156.2+ KB


References:
https://www.kaggle.com/datasets/akash2sharma/tiny-imagenet