In [None]:
import os
import tarfile
import scipy.io
import pandas as pd

## Convert meta.mat to synsets.csv

In [None]:
# Convert data/meta.mat into a flat CSV file (data/synsets.csv)
mat_data = scipy.io.loadmat('data/meta.mat')

# Show the top-level variables stored in the .mat file
print(mat_data.keys())

# The 'synsets' variable is indexed below as a 2-D array holding one struct per row
synsets = mat_data['synsets']

# Struct fields to export, in the column order of the resulting CSV
synset_fields = (
    'ILSVRC2012_ID',
    'WNID',
    'words',
    'gloss',
    'num_children',
    'children',
    'wordnet_height',
    'num_train_images',
)

# One dictionary per synset; the leading [0] unwraps the outer array dimension of each field
data = [
    {field: record[field][0] for field in synset_fields}
    for record in synsets[:, 0]
]

# Build the table and write it out without the pandas index column
df = pd.DataFrame(data)
df.to_csv('data/synsets.csv', index=False)
print('meta.mat successfully converted to synsets.csv')

# Read the CSV back and preview the first rows as a sanity check
val_data = pd.read_csv('data/synsets.csv')
val_data.head()

## Dataset paths

In [None]:
# Training, validation and testing data locations, relative to the current working directory
TRAIN_DIR = os.path.join('data', 'train', 'zipped_1000')
VALID_DIR = os.path.join('data', 'val')
TEST_DIR = os.path.join('data', 'test')

# Print the paths to confirm (each line labelled with the split it belongs to)
print(f"Training data path: {TRAIN_DIR}")
print(f"Validation data path: {VALID_DIR}")
print(f"Testing data path: {TEST_DIR}")

## Train Dataset - Extraction

In [None]:
# Collect the WNID of every class archive found in the training directory
WNID = []
# Archive names in ascending order, so WNID ends up sorted as well
file_names = sorted(os.listdir(TRAIN_DIR))

# The 'data' extraction filter rejects archive members that would be written
# outside the destination folder. Interpreters without extraction-filter
# support do not accept the argument, so it is passed only when available.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

# Extract every class archive into a folder named after its WNID
for file_name in file_names:
    # Only .tar files are treated as class archives
    if not file_name.endswith('.tar'):
        continue
    # Strip only the trailing extension (str.replace would remove every '.tar' occurrence)
    wnid = os.path.splitext(file_name)[0]
    WNID.append(wnid)
    tar_file_path = os.path.join(TRAIN_DIR, file_name)
    extract_folder = os.path.join('data', 'train', 'unzipped_1000', wnid)
    # An existing class folder means this archive has already been extracted
    if os.path.exists(extract_folder):
        continue
    # Extract into a staging folder and rename it only after extraction succeeds.
    # Otherwise an interrupted run would leave a half-filled class folder that
    # every later run skips because the folder already exists.
    staging_folder = extract_folder + '.partial'
    os.makedirs(staging_folder, exist_ok=True)
    with tarfile.open(tar_file_path, 'r') as tar_ref:
        tar_ref.extractall(staging_folder, **extract_kwargs)
    os.rename(staging_folder, extract_folder)

print(f"Extracted all 1000 classes in training dataset.")

## Train Dataset - ClassIDs

In [None]:
# Path of the synsets table written by the meta.mat conversion cell
synsets_file_path = 'data/synsets.csv'
# Load the table and keep only the relevant columns (ILSVRC2012_ID, WNID, words).
# The explicit copy makes the column assignment below operate on an independent
# frame instead of a slice (avoids pandas' SettingWithCopyWarning).
synsets_df = pd.read_csv(synsets_file_path)[['ILSVRC2012_ID', 'WNID', 'words']].copy()
# Remove the []-brackets around ILSVRC2012_ID. The cast to str keeps this working
# even if pandas parsed the column as integers; the IDs stay strings afterwards.
synsets_df['ILSVRC2012_ID'] = (
    synsets_df['ILSVRC2012_ID'].astype(str).str.replace(r'\[|\]', '', regex=True)
)
# Keep only the rows whose WNID was collected by the extraction cell
train_df = synsets_df[synsets_df['WNID'].isin(WNID)]
# Sort by WNID in ascending order (copy so the new column is added to an independent frame)
train_df_sorted = train_df.sort_values(by='WNID', ascending=True).copy()
# Add a 'Class Label' column numbered from 0 to len - 1 in WNID order
train_df_sorted['Class Label'] = range(len(train_df_sorted))
# Save the filtered data to a new CSV file
train_csv_path = 'data/classes_train.csv'
train_df_sorted.to_csv(train_csv_path, index=False)

print(f"Saved all the Training dataset class information to {train_csv_path}.")

## Validation Dataset - ClassIDs

In [None]:
# Read the validation ground truth: one ILSVRC2012_ID per line, kept as a
# whitespace-stripped string so it can be compared with the string IDs in synsets_df
with open('data/ILSVRC2012_validation_ground_truth.txt', 'r') as file:
    val_ILSVRC2012_ID = [line.strip() for line in file]

# Keep the synsets whose ILSVRC2012_ID occurs in the validation ground truth
val_df = synsets_df[synsets_df['ILSVRC2012_ID'].isin(val_ILSVRC2012_ID)]
# Order by WNID (ascending) and number the rows from 0 to len - 1 as 'Class Label'
val_df_sorted = val_df.sort_values(by='WNID', ascending=True).assign(
    **{'Class Label': range(0, len(val_df))}
)
# Write the validation class table to its own CSV file
val_csv_path = 'data/classes_val.csv'
val_df_sorted.to_csv(val_csv_path, index=False)

print(f"Saved all the Validation dataset class information to {val_csv_path}.")

## Create val_annotation.txt

In [None]:
# Validation image names, taken in ascending filename order; only .JPEG files count
image_names = sorted(os.listdir(VALID_DIR))
val_images = [img_name for img_name in image_names if img_name.endswith('.JPEG')]

print(f"Extracted all image names in validation dataset.")

# Ground-truth ILSVRC2012_IDs, one per line; blank lines are skipped so they
# cannot shift or pad the label list
with open('data/ILSVRC2012_validation_ground_truth.txt', 'r') as file:
    val_class = [line.strip() for line in file if line.strip()]

print(f"Extracted all image labels in validation dataset.")

# Images and labels are paired purely by position, so a count mismatch must not
# pass silently (zip would simply drop the surplus entries).
if len(val_images) > len(val_class):
    raise ValueError(
        f"Found {len(val_images)} validation images but only {len(val_class)} "
        f"ground-truth labels; cannot label every image."
    )
if len(val_images) < len(val_class):
    print(
        f"Warning: {len(val_images)} validation images but {len(val_class)} labels; "
        f"only the first {len(val_images)} labels are used."
    )

# Build the ILSVRC2012_ID -> (WNID, Class Label) lookup once instead of scanning
# the DataFrame for every image. setdefault keeps the first row per ID, matching
# a first-match lookup on the sorted table.
class_lookup = {}
for ilsvrc_id, wnid, label in zip(
    val_df_sorted['ILSVRC2012_ID'], val_df_sorted['WNID'], val_df_sorted['Class Label']
):
    class_lookup.setdefault(ilsvrc_id, (wnid, label))

# Write one "<image name>, <WNID>, <class label>" line per validation image
with open('data/val_annotation.txt', 'w') as f:
    for item1, item2 in zip(val_images, val_class):
        # item2 is the image's ILSVRC2012_ID; an ID missing from the table raises KeyError
        wnid, label = class_lookup[item2]
        f.write(f'{item1}, {wnid}, {label}\n')

print(f"val_annotation.txt created.")