In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [2]:
# Define directories for processed data and labels
processed_data_dir = r'C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data'
label_mapping_file = r'C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\labels.csv'  # Mapping of class labels to object names

In [None]:
# Load the label mapping (class names)
label_mapping_df = pd.read_csv(label_mapping_file)
class_names = label_mapping_df['Class'].values  # Class names (e.g., cat, dog, car, etc.)

In [8]:
# Check if 'Class' column exists and print the first few values
print("Class names from labels.csv file:")
print(label_mapping_df['Class'].head())  # Print the first few class names

Class names from labels.csv file:
0       cat
1       dog
2      fish
3    rabbit
4     horse
Name: Class, dtype: object


In [9]:
# Ensure class_names is a 1D array (e.g., ['cat', 'dog', 'fish', ...])
class_names = label_mapping_df['Class'].values  # Extract class names as a numpy array
print(f"Class names (total {len(class_names)} classes):", class_names)

Class names (total 19 classes): ['cat' 'dog' 'fish' 'rabbit' 'horse' 'elephant' 'car' 'truck' 'airplane'
 'bicycle' 'bus' 'house' 'tree' 'flower' 'computer' 'star' 'sun' 'moon'
 'smiley face']


In [10]:
def encode_labels(labels, class_names):
    """Convert labels to one-hot encoding."""
    # Ensure the labels are in a proper 1D format (list or numpy array)
    labels = np.array(labels).flatten()  # Ensure it's a 1D array

    # Initialize LabelEncoder
    label_encoder = LabelEncoder()
    label_encoder.fit(class_names)  # Fit the label encoder to the class names (no 'full_raw_' prefix)
    
    # Remove the 'full_raw_' prefix if it's part of the labels (fixing the KeyError)
    labels = [label.replace('full_raw_', '') for label in labels]
    
    # Convert labels to integers
    integer_labels = label_encoder.transform(labels)  # Convert labels to integers
    
    # One-Hot Encoding: Convert integer labels to one-hot encoding
    one_hot_labels = to_categorical(integer_labels, num_classes=len(class_names))  # One-hot encode
    return one_hot_labels, label_encoder.classes_

In [3]:
def process_labels_and_save(batch_index, label):
    """Load labels, apply one-hot encoding, and save them as .npy."""
    # Load the labels from the CSV file corresponding to the batch
    labels_file = os.path.join(processed_data_dir, f'full_raw_{label}_labels_{batch_index}.csv')
    labels_df = pd.read_csv(labels_file)
    
    # Extract the labels (ensure it's a 1D array of strings)
    labels = labels_df['label'].values.flatten()  # Flatten to ensure 1D array
    
    # Apply One-Hot Encoding
    num_classes = len(class_names)  # Total number of classes (19 in this case)
    one_hot_labels, _ = encode_labels(labels, class_names)  # Pass class_names to encode_labels
    
    # Save the one-hot encoded labels to a .npy file
    one_hot_labels_file = os.path.join(processed_data_dir, f'full_raw_{label}_one_hot_labels_{batch_index}.npy')
    np.save(one_hot_labels_file, one_hot_labels)  # Save as .npy for efficient loading later

    print(f"One-Hot Encoded Labels saved to: {one_hot_labels_file}")

In [12]:
# Process all label batches (from batch 1 to the total number of batches)
def process_all_batches():
    # List of labels (categories you are processing)
    labels = ['airplane', 'dog', 'cat', 'fish', 'truck', 'car', 'rabbit', 'elephant']  # Add all relevant categories

    # Loop through each label and batch
    for label in labels:
        for batch_index in range(1, 11):  # Assuming 10 batches (adjust as needed)
            print(f"Processing {label} Batch {batch_index}...")
            process_labels_and_save(batch_index, label)

In [13]:
# Run the process
process_all_batches()


Processing airplane Batch 1...
One-Hot Encoded Labels saved to: C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data\full_raw_airplane_one_hot_labels_1.npy
Processing airplane Batch 2...
One-Hot Encoded Labels saved to: C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data\full_raw_airplane_one_hot_labels_2.npy
Processing airplane Batch 3...
One-Hot Encoded Labels saved to: C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data\full_raw_airplane_one_hot_labels_3.npy
Processing airplane Batch 4...
One-Hot Encoded Labels saved to: C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data\full_raw_airplane_one_hot_labels_4.npy
Processing airplane Batch 5...
One-Hot Encoded Labels saved to: C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data\full_raw_airplane_one_hot_labels_5.npy
Processing airplane Batch 6...
One-Hot Encoded Labels saved to: C:\Users\ACER\gitClones\QuickDrawGame\model_training\dat

In [16]:
# Load one of the saved one-hot encoded label files

one_hot_labels = np.load(r'C:\Users\ACER\gitClones\QuickDrawGame\model_training/data/processed_data/full_raw_airplane_one_hot_labels_1.npy')

# Check the shape of the one-hot encoded labels (should be [batch_size, num_classes])
print(one_hot_labels.shape)

# Print the first 5 labels to check if they are one-hot encoded correctly
print(one_hot_labels[:5])


(100, 19)
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [None]:
import os
import numpy as np
import pandas as pd

processed_data_dir = r'C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data'
categories = ['airplane', 'dog', 'cat', 'fish', 'truck', 'car', 'rabbit', 'elephant', 'bicycle', 'bus', 'computer', 'flower', 'horse', 'house', 'moon', 'smiley face', 'sun', 'star', 'tree']

# Check if OHE files are missing for any categories
def check_missing_ohe_files():
    missing_ohe = []
    for category in categories:
        for batch_index in range(1, 11):
            ohe_file = os.path.join(processed_data_dir, f'full_raw_{category}_one_hot_labels_{batch_index}.npy')
            if not os.path.exists(ohe_file):
                missing_ohe.append(f"{category}_one_hot_labels_{batch_index}")
    return missing_ohe

# Function to preprocess missing OHE files
def preprocess_missing_ohe_files():
    missing_ohe = check_missing_ohe_files()

    if not missing_ohe:
        print("No missing OHE files.")
        return

    print("Missing OHE Files:", missing_ohe)
    
    # Re-run the processing for missing categories and save OHE labels
    for missing_file in missing_ohe:
        category = missing_file.split('_')[2]  # Extract the category name from the filename
        batch_index = int(missing_file.split('_')[-1].split('.')[0])  # Extract batch index

        print(f"Re-processing OHE labels for {category}, Batch {batch_index}...")
        process_labels_and_save(batch_index, category)  # Assuming you already have this function for processing OHE

    print("Missing OHE files reprocessed.")

# Call the function to process missing OHE files
preprocess_missing_ohe_files()


Missing OHE Files: ['bicycle_one_hot_labels_1', 'bicycle_one_hot_labels_2', 'bicycle_one_hot_labels_3', 'bicycle_one_hot_labels_4', 'bicycle_one_hot_labels_5', 'bicycle_one_hot_labels_6', 'bicycle_one_hot_labels_7', 'bicycle_one_hot_labels_8', 'bicycle_one_hot_labels_9', 'bicycle_one_hot_labels_10', 'bus_one_hot_labels_1', 'bus_one_hot_labels_2', 'bus_one_hot_labels_3', 'bus_one_hot_labels_4', 'bus_one_hot_labels_5', 'bus_one_hot_labels_6', 'bus_one_hot_labels_7', 'bus_one_hot_labels_8', 'bus_one_hot_labels_9', 'bus_one_hot_labels_10', 'computer_one_hot_labels_1', 'computer_one_hot_labels_2', 'computer_one_hot_labels_3', 'computer_one_hot_labels_4', 'computer_one_hot_labels_5', 'computer_one_hot_labels_6', 'computer_one_hot_labels_7', 'computer_one_hot_labels_8', 'computer_one_hot_labels_9', 'computer_one_hot_labels_10', 'flower_one_hot_labels_1', 'flower_one_hot_labels_2', 'flower_one_hot_labels_3', 'flower_one_hot_labels_4', 'flower_one_hot_labels_5', 'flower_one_hot_labels_6', 'flow

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ACER\\gitClones\\QuickDrawGame\\model_training\\data\\processed_data\\full_raw_hot_labels_1.csv'

In [7]:
import numpy as np
import os

# Define directories
processed_data_dir = r'C:\Users\ACER\gitClones\QuickDrawGame\model_training\data\processed_data'

# Function to preprocess missing OHE files
def preprocess_missing_ohe_files():
    missing_ohe = check_missing_ohe_files()

    if not missing_ohe:
        print("No missing OHE files.")
        return

    print("Missing OHE Files:", missing_ohe)
    
    # Re-run the processing for missing categories and save OHE labels
    for missing_file in missing_ohe:
        category = missing_file.split('_')[2]  # Extract the category name from the filename
        batch_index = int(missing_file.split('_')[-1].split('.')[0])  # Extract batch index

        print(f"Re-processing OHE labels for {category}, Batch {batch_index}...")
        process_labels_and_save(batch_index, category)  # Assuming you already have this function for processing OHE

    print("Missing OHE files reprocessed.")

# Function to check for missing OHE files
def check_missing_ohe_files():
    missing_ohe = []
    for category in categories:
        for batch_index in range(1, 11):  # Assuming 10 batches per label
            ohe_file = os.path.join(processed_data_dir, f'full_raw_{category}_one_hot_labels_{batch_index}.npy')
            if not os.path.exists(ohe_file):
                missing_ohe.append(f"{category}_one_hot_labels_{batch_index}")
    return missing_ohe

# Function to process and save the one-hot encoded labels
def process_labels_and_save(batch_index, category):
    # Load the labels from the corresponding batch
    label_file = os.path.join(processed_data_dir, f'full_raw_{category}_labels_{batch_index}.csv')  # Assuming you have .csv files for labels
    labels_df = pd.read_csv(label_file)
    
    # Extract the labels
    labels = labels_df['label'].values.flatten()  # Flatten to ensure it's a 1D array
    
    # Apply One-Hot Encoding
    one_hot_labels = encode_labels(labels, num_classes=19)
    
    # Save the one-hot encoded labels to a .npy file
    one_hot_labels_file = os.path.join(processed_data_dir, f'full_raw_{category}_one_hot_labels_{batch_index}.npy')
    np.save(one_hot_labels_file, one_hot_labels)  # Save as .npy for efficient loading later

    print(f"One-Hot Encoded Labels saved to: {one_hot_labels_file}")


preprocess_missing_ohe_files()


Missing OHE Files: ['bicycle_one_hot_labels_1', 'bicycle_one_hot_labels_2', 'bicycle_one_hot_labels_3', 'bicycle_one_hot_labels_4', 'bicycle_one_hot_labels_5', 'bicycle_one_hot_labels_6', 'bicycle_one_hot_labels_7', 'bicycle_one_hot_labels_8', 'bicycle_one_hot_labels_9', 'bicycle_one_hot_labels_10', 'bus_one_hot_labels_1', 'bus_one_hot_labels_2', 'bus_one_hot_labels_3', 'bus_one_hot_labels_4', 'bus_one_hot_labels_5', 'bus_one_hot_labels_6', 'bus_one_hot_labels_7', 'bus_one_hot_labels_8', 'bus_one_hot_labels_9', 'bus_one_hot_labels_10', 'computer_one_hot_labels_1', 'computer_one_hot_labels_2', 'computer_one_hot_labels_3', 'computer_one_hot_labels_4', 'computer_one_hot_labels_5', 'computer_one_hot_labels_6', 'computer_one_hot_labels_7', 'computer_one_hot_labels_8', 'computer_one_hot_labels_9', 'computer_one_hot_labels_10', 'flower_one_hot_labels_1', 'flower_one_hot_labels_2', 'flower_one_hot_labels_3', 'flower_one_hot_labels_4', 'flower_one_hot_labels_5', 'flower_one_hot_labels_6', 'flow

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ACER\\gitClones\\QuickDrawGame\\model_training\\data\\processed_data\\full_raw_hot_labels_1.csv'