In [6]:
import os
import shutil
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader

In [7]:
# Root folder of the raw images and the category sub-folders expected inside it
image_base_path = "../data/images/"
categories = ["GoogleEmoji", "JoyPixelsEmoji", "OpenMojiEmoji", "TwitterEmoji"]

# Report how many directory entries each category folder holds
for category in categories:
    folder_path = os.path.join(image_base_path, category)
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        continue
    print(f"Category: {category} - Total files: {len(os.listdir(folder_path))}")

Category: GoogleEmoji - Total files: 3583
Category: JoyPixelsEmoji - Total files: 3820
Category: OpenMojiEmoji - Total files: 4284
Category: TwitterEmoji - Total files: 872


In [8]:
# Destination root that receives the renamed copies of the raw images
processed_image_path = "../data/processed_images/"

# Create the destination root up front; an already existing folder is accepted
os.makedirs(name=processed_image_path, exist_ok=True)

def copy_and_rename_images(folder_path, category):
    """Copy every file of one category into the processed folder under a normalized name.

    The normalized name is built from the original filename by:
      1. stripping a leading "emoji_u" prefix,
      2. replacing every "_" with "-" and lowercasing,
      3. removing every "-fe0f" and "-200d" segment.

    Source files are left untouched; copies are written to
    ``processed_image_path/<category>/``. Entries that are not regular files
    are skipped. If two source files normalize to the same name, the later one
    (in sorted order) overwrites the earlier copy and a warning is printed.

    Args:
        folder_path: Directory containing the original images.
        category: Name of the sub-folder to create below ``processed_image_path``.
    """
    category_processed_path = os.path.join(processed_image_path, category)
    os.makedirs(category_processed_path, exist_ok=True)  # Create subfolder for each category

    prefix = "emoji_u"
    # Normalized name -> source filename that produced it during this call,
    # used to make otherwise silent overwrites visible.
    copied_from = {}

    # sorted() keeps the processing order (and the winner of a name clash) deterministic
    for filename in sorted(os.listdir(folder_path)):
        old_path = os.path.join(folder_path, filename)

        # shutil.copy raises on directories, which would abort the whole run
        if not os.path.isfile(old_path):
            continue

        # Remove "emoji_u" prefix if it exists
        new_name = filename[len(prefix):] if filename.startswith(prefix) else filename

        # Replace all "_" with "-" and lowercase the whole name
        new_name = new_name.replace("_", "-").lower()

        # Drop the U+FE0F (variation selector-16) segment so names with and
        # without the selector end up identical
        new_name = new_name.replace("-fe0f", "")

        # Drop the U+200D (zero-width joiner) segment used inside emoji sequences
        new_name = new_name.replace("-200d", "")

        if new_name in copied_from:
            print(f"Warning: {filename} and {copied_from[new_name]} both map to {new_name}; overwriting")
        copied_from[new_name] = filename

        new_path = os.path.join(category_processed_path, new_name)

        # Copy the file instead of renaming
        shutil.copy(old_path, new_path)
        print(f"Copied: {filename} -> {new_name}")

# Run the copy/rename step for every category whose source folder is present
for category in categories:
    folder_path = os.path.join(image_base_path, category)
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        continue
    print(f"Processing {category}...")
    copy_and_rename_images(folder_path, category)

Processing GoogleEmoji...
Copied: emoji_u0023.png -> 0023.png
Copied: emoji_u0023_20e3.png -> 0023-20e3.png
Copied: emoji_u002a.png -> 002a.png
Copied: emoji_u002a_20e3.png -> 002a-20e3.png
Copied: emoji_u0030.png -> 0030.png
Copied: emoji_u0030_20e3.png -> 0030-20e3.png
Copied: emoji_u0031.png -> 0031.png
Copied: emoji_u0031_20e3.png -> 0031-20e3.png
Copied: emoji_u0032.png -> 0032.png
Copied: emoji_u0032_20e3.png -> 0032-20e3.png
Copied: emoji_u0033.png -> 0033.png
Copied: emoji_u0033_20e3.png -> 0033-20e3.png
Copied: emoji_u0034.png -> 0034.png
Copied: emoji_u0034_20e3.png -> 0034-20e3.png
Copied: emoji_u0035.png -> 0035.png
Copied: emoji_u0035_20e3.png -> 0035-20e3.png
Copied: emoji_u0036.png -> 0036.png
Copied: emoji_u0036_20e3.png -> 0036-20e3.png
Copied: emoji_u0037.png -> 0037.png
Copied: emoji_u0037_20e3.png -> 0037-20e3.png
Copied: emoji_u0038.png -> 0038.png
Copied: emoji_u0038_20e3.png -> 0038-20e3.png
Copied: emoji_u0039.png -> 0039.png
Copied: emoji_u0039_20e3.png -> 0039

In [12]:
# Output roots: one for the plain tensors, one for their augmented counterparts
tensor_output_path = "../data/tensor_images/"
augmented_output_path = "../data/augmented_images/"

# Create both output roots if they are missing
for output_root in (tensor_output_path, augmented_output_path):
    os.makedirs(output_root, exist_ok=True)

# Deterministic pipeline: resize -> tensor -> normalize
base_steps = [
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),  # (x - 0.5) / 0.5
]
base_transform = transforms.Compose(base_steps)

# Randomized pipeline: same resize/tensor/normalize steps with a random flip,
# rotation and brightness/contrast jitter applied before tensor conversion
augment_steps = [
    transforms.Resize((64, 64)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=20),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
]
augment_transform = transforms.Compose(augment_steps)

def process_images(category_folder, category):
    """Convert every image of one category into a saved tensor plus one augmented tensor.

    For each entry ``<stem><ext>`` in ``category_folder`` the image is opened,
    converted to RGB and written twice with ``torch.save``:
      * ``tensor_output_path/<category>/<stem>.pt``        (``base_transform``)
      * ``augmented_output_path/<category>/<stem>_aug.pt`` (``augment_transform``)

    Output names are derived from the filename stem, so any image extension
    yields a ``.pt`` file. Failures on individual entries are reported and do
    not stop the loop.

    Args:
        category_folder: Directory containing the images to convert.
        category: Sub-folder name created below both output roots.
    """
    tensor_category_path = os.path.join(tensor_output_path, category)
    augmented_category_path = os.path.join(augmented_output_path, category)

    os.makedirs(tensor_category_path, exist_ok=True)
    os.makedirs(augmented_category_path, exist_ok=True)

    # sorted() gives a stable processing order regardless of the filesystem
    for filename in sorted(os.listdir(category_folder)):
        img_path = os.path.join(category_folder, filename)

        # Build output names from the stem instead of substring-replacing
        # ".png"/".jpg", which missed other extensions and could hit the
        # middle of a name.
        stem, _ = os.path.splitext(filename)
        tensor_file = f"{stem}.pt"
        aug_tensor_file = f"{stem}_aug.pt"

        try:
            # The context manager closes the underlying file; convert() returns
            # a fully loaded copy that stays usable afterwards.
            # NOTE(review): convert('RGB') discards the alpha channel; confirm
            # the resulting background of transparent images is acceptable.
            with Image.open(img_path) as source_img:
                img = source_img.convert('RGB')

            # Save original processed tensor
            img_tensor = base_transform(img)
            torch.save(img_tensor, os.path.join(tensor_category_path, tensor_file))

            # Save augmented tensor
            img_aug_tensor = augment_transform(img)
            torch.save(img_aug_tensor, os.path.join(augmented_category_path, aug_tensor_file))

            print(f"Processed: {filename} -> {tensor_file} | Augmented -> {aug_tensor_file}")

        except Exception as e:
            # Deliberate best-effort: report the failing entry and keep going
            print(f"Error processing {filename}: {e}")

# Convert every processed category folder that exists into saved tensors
for category in categories:
    processed_category_path = os.path.join(processed_image_path, category)
    if not os.path.exists(processed_category_path):
        print(f"Folder not found: {processed_category_path}")
        continue
    print(f"Processing images in {category}...")
    process_images(processed_category_path, category)

print("All images copied, renamed, preprocessed, and augmented successfully!")

Processing images in GoogleEmoji...
Processed: 0023-20e3.png -> 0023-20e3.pt | Augmented -> 0023-20e3_aug.pt
Processed: 0023.png -> 0023.pt | Augmented -> 0023_aug.pt
Processed: 002a-20e3.png -> 002a-20e3.pt | Augmented -> 002a-20e3_aug.pt
Processed: 002a.png -> 002a.pt | Augmented -> 002a_aug.pt
Processed: 0030-20e3.png -> 0030-20e3.pt | Augmented -> 0030-20e3_aug.pt
Processed: 0030.png -> 0030.pt | Augmented -> 0030_aug.pt
Processed: 0031-20e3.png -> 0031-20e3.pt | Augmented -> 0031-20e3_aug.pt
Processed: 0031.png -> 0031.pt | Augmented -> 0031_aug.pt
Processed: 0032-20e3.png -> 0032-20e3.pt | Augmented -> 0032-20e3_aug.pt
Processed: 0032.png -> 0032.pt | Augmented -> 0032_aug.pt
Processed: 0033-20e3.png -> 0033-20e3.pt | Augmented -> 0033-20e3_aug.pt
Processed: 0033.png -> 0033.pt | Augmented -> 0033_aug.pt
Processed: 0034-20e3.png -> 0034-20e3.pt | Augmented -> 0034-20e3_aug.pt
Processed: 0034.png -> 0034.pt | Augmented -> 0034_aug.pt
Processed: 0035-20e3.png -> 0035-20e3.pt | Augm

In [13]:
class EmojiDataset(Dataset):
    """Dataset of pre-saved tensors laid out as ``<root>/<category>/<name>.pt``.

    Depending on ``is_augmented`` the samples are read from ``augmented_dir``
    or from ``tensor_dir``. Only files ending in ``.pt`` that sit directly
    inside a category sub-directory are indexed. Paths are collected in sorted
    order so that an index always refers to the same file across runs.

    Args:
        tensor_dir: Root directory of the non-augmented tensors.
        augmented_dir: Root directory of the augmented tensors.
        transform: Optional callable applied to each loaded tensor.
        is_augmented: When True, index ``augmented_dir`` instead of ``tensor_dir``.
    """

    def __init__(self, tensor_dir, augmented_dir, transform=None, is_augmented=False):
        self.tensor_dir = tensor_dir
        self.augmented_dir = augmented_dir
        self.transform = transform
        self.is_augmented = is_augmented

        # Both modes share the same directory layout, only the root differs
        root_dir = self.augmented_dir if self.is_augmented else self.tensor_dir
        self.image_paths = self._collect_tensor_paths(root_dir)

    @staticmethod
    def _collect_tensor_paths(root_dir):
        """Return the sorted list of ``.pt`` file paths one level below ``root_dir``."""
        tensor_paths = []
        for category in sorted(os.listdir(root_dir)):
            category_folder = os.path.join(root_dir, category)
            if not os.path.isdir(category_folder):
                continue
            for img_file in sorted(os.listdir(category_folder)):
                if img_file.endswith(".pt"):
                    tensor_paths.append(os.path.join(category_folder, img_file))
        return tensor_paths

    def __len__(self):
        """Return the number of indexed tensor files."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the tensor at position ``idx`` and apply ``transform`` if one was given."""
        img_path = self.image_paths[idx]

        # NOTE: torch.load unpickles the file; only point this dataset at
        # tensor files produced by a trusted source.
        img_tensor = torch.load(img_path)

        if self.transform is not None:
            img_tensor = self.transform(img_tensor)
        return img_tensor
    
    

# Set whether to use augmented data or not
use_augmented_data = True

# The saved .pt tensors are already normalized: base_transform and
# augment_transform both end with Normalize(mean=[0.5], std=[0.5]) before
# torch.save is called. Normalizing a second time here would map the stored
# [-1, 1] range to [-3, 1], so no load-time transform is applied.
transform = None

# Create dataset for either original or augmented images
dataset = EmojiDataset(
    tensor_output_path,
    augmented_output_path,
    transform=transform,
    is_augmented=use_augmented_data,
)


In [15]:
# Number of samples drawn per batch
batch_size = 64

dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Smoke test: pull a single batch and report its shape
for batch_number, data in enumerate(dataloader, start=1):
    print(f"Batch {batch_number} processed!")
    print(f"Shape of batch {batch_number}: {data.shape}")
    break  # Stop after the first batch

Batch 1 processed!
Shape of batch 1: torch.Size([64, 3, 64, 64])
