<a href="https://colab.research.google.com/github/ILoveCoder999/FederatedLearning/blob/master/dataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Mount Google Drive inside the Colab runtime at /content/drive; a later cell
# writes preprocessing.py under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from torch.utils.data import Subset, random_split

class FederatedDataBuilder:
    """
    Build CIFAR-100 train/validation/test datasets and per-client index partitions.

    The original CIFAR-100 training set is split once (fixed seed) into a training
    subset and a validation subset. Both partition methods return indices that are
    positions *within* ``self.train_dataset`` (the training subset), aligned with
    ``self.train_targets`` -- they are not indices into the full CIFAR-100 set.
    """

    def __init__(self, root='./data', val_split_ratio=0.1, K=100):
        """
        Initialize the data builder.

        :param root: Directory to download/store the dataset.
        :param val_split_ratio: Fraction of the original CIFAR-100 training set that
            is held out as the validation set.
        :param K: Total number of clients the training subset is partitioned across.
        """
        self.root = root
        self.K = K
        self.val_split_ratio = val_split_ratio

        # 1. Data Preprocessing
        # Note: DINO ViT might require specific transforms (e.g., resize to 224x224) depending on memory.
        # Using standard CIFAR transforms here as a baseline.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))
        ])

        # 2. Load the original CIFAR-100 training and test sets (downloaded on first use).
        full_train_dataset = torchvision.datasets.CIFAR100(
            root=self.root, train=True, download=True, transform=self.transform
        )
        self.test_dataset = torchvision.datasets.CIFAR100(
            root=self.root, train=False, download=True, transform=self.transform
        )

        # 3. Create Validation Split
        # CIFAR-100 does not have a validation split, so we must create one.
        # With the default ratio of 0.1 this yields 5000 validation / 45000 training samples.
        val_size = int(len(full_train_dataset) * val_split_ratio)
        train_size = len(full_train_dataset) - val_size

        # This 'train_dataset' will be used for subsequent FL client partitioning.
        # A fixed seed is used so the train/val split is reproducible.
        self.train_dataset, self.val_dataset = random_split(
            full_train_dataset, [train_size, val_size],
            generator=torch.Generator().manual_seed(42)
        )

        # Labels of the training subset, in subset order: position j of
        # 'train_targets' is the label of self.train_dataset[j].
        self.train_indices = self.train_dataset.indices
        self.train_targets = np.array(full_train_dataset.targets)[self.train_indices]

    @staticmethod
    def _make_rng(seed):
        """Return NumPy's global RNG when ``seed`` is None, else a dedicated seeded RandomState."""
        return np.random if seed is None else np.random.RandomState(seed)

    def get_iid_partition(self, seed=None):
        """
        I.I.D. Sharding: each client is given the same number of training samples,
        drawn uniformly at random (without replacement) from the training subset.

        Every client receives ``len(train_dataset) // K`` samples; when the division
        is not exact, the remaining ``len(train_dataset) % K`` samples stay unassigned.

        :param seed: Optional seed for a reproducible partition. When None, NumPy's
            global random state is used.
        :return: dict mapping client id (0..K-1) to a set of indices into
            ``self.train_dataset``.
        :raises ValueError: if K is not positive or exceeds the number of samples
            (which would otherwise silently produce empty clients).
        """
        num_samples = len(self.train_dataset)
        if self.K <= 0:
            raise ValueError(f"K must be a positive number of clients, got {self.K}")
        if self.K > num_samples:
            raise ValueError(
                f"Cannot split {num_samples} samples across K={self.K} clients"
            )

        print(f"Generating I.I.D. partition for {self.K} clients...")
        num_items = num_samples // self.K

        # Randomly permute all indices, then hand out consecutive equal-sized slices.
        all_idxs = self._make_rng(seed).permutation(num_samples).tolist()
        return {
            i: set(all_idxs[i * num_items:(i + 1) * num_items])
            for i in range(self.K)
        }

    def get_non_iid_partition(self, Nc, seed=None):
        """
        Non-I.I.D. Sharding: each client is given the same number of training
        samples, drawn from approximately Nc classes.

        The training indices are sorted by label and cut into ``K * Nc`` equal-sized
        shards; every client receives Nc distinct shards chosen at random without
        replacement, so client subsets are disjoint. Because shards are cut at fixed
        sizes, a shard can straddle a class boundary and two shards of one client can
        come from the same class -- the number of distinct classes per client is
        therefore only approximately Nc. Samples beyond ``K * Nc * shard_size`` are
        left unassigned.

        :param Nc: The number of shards (target number of classes) per client;
            controls heterogeneity.
        :param seed: Optional seed for a reproducible partition. When None, NumPy's
            global random state is used.
        :return: dict mapping client id (0..K-1) to an int64 array of indices into
            ``self.train_dataset``.
        :raises ValueError: if K or Nc is not positive, or if K * Nc exceeds the
            number of samples (shards would be empty).
        """
        num_samples = len(self.train_dataset)
        if self.K <= 0:
            raise ValueError(f"K must be a positive number of clients, got {self.K}")
        if Nc <= 0:
            raise ValueError(f"Nc must be a positive number of classes, got {Nc}")
        total_shards = self.K * Nc
        if total_shards > num_samples:
            raise ValueError(
                f"K * Nc = {total_shards} shards exceed the {num_samples} available samples"
            )

        print(f"Generating Non-I.I.D. partition for {self.K} clients with Nc={Nc}...")

        # 1. Sort indices by label so samples of the same class are contiguous.
        # A stable sort keeps the result deterministic for a given label array.
        sorted_idxs = np.argsort(self.train_targets, kind='stable')

        # 2. Cut the label-sorted indices into K * Nc equal shards (one row per shard).
        shard_size = num_samples // total_shards
        shards = sorted_idxs[:total_shards * shard_size].reshape(total_shards, shard_size)

        # 3. Assign shards to clients: a single random permutation of the shard ids,
        # viewed as a (K, Nc) table, gives every client Nc distinct shards and uses
        # each shard exactly once, so client subsets are disjoint.
        assignment = self._make_rng(seed).permutation(total_shards).reshape(self.K, Nc)

        return {
            i: shards[assignment[i]].reshape(-1).astype('int64')
            for i in range(self.K)
        }





In [8]:
# Demo / smoke test: build the datasets, create one IID and one Non-IID partition,
# and print basic statistics for client 0.
if __name__ == "__main__":
    # 1. Initialize Builder
    # Note: First run will download CIFAR-100 to ./data
    builder = FederatedDataBuilder(val_split_ratio=0.1, K=100)

    # 2. Get I.I.D. partition
    iid_dict = builder.get_iid_partition()
    print(f"Client 0 IID sample count: {len(iid_dict[0])}")

    # 3. Get Non-I.I.D. partition (e.g., Nc=5)
    # You need to test with Nc={1, 5, 10, 50}
    non_iid_dict = builder.get_non_iid_partition(Nc=5)
    print(f"Client 0 Non-IID sample count: {len(non_iid_dict[0])}")

    # 4. Verification
    # Check if Client 0 actually has samples from approximately Nc classes.
    # Partition indices are positions in the training subset, so they index
    # builder.train_targets directly.
    client_0_indices = list(non_iid_dict[0])
    c0_labels = builder.train_targets[client_0_indices]
    unique_labels = np.unique(c0_labels)

    print(f"Client 0 has labels: {unique_labels}")
    print(f"Number of unique classes for Client 0: {len(unique_labels)}")

Overwriting /content/drive/MyDrive/preprocessing.py


In [None]:
%%writefile /content/drive/MyDrive/preprocessing.py

import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from torch.utils.data import Subset, random_split

class FederatedDataBuilder:
    """
    Build CIFAR-100 train/validation/test datasets and per-client index partitions.

    The original CIFAR-100 training set is split once (fixed seed) into a training
    subset and a validation subset. Both partition methods return indices that are
    positions *within* ``self.train_dataset`` (the training subset), aligned with
    ``self.train_targets`` -- they are not indices into the full CIFAR-100 set.
    """

    def __init__(self, root='./data', val_split_ratio=0.1, K=100):
        """
        Initialize the data builder.

        :param root: Directory to download/store the dataset.
        :param val_split_ratio: Fraction of the original CIFAR-100 training set that
            is held out as the validation set.
        :param K: Total number of clients the training subset is partitioned across.
        """
        self.root = root
        self.K = K
        self.val_split_ratio = val_split_ratio

        # 1. Data Preprocessing
        # Note: DINO ViT might require specific transforms (e.g., resize to 224x224) depending on memory.
        # Using standard CIFAR transforms here as a baseline.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))
        ])

        # 2. Load the original CIFAR-100 training and test sets (downloaded on first use).
        full_train_dataset = torchvision.datasets.CIFAR100(
            root=self.root, train=True, download=True, transform=self.transform
        )
        self.test_dataset = torchvision.datasets.CIFAR100(
            root=self.root, train=False, download=True, transform=self.transform
        )

        # 3. Create Validation Split
        # CIFAR-100 does not have a validation split, so we must create one.
        # With the default ratio of 0.1 this yields 5000 validation / 45000 training samples.
        val_size = int(len(full_train_dataset) * val_split_ratio)
        train_size = len(full_train_dataset) - val_size

        # This 'train_dataset' will be used for subsequent FL client partitioning.
        # A fixed seed is used so the train/val split is reproducible.
        self.train_dataset, self.val_dataset = random_split(
            full_train_dataset, [train_size, val_size],
            generator=torch.Generator().manual_seed(42)
        )

        # Labels of the training subset, in subset order: position j of
        # 'train_targets' is the label of self.train_dataset[j].
        self.train_indices = self.train_dataset.indices
        self.train_targets = np.array(full_train_dataset.targets)[self.train_indices]

    @staticmethod
    def _make_rng(seed):
        """Return NumPy's global RNG when ``seed`` is None, else a dedicated seeded RandomState."""
        return np.random if seed is None else np.random.RandomState(seed)

    def get_iid_partition(self, seed=None):
        """
        I.I.D. Sharding: each client is given the same number of training samples,
        drawn uniformly at random (without replacement) from the training subset.

        Every client receives ``len(train_dataset) // K`` samples; when the division
        is not exact, the remaining ``len(train_dataset) % K`` samples stay unassigned.

        :param seed: Optional seed for a reproducible partition. When None, NumPy's
            global random state is used.
        :return: dict mapping client id (0..K-1) to a set of indices into
            ``self.train_dataset``.
        :raises ValueError: if K is not positive or exceeds the number of samples
            (which would otherwise silently produce empty clients).
        """
        num_samples = len(self.train_dataset)
        if self.K <= 0:
            raise ValueError(f"K must be a positive number of clients, got {self.K}")
        if self.K > num_samples:
            raise ValueError(
                f"Cannot split {num_samples} samples across K={self.K} clients"
            )

        print(f"Generating I.I.D. partition for {self.K} clients...")
        num_items = num_samples // self.K

        # Randomly permute all indices, then hand out consecutive equal-sized slices.
        all_idxs = self._make_rng(seed).permutation(num_samples).tolist()
        return {
            i: set(all_idxs[i * num_items:(i + 1) * num_items])
            for i in range(self.K)
        }

    def get_non_iid_partition(self, Nc, seed=None):
        """
        Non-I.I.D. Sharding: each client is given the same number of training
        samples, drawn from approximately Nc classes.

        The training indices are sorted by label and cut into ``K * Nc`` equal-sized
        shards; every client receives Nc distinct shards chosen at random without
        replacement, so client subsets are disjoint. Because shards are cut at fixed
        sizes, a shard can straddle a class boundary and two shards of one client can
        come from the same class -- the number of distinct classes per client is
        therefore only approximately Nc. Samples beyond ``K * Nc * shard_size`` are
        left unassigned.

        :param Nc: The number of shards (target number of classes) per client;
            controls heterogeneity.
        :param seed: Optional seed for a reproducible partition. When None, NumPy's
            global random state is used.
        :return: dict mapping client id (0..K-1) to an int64 array of indices into
            ``self.train_dataset``.
        :raises ValueError: if K or Nc is not positive, or if K * Nc exceeds the
            number of samples (shards would be empty).
        """
        num_samples = len(self.train_dataset)
        if self.K <= 0:
            raise ValueError(f"K must be a positive number of clients, got {self.K}")
        if Nc <= 0:
            raise ValueError(f"Nc must be a positive number of classes, got {Nc}")
        total_shards = self.K * Nc
        if total_shards > num_samples:
            raise ValueError(
                f"K * Nc = {total_shards} shards exceed the {num_samples} available samples"
            )

        print(f"Generating Non-I.I.D. partition for {self.K} clients with Nc={Nc}...")

        # 1. Sort indices by label so samples of the same class are contiguous.
        # A stable sort keeps the result deterministic for a given label array.
        sorted_idxs = np.argsort(self.train_targets, kind='stable')

        # 2. Cut the label-sorted indices into K * Nc equal shards (one row per shard).
        shard_size = num_samples // total_shards
        shards = sorted_idxs[:total_shards * shard_size].reshape(total_shards, shard_size)

        # 3. Assign shards to clients: a single random permutation of the shard ids,
        # viewed as a (K, Nc) table, gives every client Nc distinct shards and uses
        # each shard exactly once, so client subsets are disjoint.
        assignment = self._make_rng(seed).permutation(total_shards).reshape(self.K, Nc)

        return {
            i: shards[assignment[i]].reshape(-1).astype('int64')
            for i in range(self.K)
        }



