In [13]:
import pandas as pd
import numpy as np

import tarfile
import shutil
import urllib
import os
import copy

import hashlib
import json

import random
from itertools import chain
from typing import Any, Tuple
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
import torchvision
from torchvision import transforms
import torchvision.transforms.v2 as transforms2
from PIL import Image
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from tabulate import tabulate
from pathlib import Path

import warnings

In [2]:
import Synthetic_CIFAR_Expert.generate_synthetic_experts as synex

In [10]:
class BasicDatasetCIFAR100():
    """
    Contains the main Dataset with GT Label and Expert Label for every Image, sorted by file name
    """
    def __init__(self, path_labels, path_data):

        self.path_labels = path_labels
        self.cifar_dataset = self.load_cifar_100(path_data)
        self.gt_df = pd.DataFrame({"GT": self.cifar_dataset.targets})
        self.gt_df = self.gt_df.rename_axis("Image ID").reset_index()

        print("Number of images of the whole dataset: " + str(len(self.gt_df["Image ID"].values)))
        
    def load_cifar_100(self, path):
        """
        Loads the CIFAR100 dataset from pytorch
        """
        transform = transforms.Compose(
            [transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
        return torchvision.datasets.CIFAR100(root=f'{path}/CIFAR100', train=True, download=True, transform=transform)

    def set_label_seed(self, seed):
        self.data = self.gt_df.copy()
        for name, expert_label in self.expert_labels[seed].items()
            self.data[name] = expert_label
        

    def init_synthetic_labels(self, experts_args, seed):
        path = self.path_labels
        self.expert_labels = {seed: {},}
        for name, expert_args in experts_args.items():
            path_name = self. expert_to_path(name, expert_args.copy(), seed)
            if self.check_if_expert_exists(path, path_name, expert_args["strength"]):
                self.expert_labels[seed][name] = self.load_expert_from_path(path, path_name, expert_args["strength"])
            else:
                self.expert_labels[seed][name] = self.create_synthetic_labels(expert_args, name, seed)["train"]
             
        return self.expert_labels

    def create_synthetic_labels(self, expert_args, name, seed):
        expert_labels = synex.generate_synthetic_expert(strength=expert_args["strength"], binary=expert_args["binary"], num_classes=expert_args["num_classes"], per_s=expert_args["per_s"], per_w=expert_args["per_w"], seed=seed, path=self.path_labels, name=name)
        return expert_labels

    def check_if_expert_exists(self, path, path_name, strength):
        my_file = Path(f'{path}/synthetic_experts/{path_name}/cifar100_expert_{strength}_labels.json')
        return my_file.is_file()

    def load_expert_from_path(self, path, path_name, strength):
        with open(f'{path}/synthetic_experts/{path_name}/cifar100_expert_{strength}_labels.json', 'r') as f:
            labels = json.load(f)
        return labels["train"]

    def expert_to_path(self, name, args, seed):
        args = args.copy()
        args["name"] = name
        args["seed"] = seed
        return self.param_to_path(args)

    def param_to_path(self, args):
        s = ""
        for key, value in args.items():
            s += f"{key}_{value}_"
        s = s[:-1]
        return int(hashlib.sha256(s.encode('utf-8')).hexdigest(), 16) % 10**8
    

    def getExpert(self, id):
        """
        Returns the data for the given expert
        """
        return self.data[["Image ID", "GT", str(id)]].copy()

    def getData(self):
        """
        Returns all data
        """
        return self.data.copy()
    
    def getDataForLabelers(self, labelerIds):
        """
        Returns the data with ["Image ID", "GT", [labelerIds]]
        """
        temp = self.data[["Image ID", "GT"]].copy()
        for labelerId in labelerIds:
            temp[str(labelerId)] = self.data[str(labelerId)]
        return temp

In [11]:
bd = BasicDatasetCIFAR100(path_labels="../../../Datasets/CIFAR100", path_data="../../../Datasets/CIFAR100")

Files already downloaded and verified
Number of images of the whole dataset: 50000


In [14]:
experts = {
    "123": {
        "strength": 60,
        "binary": False,
        "num_classes": 20,
        "per_s": 1.0,
        "per_w": 0.0
    }
}

bd.get_synthetic_labels("../../../Datasets/CIFAR100", experts, 42)

{42: {'123': [11.0,
   15.0,
   4.0,
   13.0,
   1.0,
   3.0,
   9.0,
   3.0,
   10.0,
   12.0,
   5.0,
   12.0,
   2.0,
   5.0,
   10.0,
   5.0,
   6.0,
   0.0,
   16.0,
   17.0,
   15.0,
   17.0,
   2.0,
   5.0,
   17.0,
   6.0,
   1.0,
   17.0,
   8.0,
   0.0,
   1.0,
   17.0,
   17.0,
   13.0,
   0.0,
   5.0,
   19.0,
   5.0,
   7.0,
   3.0,
   15.0,
   13.0,
   3.0,
   16.0,
   17.0,
   9.0,
   14.0,
   9.0,
   16.0,
   15.0,
   7.0,
   7.0,
   6.0,
   3.0,
   19.0,
   7.0,
   17.0,
   11.0,
   14.0,
   17.0,
   16.0,
   14.0,
   0.0,
   16.0,
   4.0,
   10.0,
   4.0,
   17.0,
   15.0,
   4.0,
   2.0,
   16.0,
   1.0,
   7.0,
   16.0,
   7.0,
   18.0,
   14.0,
   16.0,
   8.0,
   5.0,
   16.0,
   8.0,
   11.0,
   10.0,
   8.0,
   12.0,
   3.0,
   0.0,
   7.0,
   14.0,
   7.0,
   14.0,
   19.0,
   0.0,
   4.0,
   0.0,
   13.0,
   7.0,
   15.0,
   0.0,
   5.0,
   8.0,
   16.0,
   3.0,
   15.0,
   4.0,
   19.0,
   17.0,
   6.0,
   17.0,
   16.0,
   6.0,
   7.0,
   1.0,
   16.0,
   9.

In [17]:
bd.gt_df

Unnamed: 0,Image ID,GT
0,0,19
1,1,29
2,2,0
3,3,11
4,4,1
...,...,...
49995,49995,80
49996,49996,7
49997,49997,3
49998,49998,7


In [35]:
i = bd.gt_df.loc[bd.gt_df["Image ID"] == 3456]["GT"].item()

In [36]:
type(i)

int