In [None]:
# Gini coefficient that the generated class distribution should reach. It is
# passed to gini_algorithm and also becomes part of the saved dataset's filename.
# Original author's note: values range from 0 to 0.8 in steps of 0.2.
target_gini = 0.79
# Dataset to load; load_data handles 'MNIST' and 'coil20'.
dataset_name = 'MNIST'

# NOTE(review): `epoch` is not referenced by any cell visible in this notebook
# section - confirm it is still needed.
epoch = 10


In [None]:
# Mount Google Drive inside the Colab runtime. The dataset paths used later in
# this notebook live under /content/gdrive/MyDrive/datasets.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import numpy as np
import scipy


from scipy.sparse import lil_matrix
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
#the "sparseoperations" Cython library was tested in Ubuntu 16.04. Please note that you may encounter some "solvable" issues if you compile it in Windows.
#import sparseoperations
import datetime

from scipy import sparse
import time
from sklearn import preprocessing
from scipy.io import loadmat
from scipy.io import savemat

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

from PIL import Image
from sklearn.model_selection import train_test_split
import urllib.request as urllib2
import errno
import os
import sys; sys.path.append(os.getcwd())
import argparse

from scipy.optimize import linear_sum_assignment #The linear_assignment function is deprecated in 0.21 and will be removed from 0.23, but sklearn.utils.linear_assignment_ can be replaced by scipy.optimize.linear_sum_assignment


In [None]:
def calculate_gini(x):
    """Return the Gini coefficient of the values in ``x``.

    The coefficient is the sum of absolute differences over every unordered
    pair of values, divided by ``len(x) ** 2 * mean(x)``.

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of values (intended for non-negative data
        such as class counts or class proportions).

    Returns
    -------
    float
        0.0 when all values are equal, larger values for more unequal data.
        Degenerate input (empty, or mean of zero) returns 0.0 instead of
        dividing by zero.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        return 0.0
    mean_value = np.mean(values)
    if mean_value == 0:
        return 0.0

    total = 0.0
    # Pair every element with all elements that follow it, so each unordered
    # pair is counted exactly once.
    for i, xi in enumerate(values[:-1], 1):
        total += np.sum(np.abs(xi - values[i:]))
    # The return has to sit after the loop: returning from inside it would only
    # account for the pairs that involve the first element.
    return total / (len(values) ** 2 * mean_value)

def gini_algorithm(target_gini, num_categories, tolerance=1e-3, max_iterations=200):
    """Find a class distribution whose Gini coefficient is close to ``target_gini``.

    The candidate distributions have the form ``[m, 1, 1, ..., 1]`` (one
    dominant category, all others equal). The value ``m`` is found by bisection
    and the result is normalised so that it sums to 1.

    Parameters
    ----------
    target_gini : float
        Gini coefficient to reach. For this family of distributions the
        coefficient stays below ``(num_categories - 1) / num_categories``, so
        the target must lie in ``[0, (num_categories - 1) / num_categories)``.
    num_categories : int
        Number of categories in the resulting distribution (at least 2).
    tolerance : float, optional
        Accepted absolute difference between the reached and the target Gini.
    max_iterations : int, optional
        Upper limit on bisection steps (and on bracket-growing steps).

    Returns
    -------
    tuple
        ``(gini_coeff, distribution)``: the Gini coefficient of the normalised
        distribution and the distribution itself as a NumPy array.

    Raises
    ------
    ValueError
        If ``num_categories`` is smaller than 2 or the target cannot be reached.
    RuntimeError
        If the search does not converge within ``max_iterations`` steps.
    """
    print("Start gini-process")

    if num_categories < 2:
        raise ValueError("num_categories must be at least 2")
    max_reachable = (num_categories - 1) / num_categories
    if not 0 <= target_gini < max_reachable:
        raise ValueError(
            f"target_gini must be in [0, {max_reachable}) for {num_categories} categories, "
            f"got {target_gini}")

    def candidate(first_value):
        # One dominant category followed by (num_categories - 1) equal ones.
        return np.array([first_value] + [1] * (num_categories - 1))

    # Set the initial lower and upper bounds for the first element
    lower_bound = 0
    upper_bound = 100

    # Widen the bracket when the target lies above what the initial upper bound
    # can produce; otherwise the bisection below could never terminate.
    for _ in range(max_iterations):
        if calculate_gini(candidate(upper_bound)) >= target_gini:
            break
        upper_bound *= 2

    # Use binary search to find the appropriate value for the first element
    for _ in range(max_iterations):
        mid = (lower_bound + upper_bound) / 2

        if mid < 1:
            # Below 1 the Gini coefficient grows again as the first value
            # shrinks, so bisection is only valid for first values >= 1.
            # Treat such probes as "too low" to stay on the increasing branch.
            lower_bound = mid
            continue

        x = candidate(mid)
        gini_coeff = calculate_gini(x)

        if abs(gini_coeff - target_gini) < tolerance:
            distribution = x / np.sum(x)
            print("Relative distribution over num_categories:", distribution)
            gini_coeff = calculate_gini(distribution)
            print("Gini coefficient:", gini_coeff)
            return gini_coeff, distribution

        elif gini_coeff > target_gini:
            upper_bound = mid
        else:
            lower_bound = mid

    raise RuntimeError(
        f"gini_algorithm did not converge to target_gini={target_gini} "
        f"within {max_iterations} iterations")


In [None]:

def load_data(name):
    """Load a dataset, flatten the images and standardise the features.

    Parameters
    ----------
    name : str
        ``"coil20"`` reads ``Coil20.mat`` from the mounted Google Drive and
        splits it 80/20 (stratified, ``random_state=42``). ``"MNIST"`` loads
        the predefined train/test split from ``tf.keras.datasets.mnist``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where both feature matrices are
        transformed with a ``StandardScaler`` fitted on the training data only.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names.
    """
    supported_names = ("coil20", "MNIST")
    # Fail early with a clear message; otherwise an unknown name would only
    # surface later as an UnboundLocalError on `scaler`.
    if name not in supported_names:
        raise ValueError(f"Unknown dataset name {name!r}; expected one of {supported_names}")

    if name == "coil20":
        mat = scipy.io.loadmat("/content/gdrive/MyDrive/datasets/Coil20.mat")
        print(mat['images'])
        X = mat['images']
        y = mat['labels']
        # Each label is parsed as an integer from its 4th character onward
        # (presumably labels look like "objNN" - confirm against the .mat file).
        y = np.array([int(label[3:]) for label in y])
        print("labels", y)

        # Flatten every image into a single feature vector.
        num_samples, height, width = X.shape
        X = X.reshape((num_samples, height * width))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
        scaler = preprocessing.StandardScaler().fit(X_train)
        print("type X_train", type(X_train))
        print("type y_train", type(y_train))
        print("first element X_train", X_train[0])
        print("first element y_train", y_train[0])

    elif name == "MNIST":
        # Imported here so TensorFlow is only required when MNIST is requested.
        import tensorflow as tf
        (X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
        # The reshaping operation converts the images from a grid of pixels to a vector representation
        X_train = X_train.reshape((X_train.shape[0],X_train.shape[1]*X_train.shape[2]))
        X_test  = X_test.reshape((X_test.shape[0],X_test.shape[1]*X_test.shape[2]))
        X_train = X_train.astype('float32')
        X_test  = X_test.astype('float32')
        # This scaler object is used to standardize the data based on the mean and standard deviation of the training set.
        scaler = preprocessing.StandardScaler().fit(X_train)
        print("type X_train", type(X_train))
        print("type y_train", type(y_train))

    # Apply the training-set statistics to both splits.
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, y_train, X_test, y_test


def check_path(filename):
    """Create the parent directory of ``filename`` if it does not exist yet.

    Parameters
    ----------
    filename : str
        Path of a file that is about to be written. A bare file name without a
        directory component is accepted and needs no directory to be created.

    Returns
    -------
    None
    """
    directory = os.path.dirname(filename)
    # An empty directory component means "current directory"; calling
    # os.makedirs("") would raise FileNotFoundError.
    if directory and not os.path.exists(directory):
        # exist_ok=True tolerates the directory being created by someone else
        # between the existence check and this call.
        os.makedirs(directory, exist_ok=True)


In [None]:
import numpy as np

def imbalanced_data(X_train, y_train, X_test, y_test, distribution, max_iterations=25,
                    target_total=6634, random_state=None):
    """Subsample the training set so its class proportions follow ``distribution``.

    Samples are only ever removed (drawn without replacement), never
    duplicated. The test set is returned unchanged.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features (one row per sample) and their class labels.
    X_test, y_test : array-like
        Test features and labels; copied into NumPy arrays and returned as is.
    distribution : array-like
        Desired relative class frequencies. Entry ``k`` belongs to the ``k``-th
        label in sorted order (``np.unique(y_train)``), so labels do not have
        to be ``0..K-1``.
    max_iterations : int, optional
        Maximum number of resampling passes.
    target_total : int or None, optional
        Desired size of the subsampled training set; per-class targets are
        ``round(distribution * target_total)``, clipped to the available
        samples. ``None`` picks the largest total for which no class has to be
        clipped.
    random_state : int or None, optional
        Seed for the sampling. ``None`` uses NumPy's global random state.

    Returns
    -------
    tuple
        ``(adjusted_X_train, adjusted_y_train, adjusted_X_test, adjusted_y_test)``
        as NumPy arrays; training samples are grouped by class.

    Raises
    ------
    ValueError
        If ``distribution`` does not have one entry per class in ``y_train``,
        or if ``target_total`` is ``None`` and ``distribution`` has no
        positive entry.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    distribution = np.asarray(distribution, dtype=float)

    class_labels = np.unique(y_train)
    if len(distribution) != len(class_labels):
        raise ValueError(
            f"distribution has {len(distribution)} entries but y_train contains "
            f"{len(class_labels)} classes")

    # Both np.random (module) and RandomState expose .choice, so the sampling
    # code below does not need to know which one it got.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Number of available samples per class, in the order of class_labels
    original_counts_y_train = np.array([np.sum(y_train == class_label) for class_label in class_labels])
    # Initialize the adjusted counts with the original counts
    adjusted_counts_y_train = original_counts_y_train.copy()
    print("original_counts_y_train", adjusted_counts_y_train)
    # Calculate the target counts based on the desired distribution
    target_counts_y_train = np.round(distribution * np.sum(original_counts_y_train)).astype(int)
    # Limit the target counts to not exceed the original counts
    target_counts_y_train = np.minimum(target_counts_y_train, original_counts_y_train)
    print("target_counts _y_train", target_counts_y_train)

    if target_total is None:
        has_share = distribution > 0
        if not has_share.any():
            raise ValueError("distribution must contain at least one positive entry")
        # Largest total for which every class can still supply its share.
        target_total = int(np.floor(np.min(original_counts_y_train[has_share] / distribution[has_share])))

    # Positions (into the original arrays) of the samples that are still kept.
    # Working on indices avoids copying feature rows through Python lists.
    kept_indices = np.arange(len(y_train))

    # --------------------- Iteration adjustment training data ---------------------
    # Iteratively adjust the dataset until the desired distribution is reached or the maximum number of iterations is reached
    iteration = 0
    while not np.allclose(adjusted_counts_y_train / np.sum(adjusted_counts_y_train), distribution,
                          atol=1e-2) and int(iteration) < int(max_iterations):

        print("WHILE LOOP FOR TRAINING DATA STARTED ITERATION:", iteration)

        kept_labels = y_train[kept_indices]
        target_counts_y_train = np.round(distribution * target_total).astype(int)
        # Limit the target counts to not exceed the currently available counts
        target_counts_y_train = np.minimum(target_counts_y_train, adjusted_counts_y_train)
        print("target_counts_y_train loop", target_counts_y_train)

        sampled_per_class = []
        # Index the targets by position, not by label value, so that labels
        # such as 1..K or non-contiguous ids are handled correctly.
        for position, class_label in enumerate(class_labels):
            candidates = kept_indices[kept_labels == class_label]
            sampled_per_class.append(
                rng.choice(candidates, size=target_counts_y_train[position], replace=False))
        kept_indices = np.concatenate(sampled_per_class).astype(np.intp)

        # Update the adjusted counts based on the adjusted dataset
        previous_counts_y_train = adjusted_counts_y_train
        adjusted_counts_y_train = np.array([len(sampled) for sampled in sampled_per_class])
        print("adjusted_counts_y_train", adjusted_counts_y_train)

        iteration += 1

        # Once a pass leaves the counts unchanged, every later pass would
        # produce the same counts again, so stop instead of reshuffling.
        if np.array_equal(adjusted_counts_y_train, previous_counts_y_train):
            break

    print("WHILE LOOP FOR TRAINING DATA STOPPED AT ITERATION:", iteration)
    print("adjusted_counts_y_train / np.sum(adjusted_counts_y_train)", adjusted_counts_y_train / np.sum(adjusted_counts_y_train))
    print("distribution", distribution)

    adjusted_X_train = X_train[kept_indices]
    adjusted_y_train = y_train[kept_indices]
    adjusted_X_test = np.array(X_test)
    adjusted_y_test = np.array(y_test)

    print("Number of cases in adjusted_y_train:", adjusted_y_train.shape[0])
    print("Number of cases in adjusted_y_test:", adjusted_y_test.shape[0])

    return adjusted_X_train, adjusted_y_train, adjusted_X_test, adjusted_y_test


In [None]:
# Build the imbalanced training set for the configured dataset and target Gini.
# Seed for the class subsampling, so the dataset saved later can be regenerated.
RANDOM_SEED = 42

X_train, Y_train, X_test, Y_test = load_data(dataset_name)
print("1. Data loaded")
# One category per distinct training label.
num_categories = len(set(Y_train))
gini, distribution = gini_algorithm(target_gini, num_categories)
print("2. Gini_algorithm done")
# imbalanced_data draws its samples from NumPy's global random state; seeding
# it right before the call makes a Restart & Run All produce the same subset.
np.random.seed(RANDOM_SEED)
adjusted_X_train, adjusted_y_train, adjusted_X_test, adjusted_y_test = imbalanced_data(X_train, Y_train, X_test, Y_test, distribution)

type X_train <class 'numpy.ndarray'>
type y_train <class 'numpy.ndarray'>
1. Data loaded
Start gini-process
Relative distribution over num_categories: [0.89082638 0.0121304  0.0121304  0.0121304  0.0121304  0.0121304
 0.0121304  0.0121304  0.0121304  0.0121304 ]
Gini coefficient: 0.7908263836239575
2. Gini_algorithm done
original_counts_y_train [5923 6742 5958 6131 5842 5421 5918 6265 5851 5949]
target_counts _y_train [5923  728  728  728  728  728  728  728  728  728]
WHILE LOOP FOR TRAINING DATA STARTED ITERATION: 0
target_counts_y_train loop [5910   80   80   80   80   80   80   80   80   80]
adjusted_counts_y_train [5910   80   80   80   80   80   80   80   80   80]
WHILE LOOP FOR TRAINING DATA STOPPED AT ITERATION: 1
adjusted_counts_y_train / np.sum(adjusted_counts_y_train) [0.89140271 0.01206637 0.01206637 0.01206637 0.01206637 0.01206637
 0.01206637 0.01206637 0.01206637 0.01206637]
distribution [0.89082638 0.0121304  0.0121304  0.0121304  0.0121304  0.0121304
 0.0121304  0.0121

In [None]:
# Save the modified dataset
amount_of_cases = adjusted_y_train.shape[0]

# Construct the new filename from the configured dataset and target Gini, so a
# coil20 run is not stored under an MNIST name.
filename = f"/content/gdrive/MyDrive/datasets/{dataset_name}_{target_gini}.npz"

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Store the subsampled training split together with the untouched test split
np.savez(filename, X_train=adjusted_X_train, y_train=adjusted_y_train, X_test=adjusted_X_test, y_test=adjusted_y_test)

print("Modified dataset saved successfully.")

Modified dataset saved successfully.
