In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

In [2]:
for it in range(1, 11): #This loop runs 10 times, generating 10 datasets.
    # Parameters
    num_samples = 1000 #number of samples
    n_classes = 10 #number of classes
    n_dim = 3 #dimension of the initial space
    n_dim_high = 100 #dimension of the final space
    n_domains = 6 #number of domains

    # Generate random means vectors  and covariances matrices  for each component (class)
    means = np.random.randn(n_classes, n_dim) * 10
    covariances = []
    for _ in range(n_classes): 
        rand_matrix = np.random.rand(n_dim, n_dim)
        covariances.append(rand_matrix @ rand_matrix.T + 0.1 * np.eye(n_dim))

    # Normalize weights to sum to 1
    weights = np.random.rand(n_classes)
    weights /= np.sum(weights)

    n_samples_comp = np.random.multinomial(num_samples, np.array(weights))

    while 0 in n_samples_comp: 
        weights = np.random.rand(n_classes) #Weights are randomly assigned to each class
        weights /= np.sum(weights) #Weights are normalized. 
        n_samples_comp = np.random.multinomial(num_samples, np.array(weights)) # multinomial distribution is used to determine the number of samples for each class.

    def non_linear_array(x):
        vec = np.array([x[0], x[1], x[2], x[0]*x[1], x[0]*x[2], x[1]*x[2]])
        return vec

    data_list = []
    for k, (mean, covariance, sample) in enumerate(zip(means, covariances, n_samples_comp)):
        array_3d = np.random.multivariate_normal(mean, covariance, int(sample)) # samples are generated for each class
        array_non_linear = np.zeros((int(sample), 6)) # 6 car non linéarité (x, y, z, xy, xz, yz) 
        for i in range(int(sample)):
            array_non_linear[i] = non_linear_array(array_3d[i]) # samples are transformed through a nonlinear function for each class
        features = np.dot(array_non_linear, np.random.rand(6, n_dim_high))
        classes = k * np.ones(int(sample))
        data_list.append(np.concatenate([features, classes.reshape(-1, 1)], axis=1)) # samples are projected into a higher-dimensional space

    # Features and class labels are stored.
    data = np.vstack(data_list)
    data_feat = data[:, :-1]
    data_class = data[:, -1]

    def generate_random_spd_matrix(n): # generates a random symmetric positive definite matrix.
        A = np.random.randn(n, n)
        spd_matrix = np.dot(A, A.T)  # Ensure the matrix is symmetric
        spd_matrix += np.eye(n) * 0.1  # Add a small multiple of the identity matrix for positive definiteness
        spectral_radius = np.max(np.abs(np.linalg.eigvals(spd_matrix)))
        if spectral_radius > 2:
            spd_matrix /= (spectral_radius/2)
        return spd_matrix

    def affine_transformation(data_feat): # applies an affine transformation to the input features
        product_matrix = generate_random_spd_matrix(n_dim_high)
        sum_vector = np.random.randn(1, n_dim_high) #  generates a random translation vector  drawn from a standard normal distribution,
        new_data_feat = np.dot(data_feat, product_matrix) + sum_vector
        return new_data_feat

    domains = [data]
    for i in range(n_domains-1):
        new_data_feat = affine_transformation(data_feat)
        for c in range(n_classes):
            ind_c = np.where(data_class == c)[0]
            t_c = np.random.rand()
            new_data_feat[ind_c] = t_c * new_data_feat[ind_c]
        new_data = np.concatenate((new_data_feat, data_class.reshape(-1, 1)), axis=1) # Concatenating new features with class labels
        domains.append(new_data)

    # Creating toy_dataset by concatenating data from different domains
    toy_dataset = np.concatenate((domains[0], np.zeros((domains[0].shape[0], 1))), axis=1) 

    # Concatenating data from other domains with their respective domain identifiers
    for i in range(1, len(domains)):
        data_to_concatenate = np.concatenate((domains[i], i * np.ones((domains[0].shape[0], 1))), axis=1)
        toy_dataset = np.concatenate((toy_dataset, data_to_concatenate), axis=0)

In [3]:
np.save(f'toy_non_linear_100d_dataset_{it}.npy', toy_dataset) #Generates multiple data domains, each with different characteristics and distributions

In [4]:
toy_dataset

array([[  -2.17827735,   -2.17647148,   -0.75246805, ...,   -0.14755821,
           0.        ,    0.        ],
       [  -1.60883826,   -1.161039  ,    0.20230317, ...,    0.15217214,
           0.        ,    0.        ],
       [  -2.16490187,   -0.7130249 ,    0.88270958, ...,    0.35683743,
           0.        ,    0.        ],
       ...,
       [-107.24449407,  -33.55023123,  -22.33742109, ...,   -4.76467765,
           9.        ,    5.        ],
       [-110.52812989,  -35.2881432 ,  -22.4857931 , ...,   -4.10235962,
           9.        ,    5.        ],
       [-113.73232136,  -36.05163857,  -24.20457892, ...,   -0.98573243,
           9.        ,    5.        ]])

In [5]:
toy_dataset.shape

(6000, 102)

In [6]:
toy_dataset[:,101]

array([0., 0., 0., ..., 5., 5., 5.])

In [7]:
column_names = [f'feature_{i}' for i in range(toy_dataset.shape[1]-1)] + ['class']
toy_df = pd.DataFrame(toy_dataset, columns=column_names)

In [8]:
toy_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,class
0,-2.178277,-2.176471,-0.752468,-1.292253,-2.855503,-4.421482,-2.641409,-4.808061,-5.660008,0.373252,...,-6.933689,-7.41988,-1.185174,-8.027141,-6.572926,-6.262337,-8.479931,-0.147558,0.0,0.0
1,-1.608838,-1.161039,0.202303,-0.095506,-1.862376,-3.900784,-2.269209,-3.726158,-5.163128,1.146133,...,-7.062294,-6.75581,-1.144182,-7.791942,-6.265639,-6.05596,-8.276314,0.152172,0.0,0.0
2,-2.164902,-0.713025,0.88271,0.719481,-1.602797,-4.790271,-2.613463,-4.013003,-6.516062,1.424417,...,-8.959447,-8.223263,-1.636233,-9.846639,-7.502129,-7.213104,-10.845224,0.356837,0.0,0.0
3,-2.565462,-1.257563,0.273917,0.056492,-2.092867,-5.078811,-2.76936,-4.808027,-7.007969,1.152351,...,-9.393415,-8.759388,-1.563786,-10.184244,-8.259175,-7.624309,-11.307356,0.222111,0.0,0.0
4,2.712907,5.504582,8.820316,9.401104,4.766689,-0.603002,0.227091,6.076122,-0.280486,4.085174,...,-2.11161,-1.808092,-1.199029,-5.444298,1.793554,-2.706814,-4.941377,2.462225,0.0,0.0


In [9]:
toy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Columns: 102 entries, feature_0 to class
dtypes: float64(102)
memory usage: 4.7 MB


Descriptive statistics of the DataFrame:

In [10]:
toy_df.describe()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,class
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,-16.403153,-14.917132,-24.891495,-19.222568,-17.900512,-10.917182,-21.507386,-21.665492,-13.470324,-10.489248,...,-14.354412,-4.752838,-10.763165,-19.02099,-10.483507,-16.363099,-16.08382,-5.393444,4.435,2.5
std,30.350205,23.928366,27.00289,20.977412,27.315007,26.814261,30.293835,36.231181,27.711347,31.503013,...,24.002956,24.006429,23.817472,24.63298,27.025208,26.94783,28.172878,16.383064,3.290225,1.707967
min,-145.382664,-115.839367,-143.710894,-113.540865,-146.681423,-121.244946,-146.531026,-250.255467,-129.806472,-206.851118,...,-128.13228,-152.384913,-125.051782,-138.776973,-129.671225,-101.072013,-163.121661,-98.57487,0.0,0.0
25%,-18.367895,-28.293301,-31.091332,-30.970797,-16.01861,-18.849244,-26.54694,-27.198627,-20.454233,-17.764679,...,-21.523312,-10.729285,-18.083403,-26.98086,-21.684811,-27.03545,-25.747512,-7.824436,2.0,1.0
50%,-6.883234,-6.118586,-17.236112,-11.927554,-9.584004,-3.386097,-9.635037,-12.378528,-5.144606,-3.381553,...,-6.845848,-2.030838,-1.528717,-10.943005,-7.218279,-5.371428,-9.360643,-1.17637,3.0,2.5
75%,-1.13834,0.612494,-6.613588,-2.568535,-2.666321,0.851825,-2.31498,-2.50752,0.55977,1.925669,...,-0.891738,1.530966,2.144665,-3.13048,1.736305,0.150235,-1.494177,1.39182,8.0,4.0
max,89.861012,87.295403,27.439123,60.419234,85.872047,59.142719,26.196923,73.827419,62.741295,57.492361,...,76.803272,112.191042,28.000482,26.427617,80.766588,64.493533,94.722812,56.028309,9.0,5.0
