## Generative model for synthetic datasets with varying dimensions

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn import datasets

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [51]:
def datasets_generator(n_major_samples, n_rare_samples, n_dimensions):
    """Generate a reproducible synthetic dataset with a major and a rare cluster.

    The first two features hold two Gaussian clusters: a wide "major" cluster
    centred at (0, 0) with variance 10000 on each axis, and a very tight
    "rare" cluster centred at (600, 600) with variance 1e-5 on each axis.
    Every feature beyond the second is uniform noise drawn from [-300, 300).

    All sampling goes through a generator seeded with 42 on every call, so
    identical arguments always produce identical data.

    Parameters
    ----------
    n_major_samples : int
        Number of rows drawn from the major cluster.
    n_rare_samples : int
        Number of rows drawn from the rare cluster.
    n_dimensions : int
        Total number of feature columns. Values of 2 or less yield only the
        two cluster columns (no noise columns are added).

    Returns
    -------
    dataset : numpy.ndarray
        Shape ``(n_rare_samples + n_major_samples, max(n_dimensions, 2))``;
        the rare rows come first, followed by the major rows.
    labelset : numpy.ndarray
        Shape ``(n_rare_samples + n_major_samples, 1)``; 0.0 marks a rare
        row and 1.0 marks a major row.
    """
    # Draw everything from this seeded generator (not the global np.random
    # state) so that the output is actually reproducible.
    random_state = np.random.RandomState(42)

    # generate two clusters
    major_cluster = random_state.multivariate_normal(
        [0, 0], [[10000, 0], [0, 10000]], n_major_samples
    )
    rare_cluster = random_state.multivariate_normal(
        [600, 600], [[0.00001, 0], [0, 0.00001]], n_rare_samples
    )
    dataset = np.concatenate((rare_cluster, major_cluster), axis=0)
    n_samples = dataset.shape[0]

    # rare rows sit at the top of the dataset, so their labels are the first
    # n_rare_samples entries
    labelset = np.ones((n_samples, 1))
    labelset[:n_rare_samples] = 0

    # add the additional dimensions
    if n_dimensions > 2:
        additional_data = random_state.uniform(
            -300, 300, (n_samples, n_dimensions - 2)
        )
        dataset = np.concatenate((dataset, additional_data), axis=1)

    return dataset, labelset

In [67]:
# 5 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=5
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/5_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")

In [64]:
# 10 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=10
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/10_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")

In [69]:
# 15 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=15
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/15_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")

In [65]:
# 20 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=20
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/20_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")

In [73]:
# 25 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=25
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/25_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")

In [71]:
# 30 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=30
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/30_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")

In [72]:
# 35 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=35
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/35_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")

In [66]:
# 40 feature columns, 100000 major rows and 3000 rare rows; the label is
# attached as the last column and the table is written as text, 4 decimals.
dataset, labelset = datasets_generator(
    n_major_samples=100000, n_rare_samples=3000, n_dimensions=40
)
data_to_save = np.concatenate((dataset, labelset), axis=1)
file_to_save = "/home/haiqw/Documents/my_projects/iforest/test/40_dimensional_synthetic_dataset.txt"
np.savetxt(file_to_save, data_to_save, fmt="%.4f")