## Dataset Generation

Generation of datasets for multiclass classification and regression cases

### Multiclass classification

In [1]:
from sklearn.datasets import make_classification
import pandas as pd

Because we want to know which features are the truly informative ones, we generate the data without shuffling.
Then we save the names of the features, using the prefix x for informative features and y for the rest. `random_state` is set to 42 so that `make_classification` always generates the same data.

In [24]:
"""
Dataset 1
20000 samples with 100 features with 15 informative features, 10 classes. 
There are no redundant or repeated features,
all classes have the same weights.
"""
data1 = make_classification(n_samples=20000,n_features=100, n_classes=10, n_informative=15, shuffle=False, random_state=42)

# create feature names
columns = []
for i in range(100):
    if i < 15:
        columns.append("x"+str(i))
    else:
        columns.append("y"+str(i))

# create dataframe
data1_df = pd.DataFrame(data1[0], columns=columns)

# shuffle features
data1_df = data1_df.sample(frac=1, axis=1)

# add label
data1_df["label"] = data1[1]

# shuffle rows
data1_df = data1_df.sample(frac=1).reset_index(drop=True)

# split into train and test
data1_df_train = data1_df.iloc[:14999,]
data1_df_test = data1_df.iloc[15000:,]

#write data
data1_df_train.to_csv("dataset_1_train.csv", header=False,index=False)
data1_df_test.to_csv("dataset_1_test.csv", index=False)

In [27]:

"""
Dataset 2
50000 samples with 500 features with 80 informative features, 6 classes. 
There are 50 redundant and repeated features,
all classes have the same weights.
"""
data2 = make_classification(n_samples=50000,n_features=500, n_classes=6, n_informative=80, n_redundant=50, n_repeated=50, shuffle=False, random_state=42)

# create feature names
columns = []
for i in range(500):
    if i < 80:
        columns.append("x"+str(i))
    else:
        columns.append("y"+str(i))

# create dataframe
data2_df = pd.DataFrame(data2[0], columns=columns)

# shuffle features
data2_df = data2_df.sample(frac=1, axis=1)

# add label
data2_df["label"] = data2[1]

# shuffle rows
data2_df = data2_df.sample(frac=1).reset_index(drop=True)

# split into train and test
data2_df_train = data2_df.iloc[:39999,]
data2_df_test = data2_df.iloc[40000:,]

#write data
data2_df_train.to_csv("dataset_2_train.csv", header=False,index=False)
data2_df_test.to_csv("dataset_2_test.csv", index=False)

In [28]:
"""
Dataset 3
5000 samples with 250 features with 20 informative features, 8 classes. 
There are 50 redundant and repeated features,
all classes have the different weights.

"""
weights = [0.1, 0.05, 0.15, 0.2, 0.025, 0.125, 0.075, 0.275]
data3 = make_classification(n_samples=5000,n_features=250, n_classes=8, n_informative=20, n_redundant=50, n_repeated=50, shuffle=False, random_state=42, weights=weights)

# create feature names
columns = []
for i in range(250):
    if i < 20:
        columns.append("x"+str(i))
    else:
        columns.append("y"+str(i))

# create dataframe
data3_df = pd.DataFrame(data3[0], columns=columns)

# shuffle features
data3_df = data3_df.sample(frac=1, axis=1)

# add label
data3_df["label"] = data3[1]

# shuffle rows
data3_df = data3_df.sample(frac=1).reset_index(drop=True)

# split into train and test
data3_df_train = data3_df.iloc[:4199,]
data3_df_test = data3_df.iloc[4200:,]

#write data
data3_df_train.to_csv("dataset_3_train.csv", header=False,index=False)
data3_df_test.to_csv("dataset_3_test.csv", index=False)

In [6]:
"""
Dataset 4
50000 samples with 200 features with 200 informative features, 10 classes. 
"""
data4 = make_classification(n_samples=50000,n_features=210, n_classes=10, n_informative=200, random_state=42)


# create dataframe
data4_df = pd.DataFrame(data4[0])

# add label
data4_df["label"] = data4[1]

# shuffle rows
data4_df = data4_df.sample(frac=1).reset_index(drop=True)

# split into train and test
data4_df_train = data4_df.iloc[:44999,]
data4_df_test = data4_df.iloc[45000:,]

#write data
data4_df_train.to_csv("dataset_4_train.csv", header=False,index=False)
data4_df_test.to_csv("dataset_4_test.csv", index=False)

### Regression Case

In [1]:
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

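Here the informative features can be read from the coefficients that `make_regression` returns when `coef=True` (the features with a non-zero coefficient). So we don't need to generate the data unshuffled and shuffle it ourselves.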

In [17]:
"""
Dataset1

"""

data1 = make_regression(n_samples=20000, n_features=150, n_informative=25, coef=True, random_state=42)

data1_df = pd.DataFrame(data1[0], columns = ['x'+str(i) for i in range(150)])

data1_df_norm = pd.DataFrame(MinMaxScaler().fit_transform(data1_df))
data1_df_norm.columns = data1_df.columns


data1_df["y"] = data1[1]
data1_df_norm["y"] = data1[1]

informative_ftrs = data1_df.columns[np.where(data1[2] != 0)[0]]

print(informative_ftrs)

# shall labels also be scaled?
scaler = MinMaxScaler()
data1_df_norm = pd.DataFrame(scaler.fit_transform(data1_df))
data1_df_norm.columns = data1_df.columns

# split into train and test
data1_df_train = data1_df.iloc[:16999,]
data1_df_test = data1_df.iloc[17000:,]
data1_df_norm_train = data1_df_norm.iloc[:16999,]
data1_df_norm_test = data1_df_norm.iloc[17000:,]

#write data
data1_df_train.to_csv("dataset_1_train.csv", header=False,index=False)
data1_df_test.to_csv("dataset_1_test.csv", index=False)
data1_df_norm_train.to_csv("dataset_1_norm_train.csv", header=False,index=False)
data1_df_norm_test.to_csv("dataset_1_norm_test.csv", index=False)

Index(['x27', 'x30', 'x31', 'x33', 'x36', 'x37', 'x42', 'x43', 'x44', 'x47',
       'x50', 'x62', 'x70', 'x71', 'x75', 'x83', 'x84', 'x92', 'x106', 'x111',
       'x121', 'x127', 'x129', 'x130', 'x134'],
      dtype='object')
