In [5]:
import generate_datasets
import pandas as pd
import os

from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, \
  RobustScaler, QuantileTransformer, PowerTransformer

In [6]:
# Directory that is listed to discover the input datasets (one entry per dataset).
DATASETS_DIR = "datasets"
# Root directory under which the original and scaled CSV files are written.
OUT_DIR = "scaled_datasets"
# Seed intended for randomized steps.
# NOTE(review): confirm where this value is consumed in the notebook.
RANDOM_STATE = 10

In [7]:
# Make sure the output root exists before anything is written into it.
# exist_ok=True removes the check-then-create race of exists()+mkdir(), and
# makedirs also creates missing parent directories if OUT_DIR is ever nested.
os.makedirs(OUT_DIR, exist_ok=True)

In [8]:
# Load every dataset found under DATASETS_DIR.
# Layout implied by the path construction below:
#   DATASETS_DIR/<name>/<name>_w_<weight>.csv   (one CSV per imbalance weight)
# Result: datasets[i][j] is the DataFrame of the i-th dataset at weights[j].
ratios, weights = generate_datasets.get_imbalance()

# os.listdir() returns entries in arbitrary order. Sorting keeps the position
# of each dataset in `datasets` (and therefore any index-based naming derived
# from it) stable across runs and machines. Entries that are not directories
# cannot contain the per-weight CSVs, so they are skipped instead of crashing.
dataset_dirs = sorted(
  entry for entry in os.listdir(DATASETS_DIR)
  if os.path.isdir(os.path.join(DATASETS_DIR, entry))
)

datasets = []
for dataset in dataset_dirs:
  dataset_path = os.path.join(DATASETS_DIR, dataset)
  dataset_names = [f'{dataset}_w_{w:.3f}.csv' for w in weights]
  datasets_df = [pd.read_csv(os.path.join(dataset_path, dataset_name)) for dataset_name in dataset_names]
  datasets.append(datasets_df)

In [9]:
# Scalers under comparison, keyed by a short tag.
# QuantileTransformer draws random numbers (subsampling / smoothing noise),
# so it is seeded with RANDOM_STATE to make its output reproducible. The
# other scalers here take no random_state argument.
scalers = {
  'MM': MinMaxScaler(),
  'SS': StandardScaler(),
  'MA': MaxAbsScaler(),
  'RS': RobustScaler(),
  'QT': QuantileTransformer(random_state=RANDOM_STATE),
  'PT': PowerTransformer(method='yeo-johnson', standardize=True)
}

In [10]:
def _scale_features(dataset, scaler):
  """Return ``dataset`` with every column except the last one scaled.

  The last column is treated as the target and is passed through unchanged.
  ``scaler`` is re-fitted on this dataset via ``fit_transform``, so the same
  scaler instance can be reused for successive datasets.

  Scaled feature columns are labelled 0..n-1 (the default labels of a new
  DataFrame); the target column keeps its original name.
  """
  features = dataset.iloc[:, :-1]
  target = dataset.iloc[:, -1]
  # Keep the source index on the scaled features: pd.concat(axis=1) aligns
  # rows by index, so a fresh RangeIndex would introduce NaN rows whenever
  # the dataset does not carry a default 0..n-1 index.
  scaled = pd.DataFrame(scaler.fit_transform(features), index=features.index)
  return pd.concat([scaled, target], axis=1)


# results[tag][i][j] is datasets[i][j] scaled with scalers[tag].
results = {
  name: [
    [_scale_features(frame, scaler) for frame in weight_group]
    for weight_group in datasets
  ]
  for name, scaler in scalers.items()
}

In [11]:
# Write, for every dataset / weight pair, the unscaled frame plus one CSV per
# scaler into  OUT_DIR/dataset_<i+1>/w_<weight>/ .
for i, weight_group in enumerate(datasets):
  for j, weight in enumerate(weights):
    base_path = f"{OUT_DIR}/dataset_{i+1}/w_{weight:.3f}"
    # exist_ok=True avoids the exists()-then-makedirs() race and keeps the
    # cell safe to re-run.
    os.makedirs(base_path, exist_ok=True)
    weight_group[j].to_csv(f"{base_path}/original.csv", index=False)
    for name, result in results.items():
      # `result` is already results[name]; no need to look it up again.
      result[i][j].to_csv(f"{base_path}/{name}.csv", index=False)