In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [20]:
# HDF5 files for the "style" datasets, grouped by sub-directory of
# data/simulated_dataset.  The file stems look like the perturbation level of
# each dataset -- TODO confirm against the dataset generator.
style_datasets_path = [
    # --- causal_shift ---
    "data/simulated_dataset/causal_shift/0.00.h5",
    "data/simulated_dataset/causal_shift/0.10.h5",
    "data/simulated_dataset/causal_shift/0.20.h5",
    "data/simulated_dataset/causal_shift/0.30.h5",
    "data/simulated_dataset/causal_shift/0.40.h5",
    "data/simulated_dataset/causal_shift/0.50.h5",
    "data/simulated_dataset/causal_shift/0.60.h5",
    "data/simulated_dataset/causal_shift/0.70.h5",
    "data/simulated_dataset/causal_shift/0.80.h5",
    "data/simulated_dataset/causal_shift/0.90.h5",
    # --- output_noise ---
    "data/simulated_dataset/output_noise/0.03.h5",
    "data/simulated_dataset/output_noise/0.05.h5",
    "data/simulated_dataset/output_noise/0.08.h5",
    "data/simulated_dataset/output_noise/0.10.h5",
    "data/simulated_dataset/output_noise/0.12.h5",
    "data/simulated_dataset/output_noise/0.15.h5",
    "data/simulated_dataset/output_noise/0.18.h5",
    "data/simulated_dataset/output_noise/0.20.h5",
    "data/simulated_dataset/output_noise/0.23.h5",
    "data/simulated_dataset/output_noise/0.25.h5",
    # --- input_noise ---
    "data/simulated_dataset/input_noise/0.25.h5",
    "data/simulated_dataset/input_noise/0.50.h5",
    "data/simulated_dataset/input_noise/0.75.h5",
    "data/simulated_dataset/input_noise/1.00.h5",
    "data/simulated_dataset/input_noise/1.25.h5",
    "data/simulated_dataset/input_noise/1.50.h5",
    "data/simulated_dataset/input_noise/1.75.h5",
    "data/simulated_dataset/input_noise/2.00.h5",
    "data/simulated_dataset/input_noise/2.25.h5",
    "data/simulated_dataset/input_noise/2.50.h5",
    # --- time_shift ---
    "data/simulated_dataset/time_shift/0.h5",
    "data/simulated_dataset/time_shift/2.h5",
    "data/simulated_dataset/time_shift/4.h5",
    "data/simulated_dataset/time_shift/6.h5",
    "data/simulated_dataset/time_shift/8.h5",
    "data/simulated_dataset/time_shift/10.h5",
    "data/simulated_dataset/time_shift/12.h5",
    "data/simulated_dataset/time_shift/14.h5",
    "data/simulated_dataset/time_shift/16.h5",
    "data/simulated_dataset/time_shift/18.h5",
]

# HDF5 file(s) for the "content" (source-domain) dataset.
content_path = ["data/simulated_dataset/01 - Source Domain.h5"]

In [21]:
def remove_format(path:str):
    """Return ``path`` with its final file extension removed.

    Only the extension of the last path component is stripped.  Dots that
    appear in directory names are left untouched, and a path without any
    extension is returned unchanged (splitting the whole string on ``'.'``
    would return an empty string or a truncated directory name instead).

    Parameters
    ----------
    path : str
        File path, e.g. ``"data/simulated_dataset/time_shift/0.h5"``.

    Returns
    -------
    str
        The path without its trailing extension, e.g.
        ``"data/simulated_dataset/time_shift/0"``.
    """
    # Local import keeps this helper self-contained inside its notebook cell.
    import os

    stem, _extension = os.path.splitext(path)
    return stem

def make_train_valid_split_synthetic(dataset:pd.DataFrame, train_split:float=0.8):
    """Split ``dataset`` by row order into a leading train part and a trailing validation part.

    No shuffling is performed: the first ``int(len(dataset) * train_split)``
    rows form the train set and the remaining rows form the validation set.
    Rows are selected by position (``.iloc``), so the result does not depend
    on the type or values of the DataFrame index.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame to split.
    train_split : float, default 0.8
        Fraction of rows assigned to the train set; must lie in ``[0, 1]``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train_dataset, valid_dataset)``.

    Raises
    ------
    ValueError
        If ``train_split`` is outside ``[0, 1]``.
    """
    # Original author note (translated from French): for the labels in the
    # parameters, put a portion in the validation set and remove it from the
    # train set.  This reads like a pending TODO; nothing label-specific is
    # done here -- confirm with the author.
    if not 0.0 <= train_split <= 1.0:
        raise ValueError(f"train_split must be in [0, 1], got {train_split!r}")

    train_end = int(len(dataset) * train_split)

    # Positional slicing: plain ``dataset[:n]`` is label-based for some index
    # types (e.g. float indexes) and would then pick the wrong rows.
    train_dataset = dataset.iloc[:train_end]
    valid_dataset = dataset.iloc[train_end:]

    return train_dataset, valid_dataset

In [22]:
def get_scaler(dsets:list):
    """Fit a ``StandardScaler`` on the row-wise concatenation of several HDF5 datasets.

    Each entry of ``dsets`` is loaded with ``pd.read_hdf``; the loaded frames
    are stacked with ``pd.concat`` and the scaler is fit on the stacked
    values, so every column present in the frames takes part in the fit.

    Parameters
    ----------
    dsets : list
        Paths of the HDF5 files to load.

    Returns
    -------
    StandardScaler
        The fitted scaler.
    """
    frames = []
    for dataset_path in dsets:
        frames.append(pd.read_hdf(dataset_path))

    stacked = pd.concat(frames)

    # ``fit`` returns the estimator itself, so the fitted scaler is returned.
    return StandardScaler().fit(stacked.values)


def scale_dataset(df:pd.DataFrame, scaler:StandardScaler):
    """Return a scaled copy of ``df`` whose ``"labels"`` column is left unscaled.

    All values of ``df`` are passed through ``scaler.transform`` and wrapped
    in a new DataFrame with the same index and columns; the ``"labels"``
    column is then overwritten with the original values from ``df``.  The
    input frame itself is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to scale; must contain a ``"labels"`` column.
    scaler : StandardScaler
        Already-fitted scaler exposing ``transform``.

    Returns
    -------
    pd.DataFrame
        Scaled frame with the original labels.
    """
    transformed = scaler.transform(df.values)
    scaled_df = pd.DataFrame(data=transformed, index=df.index, columns=df.columns)

    # The transform touched every column, so put the original labels back.
    scaled_df["labels"] = df["labels"]
    return scaled_df


def make_train_valid(path, scaler:StandardScaler=None):
    """Load one HDF5 dataset, optionally scale it, and write its train/valid split to disk.

    The dataset at ``path`` is read, scaled with ``scale_dataset`` when a
    scaler is given, split with ``make_train_valid_split_synthetic`` (default
    split ratio), and the two parts are written next to the input file as
    ``<stem>_train.h5`` / ``<stem>_valid.h5`` or, when scaled,
    ``<stem>_standardized_train.h5`` / ``<stem>_standardized_valid.h5``,
    each under the HDF key ``"data"``.

    Parameters
    ----------
    path : str
        Path of the HDF5 file to split.
    scaler : StandardScaler, optional
        Fitted scaler to apply before splitting.  ``None`` (default) writes
        the unscaled data.

    Returns
    -------
    None
    """
    name_placeholder = remove_format(path)

    dset = pd.read_hdf(path)
    # Identity check: ``== None`` would go through ``__eq__`` and is unidiomatic.
    if scaler is not None:
        dset = scale_dataset(dset, scaler)
        scaler_placeholder = 'standardized_'
    else:
        scaler_placeholder = ""

    dset_train, dset_valid = make_train_valid_split_synthetic(dset)

    # mode="w" recreates each output file.  The default append mode reopens an
    # existing file, so re-running the notebook can make the files grow.
    dset_train.to_hdf(f"{name_placeholder}_{scaler_placeholder}train.h5", key="data", mode="w")
    dset_valid.to_hdf(f"{name_placeholder}_{scaler_placeholder}valid.h5", key="data", mode="w")

## Make Unscaled Sequences.

In [23]:
# Write the unscaled train/valid files for the first entry of content_path.
make_train_valid(content_path[0])

In [24]:
# Write the unscaled train/valid files for every style dataset.
for path in style_datasets_path:
    make_train_valid(path)

## Make Scaled Sequences
### Amplitude Dataset. 

In [25]:
# Fit a single scaler on the style datasets only; the content dataset does not
# contribute to the fit.
# NOTE(review): the scaler is fit on the complete files, i.e. before the
# train/valid split, so validation rows influence the statistics -- confirm
# this is intended.
scaler = get_scaler(style_datasets_path)

# Write the standardized train/valid files for the content dataset ...
make_train_valid(content_path[0], scaler=scaler)

# ... and for every style dataset, all using that same scaler.
for path in style_datasets_path:
    make_train_valid(path, scaler=scaler)