In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
style_datasets_path = [
            "data/simulated_dataset/amplitude_shift/1.0_1.0.h5", 
            "data/simulated_dataset/amplitude_shift/2.0_2.0.h5", 
            "data/simulated_dataset/amplitude_shift/3.0_3.0.h5", 
            "data/simulated_dataset/amplitude_shift/4.0_4.0.h5", 
            "data/simulated_dataset/amplitude_shift/5.0_5.0.h5", 
            "data/simulated_dataset/amplitude_shift/6.0_6.0.h5", 
            "data/simulated_dataset/amplitude_shift/7.0_7.0.h5" , 
            "data/simulated_dataset/amplitude_shift/8.0_8.0.h5" , 
            "data/simulated_dataset/amplitude_shift/9.0_9.0.h5" , 
            "data/simulated_dataset/amplitude_shift/10.0_10.0.h5",
            
            "data/simulated_dataset/output_noise/0.25.h5",
            "data/simulated_dataset/output_noise/0.50.h5",
            "data/simulated_dataset/output_noise/0.75.h5",
            "data/simulated_dataset/output_noise/1.00.h5",
            "data/simulated_dataset/output_noise/1.25.h5",
            "data/simulated_dataset/output_noise/1.50.h5",
            "data/simulated_dataset/output_noise/1.75.h5",
            "data/simulated_dataset/output_noise/2.00.h5",
            "data/simulated_dataset/output_noise/2.25.h5",
            "data/simulated_dataset/output_noise/2.50.h5",
             
            "data/simulated_dataset/time_shift/0.h5",
            "data/simulated_dataset/time_shift/2.h5",
            "data/simulated_dataset/time_shift/4.h5",
            "data/simulated_dataset/time_shift/6.h5",
            "data/simulated_dataset/time_shift/8.h5",
            "data/simulated_dataset/time_shift/10.h5",
            "data/simulated_dataset/time_shift/12.h5",
            "data/simulated_dataset/time_shift/14.h5",
            "data/simulated_dataset/time_shift/16.h5",
            "data/simulated_dataset/time_shift/18.h5"
            ]

content_path = [
    "data/simulated_dataset/01 - Source Domain.h5"
]

In [3]:
def remove_format(path:str):
    return ".".join(path.split('.')[:-1])

def make_train_valid_split_synthetic(dataset:pd.DataFrame, train_split:float=0.8):
    # Pour les labels dans les parametres. 
    # Mettre une partie dans Validation set
    # et le retirer du trainset.
    
    shape = dataset.shape
    train_end = int(shape[0]*train_split)
        
    train_dataset = dataset[:train_end]
    valid_dataset = dataset[train_end:]
    
    return train_dataset, valid_dataset

In [4]:
def get_scaler(dsets:list):
    dsets = [pd.read_hdf(d) for d in dsets]
    dsets = pd.concat(dsets)
        
    scaler = StandardScaler()
    scaler.fit(dsets.values)
    return scaler


def scale_dataset(df:pd.DataFrame, scaler:StandardScaler):
    _data = df.values
    scaled_data = scaler.transform(_data)
    _df =  pd.DataFrame(scaled_data, df.index, df.columns)
    # replacing labels by the unscaled labels.
    _df["labels"] = df['labels']
    return _df


def make_train_valid(path, scaler:StandardScaler=None):
    name_placeholder = remove_format(path)
    scaler_placeholder = ""
    
    dset = pd.read_hdf(path)
    if not scaler == None:
        dset = scale_dataset(dset, scaler)
        scaler_placeholder = 'standardized_'
        
    
    dset_train, dset_valid = make_train_valid_split_synthetic(dset)
    
    dset_train.to_hdf(f"{name_placeholder}_{scaler_placeholder}train.h5", key="data")
    dset_valid.to_hdf(f"{name_placeholder}_{scaler_placeholder}valid.h5", key="data")

## Make Unscaled Sequences.

In [5]:
make_train_valid(content_path[0])

In [6]:
for path in style_datasets_path:
    make_train_valid(path)

## Make Scaled Sequeces
### Amplitude Dataset. 

In [7]:
scaler = get_scaler(style_datasets_path)

make_train_valid(content_path[0], scaler=scaler)

for path in style_datasets_path:
    make_train_valid(path, scaler=scaler)

In [8]:
pd.read_hdf("data/simulated_dataset/01 - Source Domain_train.h5").head()

Unnamed: 0,in_c1,in_c2,out_c1,out_c2,out_c3,out_c4,material velocity,labels
0,9.918148,10.065971,3.115878,3.162318,0.0,0.0,0.1,2.0
1,10.901223,11.493444,4.705588,5.506856,1.18097,0.590485,0.100942,2.0
2,12.530237,12.294746,5.879032,7.160271,2.728799,1.3644,0.101884,2.0
3,13.718603,13.854074,6.74434,8.645757,4.312329,2.156164,0.102823,2.0
4,13.871663,14.541064,7.150464,9.748231,5.823211,2.911605,0.10376,2.0


In [9]:
pd.read_hdf("data/simulated_dataset/01 - Source Domain_scaled_train.h5").head()

Unnamed: 0,in_c1,in_c2,out_c1,out_c2,out_c3,out_c4,material velocity,labels
0,0.469816,0.47753,0.421769,0.339022,0.228026,0.329764,0.5,2.0
1,0.516383,0.54525,0.474584,0.41707,0.261976,0.350526,0.515705,2.0
2,0.593548,0.583263,0.51357,0.47211,0.306472,0.377737,0.531395,2.0
3,0.64984,0.657238,0.542318,0.521561,0.351995,0.405575,0.547054,2.0
4,0.657091,0.689829,0.555811,0.558261,0.395429,0.432137,0.562667,2.0
