In [None]:
import os
import pickle
from os import walk

import pandas as pd
from sklearn import model_selection, preprocessing

In [None]:
# Choose location for SCADA and Acceleration data
loc = 'bbk05' # location

# Load wave data: every file found in `path` is read as a tab-separated table
# and contributes its columns to the combined wave frame.
path = 'DATA/Westhinder20180101_20191231'
# walk() yields nothing when `path` does not exist; the default tuple turns that
# into an empty file list instead of a StopIteration.
# Sorted so the wave column order does not depend on the directory listing order
# (later cells select input/output columns by position).
filenames = sorted(next(walk(path), (None, None, []))[2])
if not filenames:
    raise FileNotFoundError('No wave data files found in ' + path)
li = []
for f in filenames:
    # os.path.join instead of a hard-coded '\\' separator, which only works on Windows
    df_part = pd.read_csv(os.path.join(path, f), delimiter="\t")
    # The first column holds the timestamps, whatever its header is called in the file
    df_part = df_part.rename(columns={df_part.columns[0]: 'timestamp'})
    df_part['timestamp'] = pd.to_datetime(df_part['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')
    li.append(df_part.set_index('timestamp'))
# Column-wise concatenation; join='inner' keeps only timestamps present in every file
df_wave_nw = pd.concat(li, join='inner', axis=1)
# Resampling from 30 min to 10 min; the empty 10-min slots are filled by interpolate()
df_wave_nw = df_wave_nw.resample('10min').mean().interpolate()

# Load SCADA (depends on location)
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust
df = pd.read_pickle('DATA/'+loc)

# Load Acceleration and join with SCADA
df_acc = pd.read_pickle('DATA/'+loc+'_acc')
# remove the column with location (cannot have overlapped columns with the joining one)
df_acc = df_acc.drop(columns="location")
df = df_acc.join(df)

# Join SCADA(+Acceleration) and wave data
df = df_wave_nw.join(df)
df = df.drop(columns="location")

# remove invalid data: drop every row that has at least one missing value
df = df.dropna(how='any',axis=0)
nsamp = len(df)
print(nsamp)


In [None]:
# Save the normalizer, only for the fleet-leader
if loc=='bbg10':
    Input = df.iloc[:,0:-2]  # The last two columns are DEMs
    # Fit the standardizer on all input columns of the fleet-leader data
    std_scaler = preprocessing.StandardScaler().fit(Input)
    # Save the data normalizer; the `with` block closes the file even if dump raises.
    # The scaler is dumped directly rather than through an alias named `object`,
    # which would shadow the builtin for the rest of the session.
    with open('Weights/Norm', 'wb') as filehandler:
        pickle.dump(std_scaler, filehandler)

In [None]:
# Save the full input/output files for test. This runs for every location
# (the fleet-leader included); extrapolated turbines only get these two files.
index = df.columns
df_input = df[index[0:-2]]   # every column except the last two
df_output = df[index[-2:]]   # the last two columns
df_input.to_pickle('DATA/'+loc+'df_input')
df_output.to_pickle('DATA/'+loc+'df_output')

# Fixed seed so the shuffled train/test split is identical on every run
SPLIT_SEED = 42

# For fleet-leader, split input/output files for train and test and save as separate files
if loc=='bbg10':
    # df_input / df_output from above are reused; 25 % of the rows go to the test set
    train_input, test_input, train_output, test_output = model_selection.train_test_split(
        df_input, df_output, test_size=0.25, shuffle=True, random_state=SPLIT_SEED)
    # The split shuffles the rows; restore index order within each subset
    train_input = train_input.sort_index()
    train_output = train_output.sort_index()
    test_input = test_input.sort_index()
    test_output = test_output.sort_index()

    train_input.to_pickle('DATA/'+'train_input')
    train_output.to_pickle('DATA/'+'train_output')
    test_input.to_pickle('DATA/'+'test_input')
    test_output.to_pickle('DATA/'+'test_output')