In [1]:
import pandas as pd
import numpy as np

# Directory prefix that the data-loading cells prepend to each input CSV file name.
data_path = "./data/"

import warnings
# Ignore every warning for the rest of the session. NOTE(review): this also
# hides pandas deprecation/FutureWarnings, so API removals will go unnoticed.
warnings.filterwarnings(action="ignore")

In [2]:
# Load the signal readings, parse the timestamps and collapse duplicated
# (Turbine_ID, Timestamp) keys with GroupBy.last
signals_training = (
    pd.read_csv(data_path + "wind-farm-1-signals-training.csv", sep=";")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .groupby(['Turbine_ID', 'Timestamp'])
    .last()
    .reset_index()
)
turbines_id = list(signals_training['Turbine_ID'].unique())

# Separate the signal columns by system, based on the column-name prefix
signal_variables = signals_training.columns.to_list()
bearings_variables = [name for name in signal_variables if name.startswith("Gen_Bear")]
generator_variables = [
    name for name in signal_variables
    if name.startswith("Gen") and not name.startswith("Gen_Bear")
]
gearbox_variables = [name for name in signal_variables if name.startswith("Gear")]
transformer_variables = [name for name in signal_variables if name.startswith("HVTrafo")]
hydraulic_variables = [name for name in signal_variables if name.startswith("Hyd")]

# Separate per-system frames: the two key columns plus that system's signals
bearings = signals_training[['Turbine_ID', 'Timestamp', *bearings_variables]]
generators = signals_training[['Turbine_ID', 'Timestamp', *generator_variables]]
gearbox = signals_training[['Turbine_ID', 'Timestamp', *gearbox_variables]]
transformer = signals_training[['Turbine_ID', 'Timestamp', *transformer_variables]]
hydraulic = signals_training[['Turbine_ID', 'Timestamp', *hydraulic_variables]]

In [4]:
# Load the met-mast weather readings, parse the timestamps and collapse
# duplicated timestamps with GroupBy.first
weather_training = (
    pd.read_csv(data_path + "wind-farm-1-metmast-training.csv", sep=";")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .groupby(['Timestamp'])
    .first()
    .reset_index()
)
# Every column except the timestamp key is a weather variable
weather_variables = [name for name in weather_training.columns.to_list() if name != 'Timestamp']

In [5]:
# Read Default Data File
failures = pd.read_csv(data_path + "htw-failures-training.csv")
failures['Timestamp'] = pd.to_datetime(failures['Timestamp'])
# Round failure times to the nearest 10 minutes; they are later matched
# against signal timestamps on exact equality. 'min' is the supported
# frequency alias ('T' is deprecated in recent pandas releases).
failures['Timestamp'] = failures['Timestamp'].dt.round('10min')
# Every row in this file is a recorded default, so it gets a positive label
failures['Label'] = 1

In [7]:
# At the exact time of some defaults, no signal values were available.
# Record the time, turbine and system of those defaults so that they can be
# re-added after the merge instead of being lost.

missing_signals = {
    'GENERATOR': [],
    'HYDRAULIC_GROUP': [],
    'GENERATOR_BEARING': [],
    'TRANSFORMER': [],
    'GEARBOX': []
}

# Every per-system frame is a column subset of signals_training, so all of
# them share its (Turbine_ID, Timestamp) rows. Checking the keys against
# signals_training gives the same answer for every component and keeps this
# cell runnable top-to-bottom (the per-component lookup dict is only defined
# in a later cell).
signal_keys = signals_training[['Turbine_ID', 'Timestamp']].drop_duplicates()

# One left merge with an indicator column replaces a full-frame scan per failure
failure_presence = failures.merge(
    signal_keys,
    how='left',
    on=['Turbine_ID', 'Timestamp'],
    indicator='_signal_match',
)
unmatched_failures = failure_presence[failure_presence['_signal_match'] == 'left_only']

for _, row in unmatched_failures.iterrows():
    idx, time, comp = row['Turbine_ID'], row['Timestamp'], row['Component']
    print(f'The following signal data is missing when defaults happen: {idx}, {time}, {comp}')
    # One-row frame holding only the keys and the label; signal and weather
    # columns are absent here and get filled after it is appended downstream.
    missing_signals[comp].append(pd.DataFrame({'Turbine_ID': idx, 'Timestamp': time, 'Label': 1}, index=[0]))

The following signal data is missing when defaults happen: T11, 2016-03-03 19:00:00+00:00, GENERATOR
The following signal data is missing when defaults happen: T06, 2016-07-11 19:50:00+00:00, GENERATOR
The following signal data is missing when defaults happen: T06, 2017-08-19 09:50:00+00:00, HYDRAULIC_GROUP


In [8]:
# Calculate the lead time of default
def get_Lead_Time(group):
    """Add the next default date and the lead time (in days) to one turbine's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a datetime ``Timestamp`` column and a ``Label`` column in
        which 1 marks a default. Rows are expected to be in chronological
        order, because the next default is found with a backward fill.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with two extra columns: ``Next_Default_Date``
        (timestamp of the first default at or after the row, or a far-future
        sentinel when there is none) and ``Lead_Time`` (whole days between the
        row and ``Next_Default_Date``). The input frame is not modified.
    """
    timestamps = group['Timestamp']

    # Keep the timestamp only on default rows, then propagate it backwards so
    # that each row sees the first default at or after it.
    next_default = timestamps.where(group['Label'] == 1).bfill()

    # Fill rows with no later default with a far-future sentinel date. The
    # sentinel follows the column's timezone so that tz-aware and tz-naive
    # inputs both work.
    infinite_future_date = pd.Timestamp('2262-04-11 00:00:00')
    timezone = timestamps.dt.tz
    if timezone is not None:
        infinite_future_date = infinite_future_date.tz_localize('UTC').tz_convert(timezone)
    # Reassign instead of a chained inplace fillna, which does not update the
    # frame under pandas Copy-on-Write.
    next_default = next_default.fillna(infinite_future_date)

    result = group.copy()
    result['Next_Default_Date'] = next_default
    result['Lead_Time'] = (next_default - timestamps).dt.days
    return result

In [6]:
# Combine the signals, weather and default data per system:
#   1. left-join weather readings and default labels onto the signal rows
#   2. append the defaults that had no matching signal row
#   3. forward-fill missing values within each turbine
#   4. compute the lead time and whether a default occurs within 60 days

signal_dic = {
    'GENERATOR': generators,
    'HYDRAULIC_GROUP': hydraulic,
    'GENERATOR_BEARING': bearings,
    'TRANSFORMER': transformer,
    'GEARBOX': gearbox
}

processed = {}

for comp, data in signal_dic.items():
    comp_failures = failures.loc[
        failures['Component'] == comp, ['Turbine_ID', 'Timestamp', 'Label']
    ]
    temp = (
        data
        .merge(weather_training, how='left', on='Timestamp')
        .merge(comp_failures, how='left', on=['Turbine_ID', 'Timestamp'])
    )

    # Append all recorded missing-signal defaults in one concat rather than
    # growing the frame one row at a time.
    temp = pd.concat([temp, *missing_signals[comp]])

    # ignore_index gives a unique index again after the appended rows, which
    # the column assignment below relies on for alignment.
    temp = temp.sort_values(by=['Turbine_ID', 'Timestamp'], ignore_index=True)

    # Rows without a matching default are negatives. Reassign rather than use
    # a chained inplace fillna, which does not update the frame under
    # pandas Copy-on-Write.
    temp['Label'] = temp['Label'].fillna(0)

    # Forward-fill within each turbine only. GroupBy.ffill omits the grouping
    # column, so the filled columns are written back in place.
    fill_columns = temp.columns.drop('Turbine_ID').tolist()
    temp[fill_columns] = temp.groupby('Turbine_ID')[fill_columns].ffill()

    # Iterate over the groups explicitly so each group keeps its Turbine_ID
    # column on every pandas version; groups come back sorted by Turbine_ID.
    temp = pd.concat(
        [get_Lead_Time(group) for _, group in temp.groupby('Turbine_ID')],
        ignore_index=True,
    )
    temp['Default_in_60'] = temp['Lead_Time'] <= 60
    processed[comp] = temp

In [10]:
# Sanity check: every processed frame must be completely free of NaN values
for comp, data in processed.items():
    assert not data.isna().any().any()

In [11]:
# Write one CSV per system, named after the component, without the index column
save_data_path = "./data/"
for comp, data in processed.items():
    output_file = save_data_path + comp + "_processed.csv"
    data.to_csv(output_file, index=False)