In [3]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd

# Folder holding the raw CSV inputs; file paths below are built by plain
# string concatenation, so the trailing slash is required.
data_path = "./data/"

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

In [4]:
# Read Signals Data File
# Parse the timestamps and keep one row per (turbine, timestamp); when a key
# repeats, groupby().last() keeps the last non-null value of each column.
signals_training = (
    pd.read_csv(data_path + "wind-farm-1-signals-training.csv", sep=";")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .groupby(['Turbine_ID', 'Timestamp'])
    .last()
    .reset_index()
)
turbines_id = list(signals_training['Turbine_ID'].unique())

# Separate Systems: assign signal columns to a subsystem by column-name prefix
signal_variables = signals_training.columns.to_list()
bearings_variables = [name for name in signal_variables if name.startswith("Gen_Bear")]
# "Gen_Bear*" columns belong to the bearings, so they are excluded from the generator
generator_variables = [
    name for name in signal_variables
    if name.startswith("Gen") and not name.startswith("Gen_Bear")
]
gearbox_variables = [name for name in signal_variables if name.startswith("Gear")]
transformer_variables = [name for name in signal_variables if name.startswith("HVTrafo")]
hydraulic_variables = [name for name in signal_variables if name.startswith("Hyd")]

# Separate System Signals: one frame per subsystem, each keeping the key columns
id_columns = ['Turbine_ID', 'Timestamp']
bearings = signals_training[id_columns + bearings_variables]
generators = signals_training[id_columns + generator_variables]
gearbox = signals_training[id_columns + gearbox_variables]
transformer = signals_training[id_columns + transformer_variables]
hydraulic = signals_training[id_columns + hydraulic_variables]

In [5]:
# Read Weather Data File
# One row per timestamp; for repeated timestamps groupby().first() keeps the
# first non-null value of each column.
weather_training = (
    pd.read_csv(data_path + "wind-farm-1-metmast-training.csv", sep=";")
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .groupby(['Timestamp'])
    .first()
    .reset_index()
)
# Every column except the key is a weather feature
weather_variables = [name for name in weather_training.columns if name != 'Timestamp']

In [4]:
# Read Default Data File
failures = pd.read_csv(data_path + "htw-failures-training.csv")
failures['Timestamp'] = pd.to_datetime(failures['Timestamp'])
# Snap each failure time to the nearest 10 minutes so it can be compared with
# the signal timestamps (presumably the signal sampling interval -- confirm).
# 'min' replaces the deprecated 'T' offset alias; the rounding is identical.
failures['Timestamp'] = failures['Timestamp'].dt.round('10min')
# Every row of this file is a recorded failure, i.e. a positive label
failures['Label'] = 1

In [5]:
"""
    Some defaults happen at a time for which the affected system has no signal row.
    A left merge from the signals would silently lose those defaults, so we record
    their turbine, time and system here and append them after the merge.
"""

# Component name (as spelled in the failures file) -> signal frame of that system
signal_dic = {
    'GENERATOR': generators,
    'HYDRAULIC_GROUP': hydraulic,
    'GENERATOR_BEARING': bearings,
    'TRANSFORMER': transformer,
    'GEARBOX': gearbox
}

# Component name -> list of one-row frames (Turbine_ID, Timestamp, Label=1)
missing_signals = {component: [] for component in signal_dic}

failure_records = zip(failures['Turbine_ID'], failures['Timestamp'], failures['Component'])
for idx, time, comp in failure_records:
    system_signals = signal_dic[comp]
    same_turbine = system_signals['Turbine_ID'] == idx
    same_time = system_signals['Timestamp'] == time
    if not (same_turbine & same_time).any():
        print(f'The following signal data is missing when defaults happen: {idx}, {time}, {comp}')
        placeholder = pd.DataFrame({'Turbine_ID': idx, 'Timestamp': time, 'Label': 1}, index=[0])
        missing_signals[comp].append(placeholder)

The following signal data is missing when defaults happen: T11, 2016-03-03 19:00:00+00:00, GENERATOR
The following signal data is missing when defaults happen: T06, 2016-07-11 19:50:00+00:00, GENERATOR
The following signal data is missing when defaults happen: T06, 2017-08-19 09:50:00+00:00, HYDRAULIC_GROUP


In [6]:
# Calculate the lead time of default
def get_Lead_Time(group):
    """Add the date of the next default and the lead time (in days) to it.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single turbine, ordered by time, with a datetime column
        'Timestamp' (tz-aware or tz-naive) and a 'Label' column equal to 1 on
        rows where a default occurred.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with two extra columns appended, in this order:
        'Next_Default_Date' (timestamp of the first default at or after the
        row, or a far-future sentinel when there is none) and 'Lead_Time'
        (whole days between the row and 'Next_Default_Date').
    """
    # Work on a copy so the caller's frame (e.g. a groupby chunk) is untouched.
    group = group.copy()
    timestamps = group['Timestamp']

    # Keep the timestamp only on default rows, then pull it backwards in time
    # so every earlier row sees the upcoming default.
    next_default = timestamps.where(group['Label'] == 1).bfill()

    # Rows after the last default have nothing to back-fill from. Use a date
    # near the upper bound of nanosecond timestamps, in the same timezone as
    # the data so the subtraction below is valid for naive and aware input.
    infinite_future_date = pd.Timestamp('2262-04-11 00:00:00', tz=timestamps.dt.tz)
    group['Next_Default_Date'] = next_default.fillna(infinite_future_date)

    group['Lead_Time'] = (group['Next_Default_Date'] - timestamps).dt.days
    return group

In [7]:
"""
    Combine the signals, weather and default data
    Append the missing data recorded
    Fillna using forward fill
    Calculate lead time and whether there is default in 60 days
"""

processed = {}

for comp, data in signal_dic.items():
    comp_failures = failures.loc[failures['Component'] == comp, ['Turbine_ID', 'Timestamp', 'Label']]
    temp = (
        data
        .merge(weather_training, how='left', on='Timestamp')
        .merge(comp_failures, how='left', on=['Turbine_ID', 'Timestamp'])
    )

    # Append the defaults that have no signal row in a single concat; their
    # feature columns start as NaN and are forward-filled below.
    temp = pd.concat([temp, *missing_signals[comp]])
    temp = temp.sort_values(by=['Turbine_ID', 'Timestamp']).reset_index(drop=True)

    # Label must be completed BEFORE the forward fill, otherwise a default (1)
    # would be propagated onto later rows. Assign the result back: a chained
    # `temp.Label.fillna(..., inplace=True)` is not guaranteed to update `temp`.
    temp['Label'] = temp['Label'].fillna(0)

    # Forward-fill features within each turbine only. The key columns hold no
    # missing values, so they are left out of the fill.
    fill_columns = [name for name in temp.columns if name not in ('Turbine_ID', 'Timestamp')]
    temp[fill_columns] = temp.groupby('Turbine_ID')[fill_columns].ffill()

    # Lead time per turbine. Iterating the groups keeps 'Turbine_ID' in each
    # chunk regardless of how groupby.apply treats grouping columns.
    temp = pd.concat(
        [get_Lead_Time(group) for _, group in temp.groupby('Turbine_ID')],
        ignore_index=True,
    )
    temp['Default_in_60'] = temp['Lead_Time'] <= 60
    processed[comp] = temp

In [8]:
def remove_last_negative(group):
    """Drop the trailing negative rows whose label cannot be verified.

    Looking backwards from the most recent row, the first row that either has
    'Default_in_60' set or lies at least 60 days before the most recent
    timestamp is the cut point; that row and everything older is kept,
    everything newer is dropped.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single turbine with a datetime 'Timestamp' column and a
        boolean 'Default_in_60' column. Column order does not matter.

    Returns
    -------
    pandas.DataFrame
        The kept rows sorted by 'Timestamp' with a fresh 0..n-1 index. The
        result is empty when the input is empty or when no row qualifies as a
        cut point (all rows are negative and within 60 days of the end).
    """
    ordered = group.sort_values(by='Timestamp').reset_index(drop=True)
    if ordered.empty:
        return ordered

    # Look the column up by name; its position depends on how the caller built
    # (or groupby sliced) the frame.
    last_timestamp = ordered['Timestamp'].max()
    age_in_days = (last_timestamp - ordered['Timestamp']).dt.days

    # A row can serve as the cut point if a default is known to follow it, or
    # if it is old enough that 60 further days of data were observed after it.
    can_anchor = ordered['Default_in_60'].astype(bool) | (age_in_days >= 60)
    if not can_anchor.any():
        # Nothing verifiable in this group: every row is part of the tail.
        return ordered.iloc[0:0]

    # Index labels equal positions after reset_index, so the last True label
    # is the position of the most recent anchor row.
    cut = can_anchor[can_anchor].index[-1]
    return ordered.iloc[:cut + 1]

In [9]:
# Trim the tail of every turbine's history, one system frame at a time
for comp in list(processed):
    per_turbine = processed[comp].groupby('Turbine_ID')
    processed[comp] = per_turbine.apply(remove_last_negative).reset_index(drop=True)

In [10]:
# Every processed frame must be free of missing values before anything is written
for data in processed.values():
    missing_cells = data.isna().sum().sum()
    assert missing_cells == 0

# Save data files: one CSV per system, named after the component
save_data_path = "./data/"
for comp, data in processed.items():
    target_file = save_data_path + comp + "_processed.csv"
    data.to_csv(target_file, index=False)

In [11]:
"""
    Under the assumption that subsystems might not be independent of each other
    Signal data from one subsystem might be helpful in detecting defaults in another system
    We thus create an all signals data file that combines all data
"""
key_columns = ['Turbine_ID', 'Timestamp']
systems = ['GENERATOR', 'HYDRAULIC_GROUP', 'GENERATOR_BEARING', 'TRANSFORMER', 'GEARBOX']
# The four target columns every processed frame ends with
target_columns = ['Label', 'Next_Default_Date', 'Lead_Time', 'Default_in_60']


def _with_system_suffix(frame, system):
    """Return a copy of `frame` whose target columns carry a `_<system>` suffix.

    The frames stored in `processed` are left untouched, so this cell can be
    re-run without the suffix being appended a second time.
    """
    return frame.rename(columns={name: f"{name}_{system}" for name in target_columns})


# Every (turbine, timestamp) pair seen in any system frame or in the failure log,
# gathered with a single concat instead of growing a frame inside a loop.
all_time = pd.concat(
    [data[key_columns] for data in processed.values()] + [failures[key_columns]]
).drop_duplicates()

# The generator frame also contributes the weather columns
temp = pd.merge(
    all_time,
    _with_system_suffix(processed['GENERATOR'], 'GENERATOR'),
    how="left",
    on=key_columns,
)

for key in systems[1:]:
    # Weather is already present via the generator frame; drop it here to avoid
    # duplicated columns. `temp` holds exactly the keys of `all_time`, so the
    # system frame can be left-joined onto it directly.
    df = _with_system_suffix(processed[key], key).drop(columns=weather_variables)
    temp = pd.merge(temp, df, how="left", on=key_columns)

# Column layout: keys, signals grouped by system, weather, then the suffixed targets
feat = temp.columns.to_list()
filtered_feat = [f for f in feat if f not in signal_variables and f not in weather_variables]
temp = temp[['Turbine_ID', 'Timestamp'] + generator_variables + hydraulic_variables + bearings_variables + \
            transformer_variables + gearbox_variables + weather_variables + filtered_feat]

def fill_next_default_nan(group):
    """Complete the per-system target columns of one turbine.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single turbine with a 'Timestamp' column and, for each of
        the five systems, 'Label_<system>' and 'Next_Default_Date_<system>'
        columns that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        A time-sorted copy of ``group`` in which, per system, missing labels
        are 0, 'Next_Default_Date' is back-filled from later rows, and
        'Lead_Time' / 'Default_in_60' are recomputed from it. Rows with no
        later default date keep a missing 'Next_Default_Date' and 'Lead_Time'
        and get ``Default_in_60 == False``.
    """
    # sort_values without inplace returns a new frame, so the caller's group
    # (e.g. a groupby chunk) is not modified.
    group = group.sort_values(by='Timestamp')
    for label in ['GENERATOR', 'HYDRAULIC_GROUP', 'GENERATOR_BEARING', 'TRANSFORMER', 'GEARBOX']:
        group['Label_' + label] = group['Label_' + label].fillna(0)
        group['Next_Default_Date_' + label] = group['Next_Default_Date_' + label].bfill()
        group['Lead_Time_' + label] = (group['Next_Default_Date_' + label] - group['Timestamp']).dt.days
        group['Default_in_60_' + label] = group['Lead_Time_' + label] <= 60
    return group

# Complete the target columns turbine by turbine. Iterating the groups keeps
# 'Turbine_ID' in each chunk regardless of how groupby.apply treats grouping columns.
temp = pd.concat(
    [fill_next_default_nan(group) for _, group in temp.groupby('Turbine_ID')],
    ignore_index=True,
)

# Forward-fill whatever is still missing, within each turbine only. The key
# columns hold no missing values, so they are left out of the fill.
temp = temp.sort_values(by=['Turbine_ID', 'Timestamp']).reset_index(drop=True)
fill_columns = [name for name in temp.columns if name not in ('Turbine_ID', 'Timestamp')]
temp[fill_columns] = temp.groupby('Turbine_ID')[fill_columns].ffill()

assert(temp.isna().sum().sum() == 0)
temp.to_csv("./data/all_signals_processed.csv", index=False)

In [12]:
# The combined frame must carry exactly one positive label per recorded failure
for label in ['GENERATOR', 'HYDRAULIC_GROUP', 'GENERATOR_BEARING', 'TRANSFORMER', 'GEARBOX']:
    labelled_rows = sum(temp[f'Label_{label}'])
    recorded_failures = sum(failures['Component'] == label)
    assert labelled_rows == recorded_failures

In [13]:
# Truncate every timestamp to the start of its hour, as a tz-naive wall-clock
# value. This is the same result the previous row-by-row construction of
# pd.Timestamp(year, month, day, hour) produced, but computed in one vectorized
# pass and without the temporary 'Date' / 'Hour' columns.
temp['Timestamp'] = temp['Timestamp'].dt.tz_localize(None).dt.floor('h')

In [14]:
# Collapse the rows of each (turbine, hour): features are averaged, while the
# trailing 20 target columns (5 systems x 4 columns) take their maximum.
hourly_keys = ["Turbine_ID", "Timestamp"]
target_block = temp.iloc[:, -20:]
key_block = temp.iloc[:, :2]

feat_mean = temp.iloc[:, :-20].groupby(by=hourly_keys).mean().reset_index()
label_max = (
    pd.concat([key_block, target_block], axis=1)
    .groupby(by=hourly_keys)
    .max()
    .reset_index()
)
df_hourly = feat_mean.merge(label_max, how="left", on=hourly_keys)

In [15]:
def get_Lead_Time_hourly(group):
    """Recompute the per-system targets of one turbine from its labels.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single turbine, ordered by time, with a datetime 'Timestamp'
        column (tz-naive or tz-aware) and one 'Label_<system>' column per
        system that equals 1 on rows where that system defaulted.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` where, for every system, 'Next_Default_Date_<system>'
        is the timestamp of the first default at or after the row (a far-future
        sentinel when there is none), 'Lead_Time_<system>' is the number of
        whole days until it, and 'Default_in_60_<system>' flags lead times of
        at most 60 days.
    """
    # Work on a copy so the caller's frame (e.g. a groupby chunk) is untouched.
    group = group.copy()
    timestamps = group['Timestamp']

    # Sentinel for "no default ahead": near the upper bound of nanosecond
    # timestamps, in the same timezone as the data so subtraction is valid.
    infinite_future_date = pd.Timestamp('2262-04-11 00:00:00', tz=timestamps.dt.tz)

    for system in ['GENERATOR', 'HYDRAULIC_GROUP', 'GENERATOR_BEARING', 'TRANSFORMER', 'GEARBOX']:
        # Keep the timestamp only on default rows, then pull it backwards in time
        next_default = timestamps.where(group[f'Label_{system}'] == 1).bfill()
        group[f'Next_Default_Date_{system}'] = next_default.fillna(infinite_future_date)

        group[f'Lead_Time_{system}'] = (group[f'Next_Default_Date_{system}'] - timestamps).dt.days
        group[f'Default_in_60_{system}'] = group[f'Lead_Time_{system}'] <= 60
    return group

In [16]:
# Rebuild the targets on the hourly grid, one turbine at a time
hourly_by_turbine = df_hourly.groupby(by="Turbine_ID")
df_hourly = hourly_by_turbine.apply(get_Lead_Time_hourly).reset_index(drop=True)

In [18]:
# The hourly frame must be complete before it is written to disk
remaining_nans = df_hourly.isna().sum().sum()
assert remaining_nans == 0
df_hourly.to_csv("./data/all_signals_hourly_processed.csv", index=False)

In [12]:
# Read the testing signals from the same folder as the training files
# (data_path) and keep one row per (turbine, timestamp).
signals_testing = pd.read_csv(data_path + "wind-farm-1-signals-testing.csv", sep=";")
signals_testing['Timestamp'] = pd.to_datetime(signals_testing['Timestamp'])
signals_testing = signals_testing.groupby(['Turbine_ID', 'Timestamp']).last().reset_index()

# Read the testing weather (metmast) data, one row per timestamp.
# NOTE(review): duplicates are resolved with .last() here, while the training
# metmast file is de-duplicated with .first() -- confirm which one is intended
# so that train and test are preprocessed identically.
weather_testing = pd.read_csv(data_path + "wind-farm-1-metmast-testing.csv", sep=";")
weather_testing['Timestamp'] = pd.to_datetime(weather_testing['Timestamp'])
weather_testing = weather_testing.groupby(['Timestamp']).last().reset_index()

In [13]:
# Earliest timestamp present in the testing signals
test_timestamps = signals_testing['Timestamp']
test_start_time = test_timestamps.min()
print(f"Test Set Start Time: {test_start_time}")

Test Set Start Time: 2017-09-01 00:10:00+00:00


In [14]:
# Prepend the last 14 days of training data to the testing data
# (presumably as warm-up history for downstream features -- confirm).
history_start = test_start_time - np.timedelta64(14, 'D')

extra_signals = signals_training[signals_training['Timestamp'] >= history_start]
signals_testing = pd.concat([extra_signals, signals_testing])

extra_weather = weather_training[weather_training['Timestamp'] >= history_start]
weather_testing = pd.concat([extra_weather, weather_testing])
# Forward-fill weather gaps in time order; .ffill() replaces the deprecated
# fillna(method="ffill") spelling and fills identically.
weather_testing = weather_testing.sort_values(by="Timestamp").ffill()

In [15]:
# Attach the weather readings to every signal row; no cell may be left empty
features_testing = signals_testing.merge(weather_testing, how="left", on='Timestamp')
assert not features_testing.isna().any().any()

In [16]:
# Truncate every timestamp to the start of its hour, as a tz-naive wall-clock
# value. This is the same result the previous row-by-row construction of
# pd.Timestamp(year, month, day, hour) produced, but computed in one vectorized
# pass and without the temporary 'Date' / 'Hour' columns.
features_testing['Timestamp'] = features_testing['Timestamp'].dt.tz_localize(None).dt.floor('h')

In [17]:
# Average all readings that fall into the same (turbine, hour), then persist
hourly_test_keys = ["Turbine_ID", "Timestamp"]
features_testing = features_testing.groupby(by=hourly_test_keys).mean().reset_index()

unfilled_cells = features_testing.isna().sum().sum()
assert unfilled_cells == 0
features_testing.to_csv("./data/test_signals_hourly_processed.csv", index=False)