# **Import dependencies**

In [None]:
import os

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from flasc.dataframe_operations import (
    dataframe_filtering as dff,
    dataframe_manipulations as dfm,
)
from flasc.turbine_analysis.find_sensor_faults import filter_sensor_faults
from flasc.turbine_analysis import ws_pow_filtering as wspf

from {{cookiecutter.project_slug}}.models import load_floris

In [None]:
# User settings
# save_figures: when True, the basic-filter figures are written to disk and a
#   figure output directory is passed on to the later filtering steps.
# plot_figures_in_notebook: when False, the basic-filter figures are closed
#   after saving; the flag is also passed as `plot_figures` to the later
#   filtering steps.
save_figures = True
plot_figures_in_notebook = True

# **Step 0**: Initial data pulldown
First, we import the data from the common_windfarm_information folder. Loading may take a while, so the raw data is stored in df_scada_raw and df_metmast_raw, and these two variables are left unmodified throughout the rest of the script.

In [None]:
def load_data():
    """Read the raw SCADA and met mast demo datasets from disk.

    The .csv files are looked up in the "common_windfarm_information" folder
    located two levels above the current working directory. The first column
    of each file (the saved row index) is discarded.

    Returns:
        tuple: (df_scada_raw, df_metmast_raw), the two dataframes as read
            from file, without their first column.
    """
    source_path = os.path.join(
        os.getcwd(), "..", "..", "common_windfarm_information"
    )

    def _read_without_index_column(filename):
        # The first column only holds the row indices written by to_csv()
        frame = pd.read_csv(os.path.join(source_path, filename))
        return frame.drop(columns=frame.columns[0])

    df_scada_raw = _read_without_index_column("demo_dataset_scada_600s.csv")
    df_metmast_raw = _read_without_index_column("demo_dataset_metmast_600s.csv")

    scada_columns = list(df_scada_raw.columns)
    print("Columns available in df_scada_raw: {}.".format(scada_columns))
    return df_scada_raw, df_metmast_raw

# Load the raw datasets once; these two variables serve as the untouched
# reference copies for the rest of the notebook.
df_scada_raw, df_metmast_raw = load_data()

# **Step 1**: Format to common FLASC format
Now create a copy of df_scada_raw and df_metmast_raw which we can manipulate and filter.

In [None]:
# Now make a copy of the raw data files for processing and manipulation,
# so that df_scada_raw and df_metmast_raw themselves stay unchanged
df_scada = df_scada_raw.copy()
df_metmast = df_metmast_raw.copy()

Format df_scada to match the common FLASC format. For example, wind speeds are stored in columns named ws_{ti}, where {ti} is the zero-based turbine number padded with leading zeros. Hence, the wind speed of the third turbine is stored in ws_002, and the power production of the thirteenth turbine is stored in pow_012.

In [None]:
def format_dataframes(df_scada, df_metmast):
    """Convert the SCADA and met mast dataframes to the common FLASC format.

    The "time" column of both dataframes is parsed to timestamps, the
    turbine-specific SCADA columns are renamed to zero-based, zero-padded
    column names (e.g., "pow_000", "ws_000"), both dataframes are passed
    through dfm.df_reduce_precision(), and the SCADA data is sorted by time.
    As a side effect, the list of turbine names (ordered by FLORIS turbine
    index) is written to "postprocessed/turbine_names.csv" in the current
    working directory.

    Note that the "time" columns are converted in place, i.e., the
    dataframes passed in are modified.

    Args:
        df_scada: SCADA dataframe with a "time" column and per-turbine
            columns named "ActivePower_<name>", "NacWSpeed_<name>",
            "NacTI_<name>", "NacWDir_<name>" and
            "is_operation_normal_<name>".
        df_metmast: Met mast dataframe with a "time" column.

    Returns:
        tuple: (df_scada, df_metmast), the formatted dataframes.
    """
    # Format columns and data. The operations required differ per dataset.
    df_scada["time"] = pd.to_datetime(df_scada["time"])  # Convert strings to timestamps
    df_metmast["time"] = pd.to_datetime(df_metmast["time"])  # Convert strings to timestamps

    # In FLORIS, turbines are numbered from 0 to nturbs - 1. In SCADA data,
    # turbines often have a different name. We save the mapping between
    # the turbine indices in FLORIS and the turbine names to a separate .csv
    # file.
    root_path = os.getcwd()
    out_path = os.path.join(root_path, "postprocessed")
    os.makedirs(out_path, exist_ok=True)
    turbine_names = ["A1", "A2", "A3", "B1", "B2", "C1", "C2"]
    pd.DataFrame({"turbine_names": turbine_names}).to_csv(
        os.path.join(out_path, "turbine_names.csv")
    )

    # Now map columns to conventional format: the position of a turbine in
    # turbine_names is its (zero-based) turbine index.
    scada_dict = {}
    for ii, tn in enumerate(turbine_names):
        scada_dict.update(
            {
                "ActivePower_{:s}".format(tn): "pow_{:03d}".format(ii),
                "NacWSpeed_{:s}".format(tn): "ws_{:03d}".format(ii),
                "NacTI_{:s}".format(tn): "ti_{:03d}".format(ii),
                "NacWDir_{:s}".format(tn): "wd_{:03d}".format(ii),
                "is_operation_normal_{:s}".format(tn): "is_operation_normal_{:03d}".format(ii),
            }
        )

    print("formatting dataframe...")
    df_scada = df_scada.rename(columns=scada_dict)

    # Reduce precision in dataframe to use half of the memory
    df_scada = dfm.df_reduce_precision(df_scada, verbose=True)
    df_metmast = dfm.df_reduce_precision(df_metmast, verbose=True)

    # Sort the SCADA dataframe chronologically and renumber its rows
    df_scada = df_scada.sort_values(axis=0, by="time")
    df_scada = df_scada.reset_index(drop=True)
    print("Columns available in df_scada: {}.".format(list(df_scada.columns)))

    return df_scada, df_metmast

# Show the column names before formatting, then convert both dataframes
print(df_scada.columns)
df_scada, df_metmast = format_dataframes(df_scada, df_metmast)

# **Step 2**: Remove outliers using basic logic
We remove outliers from the data using simple logic rules, e.g., wind speeds at or below 0 m/s, power production at or below 0 W, and measurements flagged as abnormal by the turbine self-status flags that are provided with the data.

In [None]:
def remove_outliers(df_scada):
    """Mark obviously wrong measurements as faulty using simple rules.

    For every turbine, a measurement is marked as faulty if the turbine's
    "is_operation_normal_XXX" flag is False, if its wind speed is at or
    below zero, or if its power is at or below zero. A figure highlighting
    the affected data is produced per turbine. Afterwards, the
    "is_operation_normal_XXX" columns are dropped from the dataframe.

    This function reads the module-level settings `save_figures` (save the
    figures to "postprocessed/figures/02_basic_filters" in the current
    working directory) and `plot_figures_in_notebook` (if False, each figure
    is closed once it is no longer needed).

    Args:
        df_scada: SCADA dataframe in the FLASC format, containing the
            columns "is_operation_normal_XXX", "ws_XXX" and "pow_XXX" for
            every turbine.

    Returns:
        The dataframe with faulty measurements marked by
        dff.df_mark_turbdata_as_faulty() and without the self-status columns.
    """
    # Now, check for simple outliers in the data
    root_path = os.getcwd()
    out_path = os.path.join(root_path, "postprocessed")
    figs_path = os.path.join(out_path, "figures", "02_basic_filters")
    os.makedirs(figs_path, exist_ok=True)

    # Basic filters: address self flags and obviously wrong points
    num_turbines = dfm.get_num_turbines(df_scada)
    for ti in range(num_turbines):
        # Specify filtering conditions. Each entry is True where the data
        # is considered faulty.
        conds = [
            ~df_scada["is_operation_normal_{:03d}".format(ti)],  # Bad self-status
            df_scada["ws_{:03d}".format(ti)] <= 0.0,  # Zero or negative wind speed
            df_scada["pow_{:03d}".format(ti)] <= 0.0,  # Zero or negative power
        ]

        # Retrieve a single, combined condition array: faulty if any of the
        # individual conditions holds
        conds_combined = conds[0]
        for cond in conds[1:]:
            conds_combined = conds_combined | cond

        # Plot time vs filtered data
        fig, ax = dff.plot_highlight_data_by_conds(df_scada, conds, ti)
        ax.legend(
            ["All data", "Bad self-status", "Negative WS", "Negative power"]
        )

        if save_figures:
            fp = os.path.join(figs_path, "basic_filtering_%03d.png" % ti)
            fig.savefig(fp, dpi=200)

        # Close the figure whenever it should not be displayed, also if it
        # was not saved. Otherwise, one figure per turbine would stay open.
        if not plot_figures_in_notebook:
            plt.close(fig)

        # Apply filtering to dataframe
        df_scada = dff.df_mark_turbdata_as_faulty(
            df_scada, conds_combined, ti, verbose=True
        )

    # Remove unnecessary columns after filtering
    self_status_cols = [
        "is_operation_normal_%03d" % ti for ti in range(num_turbines)
    ]
    df_scada = df_scada.drop(columns=self_status_cols)  # Remove self status columns

    return df_scada

# Apply the basic outlier filters to the formatted SCADA data
df_scada = remove_outliers(df_scada)

# **Step 3**: Removing sensor-stuck type of faults

In [None]:
def remove_sensor_stuck_faults(df, plot_figures, save_figures):
    """Run the sensor-fault filter on the "wd" and "ws" signals.

    Args:
        df: SCADA dataframe in the FLASC format.
        plot_figures: Passed on as `plot_figures` to filter_sensor_faults().
        save_figures: If True, the folder
            "postprocessed/figures/03_sensor_faults_filtered" in the current
            working directory is passed on as the figure save path.
            Otherwise, no save path is passed on (None).

    Returns:
        The dataframe returned by filter_sensor_faults().
    """
    # Decide where figures should be saved, if at all
    figure_path = (
        os.path.join(
            os.getcwd(), "postprocessed", "figures", "03_sensor_faults_filtered"
        )
        if save_figures
        else None
    )

    # Filter sensor faults
    return filter_sensor_faults(
        df=df,
        columns=["wd", "ws"],
        plot_figures=plot_figures,
        figure_save_path=figure_path,
    )

# Filter sensor-stuck faults, using the notebook-wide figure settings
df_scada = remove_sensor_stuck_faults(
    df=df_scada,
    plot_figures=plot_figures_in_notebook,
    save_figures=save_figures
)

# **Step 4**: Deal with wind-speed power curve filtering

In [None]:
def filter_by_ws_pow_curve(df, plot_figures, save_figures):
    """Filter the SCADA data based on the wind speed-power curve.

    The data of all turbines is first filtered using a window in the
    wind speed-power plane and then by deviations from the median power
    curve. The outliers and the resulting power curves are plotted, with
    the FLORIS-predicted power curve shown alongside the data.

    Args:
        df: SCADA dataframe in the FLASC format.
        plot_figures: Together with `save_figures`, decides whether figures
            are saved to disk.
        save_figures: Figures are only saved, to
            "postprocessed/figures/04_wspowcurve_filtered" in the current
            working directory, if both `plot_figures` and `save_figures`
            are true.

    Returns:
        tuple: (df, df_pow_curve), with `df` the filtered dataframe as
            returned by the filtering class, and `df_pow_curve` the
            `pw_curve_df` attribute of the filtering class.
    """
    # Load the FLORIS model for the wind farm. This is not used for anything
    # besides plotting the floris-predicted wind speed-power curve on top
    # of the actual data.
    fi = load_floris()

    # Downsample data. Not necessary here, but can be useful if we have 1 Hz
    # data available. Namely, it's hard to detect outliers on such a high
    # resolution. Instead, we are better off downsampling the data to 60s or
    # even 600s and filter the data based on decisions there. The following
    # downsampled dataframe should then be inserted into the wind speed power
    # curve filtering class. Mapping the filtering back to the high-resolution
    # data is done by a couple lines of code as found at the end of this
    # script.
    #
    # df_movavg, data_indices_mapping = top.df_movingaverage(
    #     df_in=df_1s,
    #     cols_angular=[
    #         c for c in df_1s.columns if (
    #             ("vane_" in c) or
    #             ("yaw_" in c) or
    #             ("wd_" in c) or
    #             ("direction" in c)
    #         )
    #     ],
    #     window_width=td(seconds=600),
    #     calc_median_min_max_std=False,
    #     return_index_mapping=True,
    # )

    # Create output directory. Use the logical `and` rather than the bitwise
    # `&`, since these are plain on/off flags.
    if plot_figures and save_figures:
        root_path = os.getcwd()
        figure_save_path = os.path.join(root_path, "postprocessed", "figures", "04_wspowcurve_filtered")
        os.makedirs(figure_save_path, exist_ok=True)
    else:
        figure_save_path = None

    # Initialize the wind-speed power curve filtering class
    turbine_list = "all"
    # turbine_list = [5]  # Can also look at specific turbines
    ws_pow_filtering = wspf.ws_pw_curve_filtering(
        df=df, turbine_list=turbine_list, rated_powers=5000.0
    )

    # Add a window: all data to the left or right of this window is bad
    # This is an easy way to remove curtailment if the default filtering
    # methods do not or insufficiently pick up these outliers.
    ws_pow_filtering.window_add(
        ws_range=[0.0, 10.2],
        pow_range=[3100.0, 3200.0],
        axis=0,
        turbines="all",
    )
    ws_pow_filtering.filter_by_windows()

    # Now filter by deviations from the median power curve
    ws_pow_filtering.filter_by_power_curve()

    # Plot and save data for current dataframe
    # NOTE(review): the plots are produced regardless of `plot_figures`;
    # that flag only affects whether a save path is passed on. Confirm that
    # this is intended.
    ws_pow_filtering.plot_outliers_vs_time(save_path=figure_save_path)
    ws_pow_filtering.plot(fi=fi, save_path=figure_save_path)

    # Get filtered dataframe and power curve
    df = ws_pow_filtering.get_df()
    df_pow_curve = ws_pow_filtering.pw_curve_df

    return df, df_pow_curve


# Filter the SCADA data by the wind speed-power curve, using the
# notebook-wide figure settings
df_scada, df_pow_curve = filter_by_ws_pow_curve(
    df=df_scada,
    plot_figures=plot_figures_in_notebook,
    save_figures=save_figures
)

# Extract and save turbine power curves estimated from the data
# (written to "postprocessed/power_curves.csv" in the working directory)
root_path = os.getcwd()
out_path = os.path.join(root_path, "postprocessed")
df_pow_curve.to_csv(os.path.join(out_path, "power_curves.csv"))


# **Step 5**: Plot faults vs. the layout

In [None]:
def plot_faults_vs_layout(df):
    """Plot the wind farm layout, colored by the fault ratio per turbine.

    The fault ratio of a turbine is the value returned by
    dff.df_get_no_faulty_measurements() divided by the number of rows in
    `df`. Each turbine is drawn at its FLORIS layout position with a marker
    color ranging from green (fault ratio of 0) to red (fault ratio of 1).
    The label next to each marker shows the turbine index and the
    complement of the fault ratio, as a percentage.

    The figure is always saved to "data/05_preliminary_fault_analysis/
    show_filtered_faults_by_layout/faults_by_layout.png" in the current
    working directory, independent of the notebook's figure settings.
    NOTE(review): this output folder differs from the "postprocessed/figures"
    folder used by the other steps; confirm that this is intended.

    Args:
        df: SCADA dataframe in the FLASC format.
    """
    fi = load_floris()

    layout_x = fi.layout_x
    layout_y = fi.layout_y

    num_turbines = dfm.get_num_turbines(df)
    fault_ratio = np.zeros(num_turbines)
    for ti in range(num_turbines):
        fault_ratio[ti] = (
            dff.df_get_no_faulty_measurements(df, ti) / df.shape[0]
        )

    # Plot layout and colormap
    fig, ax = plt.subplots(figsize=(14, 5))
    for ti in range(num_turbines):
        # RGB color: more red and less green for a higher fault ratio
        clr = [fault_ratio[ti], 1.0 - fault_ratio[ti], 0.0]
        ax.plot(
            layout_x[ti],
            layout_y[ti],
            "o",
            markersize=15,
            markerfacecolor=clr,
            markeredgewidth=0.0,
        )
        ax.text(
            layout_x[ti] + 100,
            layout_y[ti],
            "T%03d (%.1f%%)" % (ti, (1.0 - fault_ratio[ti]) * 100.0),
            color="black",
        )
    fig.tight_layout()

    root_path = os.getcwd()
    out_path = os.path.join(
        root_path,
        "data",
        "05_preliminary_fault_analysis",
        "show_filtered_faults_by_layout",
    )
    fig_out = os.path.join(out_path, "faults_by_layout.png")
    os.makedirs(out_path, exist_ok=True)
    print("Saving figure to {:s}.".format(fig_out))
    # Save through the figure object itself, so the result does not depend
    # on which figure pyplot currently considers active.
    fig.savefig(fig_out, dpi=300)

# Visualize the fault ratio per turbine for the filtered SCADA data
plot_faults_vs_layout(df_scada)

# **Save postprocessed data** to a local file

In [None]:
# Write the filtered SCADA dataframe to a feather file in the "postprocessed"
# folder of the current working directory, and report its relative path
root_path = os.getcwd()
fout = os.path.join(root_path, "postprocessed", "df_scada_600s_wspowfiltered.ftr")
df_scada.to_feather(fout)
print("File saved to '{:s}'.".format(os.path.relpath(fout)))

In [None]:
# Print all (remaining) figures, i.e., those that were not closed earlier
plt.show()