In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load the wide export: one shared 'pollTime' column plus one column per
# turbine/signal pair, named '<turbine_prefix><signal>'.
df = pd.read_csv("BulkDataLoaderGroupedData_1336_2023_12_17_12_00_00_AM_2023_12_23_11_59_59_PM__WGN Dec23 data v2.csv")

# Signals extracted for every turbine.
column_list = ['Gearbox_Lube_Oil_Pressure_Status', 'Active_Power', 'Gen_RPM_CCU']

# Turbines 50-68 are processed, except 60 and 61.
# NOTE(review): the previous index-based check (`i != 9 and i != 10` with
# turbine = i + 50) actually skipped turbines 59 and 60, contradicting its own
# comments. The IDs are spelled out here so the exclusion matches the
# documented intent -- confirm that 60 and 61 are really the ones to skip.
turbine_ids = range(50, 69)
excluded_turbines = {60, 61}

key = []     # turbine IDs (as strings), parallel to `frames`
frames = []  # one filtered frame per processed turbine

for turbine in turbine_ids:
    if turbine in excluded_turbines:
        continue

    turbine_id = str(turbine)
    turbine_prefix = f'GN1_GE23xxx_WTG00{turbine_id}_'

    # Start from pollTime so every per-turbine frame carries the timestamp.
    temp = pd.DataFrame(df['pollTime'])

    for column_name in column_list:
        column_key = turbine_prefix + column_name
        if column_key in df.columns:
            # Non-numeric readings become NaN instead of raising.
            temp[column_name] = pd.to_numeric(df[column_key], errors='coerce')
        else:
            # Signal missing for this turbine: use a float NaN placeholder so
            # the column stays numeric (pd.NA would make it object dtype).
            temp[column_name] = np.nan

    # Keep only the rows where this turbine reports negative Active_Power.
    temp_filtered = temp[temp['Active_Power'] < 0]

    key.append(turbine_id)
    frames.append(temp_filtered)

# Drop columns that are entirely NaN within a turbine's frame, then stack the
# frames; the concat keys record which turbine each row came from.
frames_cleaned = [frame.dropna(axis=1, how='all') for frame in frames]
df_combined = pd.concat(frames_cleaned, keys=key)

# reset_index() exposes the concat keys as column 'level_0' (turbine ID) and
# the original row labels as 'level_1'; pollTime is then parsed as UTC
# (unparseable values become NaT) and made the index.
df_combined = df_combined.reset_index()
df_combined['pollTime'] = pd.to_datetime(df_combined['pollTime'], errors='coerce', utc=True)
df_combined = df_combined.set_index('pollTime')





In [None]:
# Change in Active_Power between consecutive retained rows of the SAME turbine.
# df_combined stacks all turbines on top of each other, so a plain .diff()
# would subtract the last row of one turbine from the first row of the next.
# 'level_0' is the column holding the concat key (turbine ID) created when the
# stacked frame's index was reset.
turbine_column = 'level_0'
power_diff = df_combined.groupby(turbine_column, sort=False)['Active_Power'].diff()

# Assign positionally (.to_numpy()) because the pollTime index is duplicated
# across turbines, then drop the first row of each turbine (its diff is NaN).
df_combined = (
    df_combined
    .assign(Active_Power_Diff=power_diff.to_numpy())
    .dropna(subset=['Active_Power_Diff'])
)


In [None]:
# Window used for aggregation. '30min' replaces the deprecated '30T' alias.
resample_window = '30min'

# Maximum number of NaN status readings tolerated in one window.
# NOTE(review): 181 is unexplained in the notebook -- document where it comes from.
nan_threshold = 181

status = df_combined['Gearbox_Lube_Oil_Pressure_Status']

# Number of missing status readings per window (all turbines pooled).
nan_counts = status.isna().resample(resample_window).sum()

# Windows with an acceptable amount of missing data.
mask = nan_counts <= nan_threshold

# Mean status per window, restricted to the acceptable windows.
mean_values = status.resample(resample_window).mean()[mask]

# Broadcast each window's mean back to the rows that fall inside that window.
# The window label is the left edge, so flooring a timestamp yields its own
# window. The previous `reindex(..., method='nearest')` sent rows in the second
# half of a window to the NEXT window and filled rejected windows from their
# neighbours, so the NaN threshold above never actually removed anything.
row_windows = df_combined.index.floor(resample_window)
df_combined['Gearbox_Lube_Oil_Freq'] = mean_values.reindex(row_windows).to_numpy()

# Rows whose window was rejected (or had no valid readings) have no mean: drop them.
df_combined = df_combined.dropna(subset=['Gearbox_Lube_Oil_Freq'])

In [None]:
# Features fed to K-means.
feature_columns = ['Active_Power', 'Active_Power_Diff', 'Gen_RPM_CCU', 'Gearbox_Lube_Oil_Freq']

# Take an independent copy (so later column additions do not write into a
# slice of df_combined) and drop rows with any missing feature: KMeans does
# not accept NaN, and Gen_RPM_CCU is never NaN-filtered earlier.
df_kmeans = df_combined[feature_columns].dropna().copy()

# Standardise every feature to zero mean / unit variance so that no single
# feature dominates the Euclidean distances used by K-means.
scaler = StandardScaler()
df_kmeans_scaled = pd.DataFrame(
    scaler.fit_transform(df_kmeans),
    columns=df_kmeans.columns,
    index=df_kmeans.index,
)

# Optional elbow-method search for the number of clusters. Disabled by default
# because it fits one model per candidate k; flip the flag to run it.
run_elbow_search = False

if run_elbow_search:
    k_range = range(2, 15)
    elbow_values = []

    for k in k_range:
        # n_init is pinned so results do not depend on the scikit-learn version.
        candidate = KMeans(n_clusters=k, n_init=10, random_state=0).fit(df_kmeans_scaled)
        # Inertia = sum of squared distances of samples to their closest centre.
        elbow_values.append(candidate.inertia_)

    # Plotting the Elbow Method
    plt.figure(figsize=(10, 5))
    plt.plot(k_range, elbow_values, 'bo-', label='Sum of squared distances')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Sum of squared distances')
    plt.title('Elbow Method For Optimal k')
    plt.legend()
    plt.show()

In [None]:
# Number of clusters for the final model.
n_clusters = 10

# Fit K-means on the standardised features. random_state fixes the centroid
# initialisation; n_init is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(df_kmeans_scaled)

# Cluster index assigned to each row of df_kmeans_scaled, in row order.
cluster_labels = kmeans.labels_

# Attach the labels to the unscaled features. assign() builds a new frame
# instead of writing into what may be a slice of df_combined, and makes the
# cell safe to re-run.
df_kmeans = df_kmeans.assign(Cluster=cluster_labels)