In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Raw export: a 'pollTime' column plus per-turbine signal columns, which are
# looked up further down as '<turbine prefix><signal name>'.
DATA_FILE = "BulkDataLoaderGroupedData_1336_2023_12_17_12_00_00_AM_2023_12_23_11_59_59_PM__WGN Dec23 data v2.csv"
df = pd.read_csv(DATA_FILE)

# Signals extracted for every turbine.
column_list = [
    'Gearbox_Lube_Oil_Pressure_Status',
    'Active_Power',
    'Gen_RPM_CCU',
]

# Parallel accumulators filled by the per-turbine loop:
# `key` collects turbine ids, `frames` the matching feature frames.
key, frames = [], []

# Feature extraction and processing for each turbine
ROLLING_WINDOW = 180       # number of rows in the lube-oil rolling mean
ROLLING_MIN_PERIODS = 60   # rows required before the rolling mean is defined


def build_turbine_features(source, turbine_id, columns):
    """Build the feature frame for a single turbine, indexed by 'pollTime'.

    Parameters
    ----------
    source : pandas.DataFrame
        Wide frame holding 'pollTime' and the per-turbine signal columns named
        'GN1_GE23xxx_WTG00<turbine_id>_<signal>'.
    turbine_id : str
        Turbine number as text, e.g. '50'.
    columns : list of str
        Signal names to extract. Must include 'Active_Power',
        'Gearbox_Lube_Oil_Pressure_Status' and 'Gen_RPM_CCU', which the
        feature steps below read by name.

    Returns
    -------
    pandas.DataFrame
        Rows with negative 'Active_Power' only, with the extracted signals plus
        'Active_Power_Diff' and 'Gearbox_Lube_Oil_Rolling_AVG'.
    """
    prefix = f'GN1_GE23xxx_WTG00{turbine_id}_'
    features = source[['pollTime']].copy()

    for name in columns:
        source_col = prefix + name
        if source_col in source.columns:
            # Non-numeric readings become NaN instead of raising.
            features[name] = pd.to_numeric(source[source_col], errors='coerce')
        else:
            # Float NaN (not pd.NA) keeps the placeholder column numeric, so the
            # stacked frame does not degrade to object dtype.
            features[name] = np.nan

    # Keep only rows where the turbine reports negative active power.
    # Explicit copy: the assignments below must target an independent frame,
    # not a slice of the unfiltered one.
    features = features[features['Active_Power'] < 0].copy()

    # NOTE(review): diff() and rolling() operate on the *filtered* rows, so
    # neighbouring rows may be far apart in time - confirm this is intended.
    features['Active_Power_Diff'] = features['Active_Power'].diff()

    rolling_mean = (
        features['Gearbox_Lube_Oil_Pressure_Status']
        .rolling(window=ROLLING_WINDOW, min_periods=ROLLING_MIN_PERIODS)
        .mean()
    )
    # Back-fill so the warm-up rows of the rolling window are not left as NaN.
    features['Gearbox_Lube_Oil_Rolling_AVG'] = rolling_mean.bfill()
    features['Gen_RPM_CCU'] = features['Gen_RPM_CCU'].bfill()

    return features.set_index('pollTime')


for turbine_number in range(50, 69):  # Turbines 50-68
    if turbine_number in (60, 61):    # Exclude turbines 60 and 61
        continue
    turbine_id = str(turbine_number)
    frames.append(build_turbine_features(df, turbine_id, column_list))
    key.append(turbine_id)

# Stack the per-turbine frames under a ('key', 'pollTime') MultiIndex, then
# discard the raw pressure-status column (its rolling average is kept instead).
df_combined = (
    pd.concat(frames, keys=key, names=['key', 'pollTime'])
    .drop(columns='Gearbox_Lube_Oil_Pressure_Status')
)



In [None]:
#df_combined.index.get_level_values(0).unique()
#print(df_combined.loc['68'].head())

#print(df_combined.describe())
#df_combined.isna().sum() / len(df_combined)

# Keep complete rows only: KMeans does not accept missing values.
df_combined = df_combined.dropna()

# Scale the feature columns only. A later cell writes the labels back into
# df_combined as 'Cluster'; excluding that column here keeps this cell safe to
# re-run (otherwise the previous labels would be scaled and clustered as a feature).
feature_frame = df_combined.drop(columns=['Cluster'], errors='ignore')

scaler = StandardScaler()
df_normalized = scaler.fit_transform(feature_frame)

# Elbow method: record the within-cluster sum of squares for k = 1..19.
inertia = []
k_range = range(1, 20)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df_normalized)
    inertia.append(kmeans.inertia_)

In [None]:
# Plotting the Elbow Method: inertia against the number of clusters.
fig, ax = plt.subplots()
ax.plot(k_range, inertia, '-o')
ax.set_xticks(list(k_range))  # k is an integer; avoid fractional tick labels
ax.set_xlabel('Number of clusters, k')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Fit the final model with the selected number of clusters and store each
# row's label next to its features.
chosen_k = 10
kmeans = KMeans(n_clusters=chosen_k, random_state=42).fit(df_normalized)
df_combined['Cluster'] = kmeans.labels_

In [None]:
# TODO: try the seaborn library (see Kimberly Fessel's YouTube tutorials for reference)
# TODO: try hvplot with plotly as the backend

# Line plot of power, smoothed lube-oil signal and cluster label over all rows.
line_plot_columns = ['Active_Power', 'Gearbox_Lube_Oil_Rolling_AVG', 'Cluster']
df_combined[line_plot_columns].plot.line(figsize=(16, 8))

In [None]:
# Plotting
def _dominant_cluster(labels):
    """Return the most frequent cluster id in one hexagon (NaN for an empty bin).

    Cluster ids are categorical, so hexbin's default reducer (the mean) would
    colour a bin that mixes clusters 0 and 9 as if it were cluster 4/5.
    """
    labels = np.asarray(labels, dtype=int)
    if labels.size == 0:
        return np.nan
    return np.bincount(labels).argmax()


fig, ax = plt.subplots(figsize=(10, 8))
hexes = ax.hexbin(
    df_combined['Gen_RPM_CCU'],
    df_combined['Active_Power'],
    C=df_combined['Cluster'],
    reduce_C_function=_dominant_cluster,  # colour = most common cluster in the bin
    gridsize=30,
    cmap='viridis',
)
fig.colorbar(hexes, ax=ax, label='Cluster')
ax.set_xlabel('Gen RPM')
ax.set_ylabel('Active Power')
ax.set_title('Hexbin plot of Active Power vs Gen RPM CCU with Cluster as hue')
plt.show()