In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.impute import SimpleImputer
import plotly.express as px

In [None]:
def _columns_containing(frame, token):
    """Return the column labels of ``frame`` whose name contains ``token``."""
    return [label for label in frame.columns if token in label]


# Load the raw export.
df = pd.read_csv("BulkDataLoaderGroupedData_1336_2023_12_17_12_00_00_AM_2023_12_23_11_59_59_PM__WGN Dec23 data v2.csv")

# Substrings used to pick the columns that are kept.
patterns = [
    "Active_Power",
    "Gearbox_Lube_Oil_Pressure_Status",
    "Gen_RPM_CCU",
    "L1_Current"
]

# A column is kept when its name contains at least one of the patterns.
columns_for_all_turbines = [
    label
    for label in df.columns
    if any(token in label for token in patterns)
]
df_filtered_all_turbines = df[columns_for_all_turbines]

# Coerce every retained column to numeric; cells that cannot be parsed become NaN.
df = df_filtered_all_turbines.apply(pd.to_numeric, errors='coerce')

active_power_columns = _columns_containing(df, "Active_Power")
Gearbox_Lube_Oil_Pressure_Status_Columns = _columns_containing(
    df, "Gearbox_Lube_Oil_Pressure_Status"
)

# Blank out (NaN) every active-power reading that is greater than zero.
# NOTE(review): only non-positive readings survive this step — confirm that
# `<= 0` (rather than `>= 0`) is the intended direction.
for name in active_power_columns:
    readings = df[name]
    df[name] = readings.where(readings <= 0)

# Drop every row in which at least one active-power column is NaN.
df = df.dropna(subset=active_power_columns)

# Row-to-row change of each active-power column. This runs after the dropna
# above, so each difference is taken between consecutive *retained* rows.
for name in active_power_columns:
    df[f"{name}_diff"] = df[name].diff()

# Trailing mean of each gearbox status column, computed as rolling sum / window.
# The first (window_size - 1) rows of each new column are NaN.
window_size = 30
for name in Gearbox_Lube_Oil_Pressure_Status_Columns:
    rolling_total = df[name].rolling(window=window_size).sum()
    df[f"{name}_rolling_avg"] = rolling_total / window_size



In [None]:
# Impute, standardise, and cluster the engineered features with k-means.

# Leave out a 'Cluster' column written by an earlier run of this cell; otherwise
# re-running the cell would impute, scale, and cluster the old labels as if they
# were a feature.
feature_df = df.drop(columns=['Cluster'], errors='ignore')

# With strategy='mean', SimpleImputer discards columns that have no observed
# values, so its output would be narrower than the column list and the
# DataFrame constructor below would raise. Remove such columns explicitly.
feature_df = feature_df.dropna(axis=1, how='all')

imputer = SimpleImputer(strategy='mean')
df_filled = pd.DataFrame(
    imputer.fit_transform(feature_df),
    columns=feature_df.columns,
    index=feature_df.index,  # keep row labels aligned with df
)

scaler = StandardScaler()
df_normalized = scaler.fit_transform(df_filled)

# n_init is pinned because scikit-learn changed its default from 10 to 'auto'
# in version 1.4; leaving it implicit makes the clustering depend on the
# installed version even with a fixed random_state.
kmeans_options = {"n_init": 10, "random_state": 42}

# Inertia (within-cluster sum of squares) for k = 1..19, for the elbow method.
inertia = []
k_range = range(1, 20)
for k in k_range:
    kmeans = KMeans(n_clusters=k, **kmeans_options)
    kmeans.fit(df_normalized)
    inertia.append(kmeans.inertia_)

# Optional diagnostics, currently disabled. Uncomment to run.
#
# # Plotting the Elbow Method
# import matplotlib.pyplot as plt
# plt.plot(k_range, inertia, '-o')
# plt.xlabel('Number of clusters, k')
# plt.ylabel('Inertia')
# plt.title('Elbow Method For Optimal k')
# plt.show()
#
# # Calculate Silhouette Score for each k
# for k in k_range[1:]:  # silhouette score can't be calculated for k=1
#     kmeans = KMeans(n_clusters=k, **kmeans_options)
#     kmeans.fit(df_normalized)
#     score = silhouette_score(df_normalized, kmeans.labels_)
#     print(f"Silhouette Score for k={k}: {score}")

# Final model. NOTE(review): presumably 7 was chosen from the elbow/silhouette
# diagnostics above — confirm.
chosen_k = 7
kmeans = KMeans(n_clusters=chosen_k, **kmeans_options)

# fit_predict returns one label per row of df_normalized, in the same row order
# as df, so the positional assignment lines up.
df['Cluster'] = kmeans.fit_predict(df_normalized)


In [None]:
# Select the rows whose k-means label is 6 and preview the first ten.
in_cluster_six = df['Cluster'].eq(6)
clusterData = df.loc[in_cluster_six]

clusterData.head(10)