In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from keras.optimizers import Adam
from keras import layers, models , callbacks,losses ,regularizers
from sklearn.metrics import mean_absolute_error

In [78]:
data_x = pd.read_csv( 'engie_X.csv',
                   header    = 0,
                   # index_col = 0,
                   sep       = ';',
                  # decimal   = '.'
                  )
data_y = pd.read_csv('engie_Y.csv',
                      header = 0,
                      sep =';',
                      )


In [79]:
data_y.head()

Unnamed: 0,ID,TARGET
0,1,-0.703
1,2,-0.747
2,3,-0.791
3,4,-0.736
4,5,-1.055


In [80]:
data_x = data_x.drop(columns=['ID'])
data_x.columns

Index(['MAC_CODE', 'Date_time', 'Pitch_angle', 'Pitch_angle_min',
       'Pitch_angle_max', 'Pitch_angle_std', 'Hub_temperature',
       'Hub_temperature_min', 'Hub_temperature_max', 'Hub_temperature_std',
       'Generator_converter_speed', 'Generator_converter_speed_min',
       'Generator_converter_speed_max', 'Generator_converter_speed_std',
       'Generator_speed', 'Generator_speed_min', 'Generator_speed_max',
       'Generator_speed_std', 'Generator_bearing_1_temperature',
       'Generator_bearing_1_temperature_min',
       'Generator_bearing_1_temperature_max',
       'Generator_bearing_1_temperature_std',
       'Generator_bearing_2_temperature',
       'Generator_bearing_2_temperature_min',
       'Generator_bearing_2_temperature_max',
       'Generator_bearing_2_temperature_std', 'Generator_stator_temperature',
       'Generator_stator_temperature_min', 'Generator_stator_temperature_max',
       'Generator_stator_temperature_std', 'Gearbox_bearing_1_temperature',
       'Ge

In [81]:
data_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617386 entries, 0 to 617385
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      617386 non-null  int64  
 1   TARGET  617386 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 9.4 MB


In [82]:
missing_percentage = data_x.isna().mean() * 100

print('MISSING VALUES :')
if missing_percentage[missing_percentage != 0].empty:
    print('No')
else:
    print(missing_percentage[missing_percentage != 0].sort_values(ascending=False))

MISSING VALUES :
Grid_voltage                     16.411451
Grid_voltage_min                 16.411451
Grid_voltage_max                 16.411451
Grid_voltage_std                 16.411451
Generator_converter_speed         1.306152
Generator_converter_speed_min     1.306152
Gearbox_inlet_temperature_min     1.306152
Gearbox_inlet_temperature         1.306152
Generator_converter_speed_std     1.306152
Generator_converter_speed_max     1.306152
Gearbox_inlet_temperature_max     1.306152
Gearbox_inlet_temperature_std     1.306152
Absolute_wind_direction_c         0.011662
Nacelle_angle_c                   0.011662
dtype: float64


In [83]:
data_x.drop(columns=['Grid_voltage','Grid_voltage_min','Grid_voltage_max','Grid_voltage_std'], inplace=True)
data_x.columns

Index(['MAC_CODE', 'Date_time', 'Pitch_angle', 'Pitch_angle_min',
       'Pitch_angle_max', 'Pitch_angle_std', 'Hub_temperature',
       'Hub_temperature_min', 'Hub_temperature_max', 'Hub_temperature_std',
       'Generator_converter_speed', 'Generator_converter_speed_min',
       'Generator_converter_speed_max', 'Generator_converter_speed_std',
       'Generator_speed', 'Generator_speed_min', 'Generator_speed_max',
       'Generator_speed_std', 'Generator_bearing_1_temperature',
       'Generator_bearing_1_temperature_min',
       'Generator_bearing_1_temperature_max',
       'Generator_bearing_1_temperature_std',
       'Generator_bearing_2_temperature',
       'Generator_bearing_2_temperature_min',
       'Generator_bearing_2_temperature_max',
       'Generator_bearing_2_temperature_std', 'Generator_stator_temperature',
       'Generator_stator_temperature_min', 'Generator_stator_temperature_max',
       'Generator_stator_temperature_std', 'Gearbox_bearing_1_temperature',
       'Ge

In [84]:
try:
    # Replace missing values with mean for numeric columns
    numeric_cols = data_x.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        if data_x[col].isnull().sum() > 0:
            data_x[col].fillna(data_x[col].mean(), inplace=True)
    
    print("Missing values replaced with mean successfully")
    print(f"Remaining missing values:\n{data_x.isnull().sum().sum()}")
except Exception as e:
    print(f"Error occurred while replacing missing values: {e}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_x[col].fillna(data_x[col].mean(), inplace=True)


Missing values replaced with mean successfully
Remaining missing values:
0


In [85]:
data_x['MAC_CODE'] = data_x['MAC_CODE'].map({'WT1': 1, 'WT2': 2, 'WT3': 3, 'WT4': 4})

print("MAC_CODE conversion complete:")
print(data_x['MAC_CODE'].unique())
print(data_x['MAC_CODE'].dtype)

MAC_CODE conversion complete:
[3 2 4 1]
int64


In [86]:
test_portion  = 1/5
valid_portion = 1/5

X_train_valid, X_test, y_train_valid, y_test = train_test_split(data_x, data_y['TARGET'], test_size=test_portion)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=valid_portion)

print('Dimensions de X_train :', X_train.shape)
print('Dimensions de X_valid :', X_valid.shape)
print('Dimensions de X_test  :', X_test.shape)

print('Dimensions de y_train :', y_train.shape)
print('Dimensions de y_valid :', y_valid.shape)
print('Dimensions de y_test  :', y_test.shape)

Dimensions de X_train : (395126, 73)
Dimensions de X_valid : (98782, 73)
Dimensions de X_test  : (123478, 73)
Dimensions de y_train : (395126,)
Dimensions de y_valid : (98782,)
Dimensions de y_test  : (123478,)


In [89]:
scaler = RobustScaler()
scaler.fit(X_train)

X_train_norm = scaler.transform(X_train)
X_valid_norm = scaler.transform(X_valid)
X_test_norm  = scaler.transform(X_test)

In [90]:
dim_inputs = X_train_norm.shape[1]

model = models.Sequential(name='WindTurbine_DNN')

# Input layer
model.add(layers.Input(shape=(dim_inputs,), name='Input'))

# First block - Large capacity
model.add(layers.Dense(
    512, 
    kernel_initializer='he_normal',
    kernel_regularizer=regularizers.l2(0.0001),
    name='Dense_1'
))
model.add(layers.BatchNormalization(name='BN_1'))
model.add(layers.Activation('relu', name='Activation_1'))
model.add(layers.Dropout(0.4, name='Dropout_1'))

# Second block
model.add(layers.Dense(
    256,
    kernel_initializer='he_normal',
    kernel_regularizer=regularizers.l2(0.0001),
    name='Dense_2'
))
model.add(layers.BatchNormalization(name='BN_2'))
model.add(layers.Activation('relu', name='Activation_2'))
model.add(layers.Dropout(0.3, name='Dropout_2'))

# Third block
model.add(layers.Dense(
    128,
    kernel_initializer='he_normal',
    kernel_regularizer=regularizers.l2(0.0001),
    name='Dense_3'
))
model.add(layers.BatchNormalization(name='BN_3'))
model.add(layers.Activation('relu', name='Activation_3'))
model.add(layers.Dropout(0.3, name='Dropout_3'))

# Fourth block
model.add(layers.Dense(
    64,
    kernel_initializer='he_normal',
    kernel_regularizer=regularizers.l2(0.0001),
    name='Dense_4'
))
model.add(layers.BatchNormalization(name='BN_4'))
model.add(layers.Activation('relu', name='Activation_4'))
model.add(layers.Dropout(0.2, name='Dropout_4'))

# Fifth block
model.add(layers.Dense(
    32,
    kernel_initializer='he_normal',
    name='Dense_5'
))
model.add(layers.Activation('relu', name='Activation_5'))

# Output layer
model.add(layers.Dense(1, activation='linear', name='Output'))

model.summary()


In [98]:
model.compile(
    optimizer=Adam(learning_rate=0.004),
    loss='mae',
    metrics=['mae', 'mse']
)

early_stop = callbacks.EarlyStopping(
    monitor='val_mae',
    patience=10,
    verbose=1,
    mode='min',
    restore_best_weights=True,
    start_from_epoch=10
)

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_mae',
    factor=0.5,
    patience=6,
    min_lr=1e-7,
    verbose=1,
    mode='min'
)

checkpoint = callbacks.ModelCheckpoint(
    'best_wind_turbine_model.keras',
    monitor='val_mae',
    save_best_only=True,
    mode='min',
    verbose=1
)


In [99]:
y_train_clipped = y_train.clip(lower=0)
y_val_clipped   = y_valid.clip(lower=0)

y_train_log = np.log1p(y_train_clipped)
y_val_log   = np.log1p(y_val_clipped)



In [100]:
history = model.fit(
    X_train_norm,
    y_train_log,
    batch_size=512,
    epochs=70,
    validation_data=(X_valid_norm, y_val_log),
    callbacks=[early_stop, reduce_lr, checkpoint],
    verbose=1
)

Epoch 1/70
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.2500 - mae: 0.2336 - mse: 0.2762
Epoch 1: val_mae improved from None to 0.64291, saving model to best_wind_turbine_model.keras

Epoch 1: finished saving model to best_wind_turbine_model.keras
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 14ms/step - loss: 0.2428 - mae: 0.2246 - mse: 0.2642 - val_loss: 0.6641 - val_mae: 0.6429 - val_mse: 1.3569 - learning_rate: 0.0040
Epoch 2/70
[1m769/772[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 0.2418 - mae: 0.2199 - mse: 0.2567
Epoch 2: val_mae improved from 0.64291 to 0.63622, saving model to best_wind_turbine_model.keras

Epoch 2: finished saving model to best_wind_turbine_model.keras
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - loss: 0.2431 - mae: 0.2204 - mse: 0.2562 - val_loss: 0.6604 - val_mae: 0.6362 - val_mse: 1.4247 - learning_rate: 0.0040
Epoch 3/70
[1m771/77

In [104]:
val_predictions = model.predict(X_valid_norm).ravel()
mae = mean_absolute_error(y_val_log, val_predictions)

print(f"Validation MAE: {mae:.5f}")


[1m3087/3087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
Validation MAE: 0.63598


In [105]:
model.save("engie_dnn_model.h5")

