In [37]:
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras import layers, models , callbacks,losses 

In [20]:
data_x = pd.read_csv( 'engie_X.csv',
                   header    = 0,
                   # index_col = 0,
                   sep       = ';',
                  # decimal   = '.'
                  )
data_y = pd.read_csv('engie_Y.csv',
                      header = 0,
                      sep =';',
                      )


In [21]:
data_y.head()

Unnamed: 0,ID,TARGET
0,1,-0.703
1,2,-0.747
2,3,-0.791
3,4,-0.736
4,5,-1.055


In [22]:
data_x = data_x.drop(columns=['ID'])
data_x.columns

Index(['MAC_CODE', 'Date_time', 'Pitch_angle', 'Pitch_angle_min',
       'Pitch_angle_max', 'Pitch_angle_std', 'Hub_temperature',
       'Hub_temperature_min', 'Hub_temperature_max', 'Hub_temperature_std',
       'Generator_converter_speed', 'Generator_converter_speed_min',
       'Generator_converter_speed_max', 'Generator_converter_speed_std',
       'Generator_speed', 'Generator_speed_min', 'Generator_speed_max',
       'Generator_speed_std', 'Generator_bearing_1_temperature',
       'Generator_bearing_1_temperature_min',
       'Generator_bearing_1_temperature_max',
       'Generator_bearing_1_temperature_std',
       'Generator_bearing_2_temperature',
       'Generator_bearing_2_temperature_min',
       'Generator_bearing_2_temperature_max',
       'Generator_bearing_2_temperature_std', 'Generator_stator_temperature',
       'Generator_stator_temperature_min', 'Generator_stator_temperature_max',
       'Generator_stator_temperature_std', 'Gearbox_bearing_1_temperature',
       'Ge

In [23]:
data_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617386 entries, 0 to 617385
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      617386 non-null  int64  
 1   TARGET  617386 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 9.4 MB


In [24]:
missing_percentage = data_x.isna().mean() * 100

print('MISSING VALUES :')
if missing_percentage[missing_percentage != 0].empty:
    print('No')
else:
    print(missing_percentage[missing_percentage != 0].sort_values(ascending=False))

MISSING VALUES :
Grid_voltage                     16.411451
Grid_voltage_min                 16.411451
Grid_voltage_max                 16.411451
Grid_voltage_std                 16.411451
Generator_converter_speed         1.306152
Generator_converter_speed_min     1.306152
Gearbox_inlet_temperature_min     1.306152
Gearbox_inlet_temperature         1.306152
Generator_converter_speed_std     1.306152
Generator_converter_speed_max     1.306152
Gearbox_inlet_temperature_max     1.306152
Gearbox_inlet_temperature_std     1.306152
Absolute_wind_direction_c         0.011662
Nacelle_angle_c                   0.011662
dtype: float64


In [26]:
try:
    # Replace missing values with mean for numeric columns
    numeric_cols = data_x.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        if data_x[col].isnull().sum() > 0:
            data_x[col].fillna(data_x[col].mean(), inplace=True)
    
    print("Missing values replaced with mean successfully")
    print(f"Remaining missing values:\n{data_x.isnull().sum().sum()}")
except Exception as e:
    print(f"Error occurred while replacing missing values: {e}")

Missing values replaced with mean successfully
Remaining missing values:
0


In [27]:
data_x['MAC_CODE'] = data_x['MAC_CODE'].map({'WT1': 1, 'WT2': 2, 'WT3': 3, 'WT4': 4})

print("MAC_CODE conversion complete:")
print(data_x['MAC_CODE'].unique())
print(data_x['MAC_CODE'].dtype)

MAC_CODE conversion complete:
[3 2 4 1]
int64


In [28]:
test_portion  = 1/5
valid_portion = 1/5

X_train_valid, X_test, y_train_valid, y_test = train_test_split(data_x, data_y['TARGET'], test_size=test_portion)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=valid_portion)

print('Dimensions de X_train :', X_train.shape)
print('Dimensions de X_valid :', X_valid.shape)
print('Dimensions de X_test  :', X_test.shape)

print('Dimensions de y_train :', y_train.shape)
print('Dimensions de y_valid :', y_valid.shape)
print('Dimensions de y_test  :', y_test.shape)

Dimensions de X_train : (395126, 77)
Dimensions de X_valid : (98782, 77)
Dimensions de X_test  : (123478, 77)
Dimensions de y_train : (395126,)
Dimensions de y_valid : (98782,)
Dimensions de y_test  : (123478,)


In [29]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_norm = scaler.transform(X_train)
X_valid_norm = scaler.transform(X_valid)
X_test_norm  = scaler.transform(X_test)

In [30]:
dim_inputs  = (X_train.shape[1],)
dim_outputs = 1

n_units_hl1 = 100
n_units_hl2 = 80
n_units_hl3 = 20

model = models.Sequential(name='DNN')

model.add(layers.Input(shape=dim_inputs, name='Inputs'))

model.add(layers.Dense(units=n_units_hl1, activation='relu', name='Hidden_layer_1'))
model.add(layers.Dropout(rate=0.15, name='Dropout_1'))

model.add(layers.Dense(units=n_units_hl2, activation='relu', name='Hidden_layer_2'))
model.add(layers.Dropout(rate=0.15, name='Dropout_2'))


model.add(layers.Dense(units=dim_outputs, activation='sigmoid', name='Output_layer'))

model.summary()

In [40]:
import keras


callback =callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=1,
    mode="min",
    restore_best_weights=True,
    start_from_epoch=0,
)

model.compile(optimizer = 'adam',
              loss      = losses.MeanSquaredError(
                        reduction="sum_over_batch_size", name="mean_squared_error", dtype=None),
              metrics   = ['accuracy'])

In [41]:
hist = model.fit(X_train_norm,
                 y_train,
                 batch_size      = 500,
                 epochs          = 200,
                 validation_data = (X_valid_norm, y_valid),
                 callbacks=[callback],
                 verbose         = 1)

Epoch 1/200
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 357933.4688 - val_accuracy: 0.0000e+00 - val_loss: 357015.5938
Epoch 2/200
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 357933.4688 - val_accuracy: 0.0000e+00 - val_loss: 357015.5938
Epoch 3/200
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 357933.3438 - val_accuracy: 0.0000e+00 - val_loss: 357015.5938
Epoch 4/200
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 357933.3438 - val_accuracy: 0.0000e+00 - val_loss: 357015.5938
Epoch 5/200
[1m597/791[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 359291.2343

KeyboardInterrupt: 