# In [None]:
# preprocess the data
def load_data(file_path, sequence_length=24*7, forecast_horizon=24):
    """Load the ``Radiation`` column of a CSV and build sliding-window samples.

    The series is min-max scaled and then cut into overlapping windows: each
    sample pairs ``sequence_length`` consecutive values (the input) with the
    ``forecast_horizon`` values that immediately follow them (the target).

    Args:
        file_path: Location of the CSV file, passed to ``pd.read_csv``. The
            file must contain a ``Radiation`` column.
        sequence_length: Number of time steps in each input window.
        forecast_horizon: Number of time steps in each target window.

    Returns:
        The result of ``train_test_split(X, y, test_size=0.2, shuffle=False)``
        where ``X`` has shape ``(n_samples, sequence_length, 1)`` and ``y``
        has shape ``(n_samples, forecast_horizon, 1)``.

    Raises:
        ValueError: If the series is too short to produce a single
            input/target window pair.
    """
    df = pd.read_csv(file_path)
    data = df['Radiation'].values.reshape(-1, 1)

    # NOTE(review): the scaler is fitted on the full series, so the held-out
    # tail influences the scaling of the training windows, and the fitted
    # scaler is not returned, so callers cannot invert the scaling here.
    scaler = MinMaxScaler()
    data_normalized = scaler.fit_transform(data)

    window = sequence_length + forecast_horizon
    # "+ 1" keeps the final window, whose target ends exactly on the last row.
    n_samples = len(data_normalized) - window + 1
    if n_samples < 1:
        raise ValueError(
            f"Need at least {window} rows to build one sample "
            f"(sequence_length + forecast_horizon), got {len(data_normalized)}."
        )

    X = np.array([
        data_normalized[start:start + sequence_length]
        for start in range(n_samples)
    ])
    y = np.array([
        data_normalized[start + sequence_length:start + window]
        for start in range(n_samples)
    ])

    # shuffle=False keeps chronological order: the test split is the tail.
    return train_test_split(X, y, test_size=0.2, shuffle=False)
# Transformer encoder block
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention followed by a feed-forward network, each with a residual.

    Every sub-layer output goes through dropout, is added back to the
    sub-layer input and is then layer-normalized.

    Args:
        embed_dim: Output width of the feed-forward network and ``key_dim``
            of the attention heads. It has to be compatible with the size of
            the input's last axis so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied after the attention and feed-forward
            sub-layers.
        **kwargs: Standard ``Layer`` arguments such as ``name`` or ``dtype``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can reproduce the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to ``inputs``; dropout is active only when training."""
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config
# Build the optimized model
def build_optimized_model(input_shape, forecast_horizon):
    """Assemble the Conv1D -> LSTM -> self-attention -> dilated-conv model.

    Args:
        input_shape: Shape of one input sample (without the batch axis),
            passed straight to ``Input``.
        forecast_horizon: Number of values to predict; each sample's output
            is reshaped to ``(forecast_horizon, 1)``.

    Returns:
        An uncompiled ``Model`` mapping the input to the reshaped forecast.
    """
    model_input = Input(shape=input_shape)

    # Stage 1 - convolutional feature extraction with L2 weight decay.
    conv = Conv1D(32, 3, activation='relu', padding='same',
                  kernel_regularizer=l2(1e-4))
    features = conv(model_input)
    features = Dropout(0.4)(features)

    # Stage 2 - a single unidirectional LSTM that keeps the time axis.
    recurrent = LSTM(64, return_sequences=True,
                     kernel_regularizer=l2(1e-4))
    sequence = recurrent(features)
    sequence = Dropout(0.3)(sequence)

    # Stage 3 - multi-head self-attention with a residual connection and
    # layer normalization (used in place of a full Transformer block).
    self_attention = MultiHeadAttention(num_heads=4, key_dim=64)
    attention_out = self_attention(sequence, sequence)
    normalize = LayerNormalization(epsilon=1e-6)
    attended = normalize(sequence + attention_out)

    # Stage 4 - dilated convolution for additional temporal feature
    # extraction.
    dilated = Conv1D(64, 3, padding='same', dilation_rate=2,
                     activation='relu')
    dilated_out = dilated(attended)
    dilated_out = Dropout(0.3)(dilated_out)

    # Fuse by addition rather than concatenation to keep the parameter
    # count down.
    fused = attended + dilated_out

    # Collapse the time axis with both average and max pooling.
    avg_pooled = GlobalAveragePooling1D()(fused)
    max_pooled = GlobalMaxPooling1D()(fused)
    pooled = concatenate([avg_pooled, max_pooled])

    # Output head: one value per forecast step, reshaped to (horizon, 1).
    head = Dense(forecast_horizon, kernel_regularizer=l2(1e-4))
    forecast = head(pooled)
    forecast = Reshape((forecast_horizon, 1))(forecast)

    return Model(inputs=model_input, outputs=forecast)


class MetricTracker(Callback):
    """Callback that records explained-variance scores after every epoch.

    After each epoch the model predicts on the training and validation data;
    the explained variance score (EVS) of each prediction is appended to
    ``self.metrics`` together with the ``val_mae`` value from ``logs``.

    Args:
        train_data: Optional ``(X, y)`` pair used for the training score.
            When omitted, the module-level ``X_train`` / ``y_train`` are used.
        val_data: Optional ``(X, y)`` pair used for the validation score.
            When omitted, the module-level ``X_test`` / ``y_test`` are used.
    """

    def __init__(self, train_data=None, val_data=None):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.metrics = {
            'evs': [],
            'val_evs': [],
            'val_mae': []
        }

    def _explained_variance(self, features, targets):
        """Predict on ``features`` and score against ``targets``, both flattened."""
        predictions = self.model.predict(features, verbose=0)
        return explained_variance_score(
            targets.reshape(-1, 1), predictions.reshape(-1, 1)
        )

    def on_epoch_end(self, epoch, logs=None):
        """Compute and store the train/validation EVS for the finished epoch."""
        # ``logs`` defaults to None and may lack 'val_mae' when no validation
        # metrics are reported; never index it directly.
        logs = logs or {}

        # Fall back to the notebook globals when no data was injected.
        if self.train_data is not None:
            train_x, train_y = self.train_data
        else:
            train_x, train_y = X_train, y_train
        if self.val_data is not None:
            val_x, val_y = self.val_data
        else:
            val_x, val_y = X_test, y_test

        # Training set metrics
        evs = self._explained_variance(train_x, train_y)
        self.metrics['evs'].append(evs)

        # Validation set metrics
        val_evs = self._explained_variance(val_x, val_y)
        self.metrics['val_evs'].append(val_evs)
        # NaN keeps the history the same length as the others when the
        # validation MAE is unavailable.
        self.metrics['val_mae'].append(logs.get('val_mae', float('nan')))

        print(f"\nEpoch {epoch+1} - EVS: {evs:.4f} | Val EVS: {val_evs:.4f}")

# Exponential learning-rate schedule: starts at ``initial_learning_rate`` and
# is multiplied by 0.9 once per 1000 steps (staircase=True gives discrete drops).
initial_learning_rate = 1e-3
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.9,
    staircase=True,
)

# The optimizer is compiled with the plain initial rate rather than the
# schedule object; the LRScheduler callback overwrites the rate each epoch.
model.compile(
    loss='mse',
    optimizer=Adam(learning_rate=initial_learning_rate),
    metrics=['mae'],
)
# Add learning rate update callback
class LRScheduler(Callback):
    """Callback that sets the optimizer learning rate from a schedule each epoch.

    Args:
        schedule: Callable mapping an optimizer step count to a learning
            rate, e.g. a ``tf.keras.optimizers.schedules`` object.
        steps_per_epoch: Optional number of optimizer steps (batches) in one
            epoch. When omitted, the ``'steps'`` entry of ``self.params`` is
            used.
    """

    def __init__(self, schedule, steps_per_epoch=None):
        super().__init__()
        self.schedule = schedule
        self.steps_per_epoch = steps_per_epoch
        self.optimizer = None

    def _resolve_steps_per_epoch(self):
        """Return the number of optimizer steps that make up one epoch."""
        if self.steps_per_epoch is not None:
            return self.steps_per_epoch
        reported = (self.params or {}).get('steps')
        if reported is not None:
            return reported
        # Last resort when the step count is unknown: keep the previous
        # behaviour of using the global BATCH_SIZE as the multiplier.
        return BATCH_SIZE

    def on_epoch_begin(self, epoch, logs=None):
        """Evaluate the schedule at the epoch's first step and apply it."""
        if self.optimizer is None:
            self.optimizer = self.model.optimizer
        # The schedule is indexed by optimizer step, so convert the epoch
        # index into the number of steps completed so far.
        step = epoch * self._resolve_steps_per_epoch()
        new_lr = self.schedule(step)
        tf.keras.backend.set_value(self.optimizer.learning_rate, new_lr)

# Callbacks used for training; LRScheduler applies ``lr_schedule`` at the
# start of every epoch.
# NOTE(review): SimplifiedTracker is not defined in this part of the file -
# confirm it exists elsewhere.
# NOTE(review): presumably reduce_lr also adjusts the learning rate; if so,
# LRScheduler overwrites its changes at each epoch start - confirm intent.
callbacks = [
    early_stopping,
    reduce_lr,
    SimplifiedTracker(),
    LRScheduler(lr_schedule),
]
