Setup Environment

In [None]:
# Importing the libraries
import os
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

# Import helper modules
sys.path.append("../src")
from utils import helpers
from optimization import GA_runner
# Paths
# Directory (relative to the current working directory) where this notebook
# writes its trained models and the pickled scaler.
MODEL_DIR = "../models/single_lstm"
# exist_ok=True lets this cell be re-run without failing once the directory exists.
os.makedirs(MODEL_DIR, exist_ok=True)

Load Dataset

In [None]:
# Load the build-history dataset through the project helper.
dataset = helpers.get_dataset("apache_jackrabbit-oak.csv")
print("Dataset info:")
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
# NOTE(review): assumes `dataset` is a pandas DataFrame - confirm against helpers.get_dataset.
dataset.info()
print(dataset.head())

Split Train/Test

In [None]:
# Ask the helper for the (train folds, test folds) pair and report how many
# folds of each kind were produced.
folds = helpers.online_validation_folds(dataset)
train_sets, test_sets = folds
print(f"Number of train folds: {len(train_sets)}, test folds: {len(test_sets)}")

Define Preprocessing

In [None]:
def train_preprocess(dataset_train, time_step):
    """Scale the training data and turn it into LSTM windows with labels.

    Every non-object column except 'gh_build_started_at' and
    'gh_project_name' is used as a feature (the 'build_failed' column is not
    excluded, so when it is non-object it is also fed in as a past-outcome
    feature). Window ``i`` holds rows ``i .. i + time_step - 1`` and is
    labelled with ``build_failed`` of row ``i + time_step``.

    Oversampling is applied to the *windows*, not to individual rows: SMOTE
    appends synthetic samples after the real ones, so resampling rows before
    windowing would produce windows made of unrelated synthetic rows and
    break the temporal order the LSTM relies on.

    Args:
        dataset_train: DataFrame in chronological order containing the
            feature columns and a 'build_failed' column.
        time_step: Number of consecutive rows per input window.

    Returns:
        Tuple ``(X_train, y_train, scaler)`` where ``X_train`` has shape
        ``(n_samples, time_step, n_features)``, ``y_train`` has shape
        ``(n_samples,)`` and ``scaler`` is the fitted ``MinMaxScaler``.

    Raises:
        ValueError: If there are not more than ``time_step`` rows, i.e. not
            even one (window, label) pair can be built.
    """
    excluded = ('gh_build_started_at', 'gh_project_name')
    feature_cols = [
        col for col in dataset_train.columns
        if col not in excluded and dataset_train[col].dtype != 'O'
    ]
    training_set = dataset_train[feature_cols].values
    y = dataset_train['build_failed'].values

    if len(training_set) <= time_step:
        raise ValueError(
            f"Need more than time_step={time_step} rows to build a training "
            f"window, got {len(training_set)}."
        )

    scaler = MinMaxScaler()
    training_set = scaler.fit_transform(training_set)
    print(f"Scaled min/max: {training_set.min()}, {training_set.max()}")

    # Build the windows on the real, time-ordered rows first. The trailing
    # window is dropped because no later row exists to serve as its label.
    n_features = training_set.shape[1]
    windows = np.lib.stride_tricks.sliding_window_view(
        training_set, (time_step, n_features)
    )[:-1]
    X_train = np.squeeze(windows, axis=1)
    y_train = y[time_step:]

    print("Class distribution BEFORE SMOTE:")
    print(pd.Series(y_train).value_counts(normalize=True))

    if helpers.CONFIG.get('WITH_SMOTE', True):
        # SMOTE works on 2-D input, so flatten each window to one row,
        # resample, then restore the (time_step, n_features) layout.
        smote = SMOTE(random_state=42)
        flat_windows = X_train.reshape(len(X_train), time_step * n_features)
        flat_windows, y_train = smote.fit_resample(flat_windows, y_train)
        X_train = flat_windows.reshape(-1, time_step, n_features)
        print("Applied SMOTE.")

        # Only report an "after" distribution when resampling actually ran.
        print("Class distribution AFTER SMOTE:")
        print(pd.Series(y_train).value_counts(normalize=True))

    print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
    return X_train, y_train, scaler

def test_preprocess(train_set, test_set, time_step, scaler):
    """Build LSTM input windows and labels for a test fold.

    The last ``time_step`` rows of ``train_set`` are prepended to the test
    rows so that every test row has a full window of history. Window ``i``
    holds the ``time_step`` rows preceding test row ``i`` and is labelled
    with that row's ``build_failed`` value.

    Args:
        train_set: Training DataFrame that precedes ``test_set`` in time;
            it also determines which columns are used as features.
        test_set: Test DataFrame with the same feature columns and a
            'build_failed' column.
        time_step: Number of consecutive rows per input window.
        scaler: Already-fitted scaler exposing ``transform``.

    Returns:
        Tuple ``(X_test, y_test)`` where ``X_test`` has shape
        ``(len(test_set), time_step, n_features)`` and ``y_test`` has shape
        ``(len(test_set),)``.

    Raises:
        ValueError: If ``train_set`` has fewer than ``time_step`` rows, in
            which case the first test rows would have no complete window and
            ``X_test`` would silently end up shorter than ``y_test``.
    """
    if len(train_set) < time_step:
        raise ValueError(
            f"train_set has {len(train_set)} rows but time_step={time_step} "
            "rows of history are required before the first test row."
        )

    excluded = ('gh_build_started_at', 'gh_project_name')
    feature_cols = [
        col for col in train_set.columns
        if col not in excluded and train_set[col].dtype != 'O'
    ]

    # Pass plain arrays: train_preprocess fits the scaler on `.values`, and
    # handing scikit-learn a DataFrame here would trigger its
    # "X has feature names" mismatch warning.
    train_scaled = scaler.transform(train_set[feature_cols].values)
    test_scaled = scaler.transform(test_set[feature_cols].values)
    dataset_total = np.vstack((train_scaled, test_scaled))
    inputs = dataset_total[-(len(test_set) + time_step):]

    # The trailing window is dropped: it would predict the row after the
    # last test row, which does not exist.
    windows = np.lib.stride_tricks.sliding_window_view(
        inputs, (time_step, inputs.shape[1])
    )[:-1]
    X_test = np.squeeze(windows, axis=1)
    y_test = test_set['build_failed'].values
    print("X_test shape:", X_test.shape, "y_test shape:", y_test.shape)
    return X_test, y_test

Define Model Trainer

In [None]:
def construct_lstm_model(network_params, train_set):
    """Build, train, score and persist one stacked-LSTM binary classifier.

    Args:
        network_params: Mapping with the keys 'time_step', 'drop_proba',
            'nb_units', 'nb_layers', 'optimizer', 'nb_epochs' and 'nb_batch'.
        train_set: Training DataFrame handed to ``train_preprocess``.

    Returns:
        Dict with the keys 'validation_loss' (lowest validation loss seen
        during training), 'model' (the trained Keras model), 'entry' (the
        training-set prediction result with 'validation_loss' added) and
        'scaler' (the fitted feature scaler).

    Side effects:
        Shows a loss-curve plot, saves the model under ``MODEL_DIR`` and
        overwrites ``MODEL_DIR/scaler.pkl``.
    """
    X_train, y_train, scaler = train_preprocess(train_set, network_params['time_step'])
    drop = network_params['drop_proba']
    nb_units = network_params['nb_units']
    nb_layers = network_params['nb_layers']

    # Stack LSTM + Dropout pairs. Every LSTM except the last must return the
    # full sequence so the next LSTM receives 3-D input. At least one LSTM
    # layer is always created, even if nb_layers is below 1.
    layers = [Input(shape=(X_train.shape[1], X_train.shape[2]))]
    for i in range(max(1, nb_layers)):
        layers.append(LSTM(units=nb_units, return_sequences=(i < nb_layers - 1)))
        layers.append(Dropout(drop))
    layers.append(Dense(1, activation='sigmoid'))
    model = Sequential(layers)

    model.compile(optimizer=network_params['optimizer'], loss='binary_crossentropy', metrics=['accuracy'])
    es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Weight classes inversely to their frequency; keys 0/1 match the labels.
    class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
    class_weight_dict = dict(enumerate(class_weights))

    # NOTE(review): Keras takes validation_split from the *last* samples, and
    # SMOTE appends its synthetic samples last, so the validation slice may be
    # dominated by synthetic data - confirm this is acceptable.
    history = model.fit(X_train, y_train, validation_split=0.2, epochs=network_params['nb_epochs'],
                        batch_size=network_params['nb_batch'], callbacks=[es], class_weight=class_weight_dict)

    # Plot loss curve
    plt.plot(history.history['loss'], label='train_loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.title("Loss over epochs")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.show()

    validation_loss = np.min(history.history['val_loss'])
    # `Utils` is never imported in this notebook, so the original call raised
    # NameError after training finished.
    # NOTE(review): assumes predict_lstm lives in the imported `helpers`
    # module - confirm against ../src/utils/helpers.py.
    entry = helpers.predict_lstm(model, X_train, y_train)
    entry['validation_loss'] = validation_loss

    # NOTE(review): the file name only encodes units and layers, so
    # candidates that differ in time_step or dropout overwrite each other.
    model_path = os.path.join(MODEL_DIR, f"lstm_{nb_units}_{nb_layers}.keras")
    model.save(model_path)
    with open(os.path.join(MODEL_DIR, 'scaler.pkl'), 'wb') as f:
        pickle.dump(scaler, f)

    return {'validation_loss': validation_loss, 'model': model, 'entry': entry, 'scaler': scaler}

Run GA Tuning

In [None]:
# Reset the outputs so a failed search cannot leave stale values from an
# earlier run of this cell.
params = model = result = None
start = timer()
# Search space explored by the genetic algorithm: each key maps to the list
# of candidate values for that hyper-parameter.
all_params = {
    'drop_proba': list(np.linspace(0.01, 0.2, 5)),
    'nb_units': [32, 64],
    'nb_epochs': [5],
    'nb_batch': [16],
    'nb_layers': [1, 2],
    'optimizer': ['adam'],
    'time_step': [*range(5, 15)],
}
# Tune on the first training fold only.
params, model, result = GA_runner.generate(all_params, construct_lstm_model, train_sets[0])
end = timer()

print(f"Training complete in {end - start:.2f}s")
print("Best parameters:", params)
print("Train Results:", result)

Evaluate on Test Set

In [None]:
# Window the first test fold with the best time_step and the scaler that was
# fitted during training, then score the tuned model on it.
X_test, y_test = test_preprocess(train_sets[0], test_sets[0], params['time_step'], result['scaler'])
# `Utils` is never imported in this notebook, so the original call raised NameError.
# NOTE(review): assumes predict_lstm lives in the imported `helpers` module - confirm.
test_result = helpers.predict_lstm(result['model'], X_test, y_test)
print("Test Results:", test_result)