Setup Environment

In [None]:
# Importing the libraries
import os
import sys
import numpy as np
import pandas as pd
from timeit import default_timer as timer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Input
from keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import matplotlib.pyplot as plt
from hyperopt import hp, fmin, tpe, rand, Trials, STATUS_OK
import optunity
import ConfigSpace as CS
from hpbandster.core.worker import Worker
from hpbandster.optimizers import BOHB

# Import helper modules
project_root = os.path.abspath(os.path.join(os.getcwd(), "../src"))
sys.path.append(project_root)
from helpers import Utils
from optimization.GA_runner import GARunner as GARunner

# Paths (relative to the current working directory)
MODEL_DIR = "../models/stacked_lstm"
# Make sure the output directory exists; models are saved under it later on.
os.makedirs(MODEL_DIR, exist_ok=True)
DATASET_DIR = "../data/processed"

# Experiment settings
#   MAX_EVAL: evaluation count handed to the hyperopt (max_evals) and
#             optunity (num_evals) tuners.
#   NBR_REP:  number of repeated runs per fold / per cross-project run.
# NOTE(review): the "bohb" tuner branch also looks up 'NBR_SOL' and 'NBR_GEN'
# on this dict, but neither key is defined here - add them if BOHB is used.
CONFIG = {
    'MAX_EVAL': 10,
    'NBR_REP': 1
}

# Columns selected (in this order) when result tables are built for plotting.
COLUMNS_RES = ["proj", "algo", "iter", "AUC", "accuracy", "F1", "exp"]
# Value written to the "algo" field of every result entry.
MODEL_NAME = "lstm"

Define preprocessing functions

In [None]:
def train_preprocess(dataset_train, time_step):
    """Turn a training DataFrame into sliding-window LSTM inputs and labels.

    Feature columns are all columns of ``dataset_train`` whose dtype is
    float64/float32/int64/int32, excluding ``build_failed``,
    ``gh_build_started_at`` and ``gh_project_name``. The label column is
    ``build_failed``.

    Args:
        dataset_train: DataFrame with the feature columns and ``build_failed``.
        time_step: Requested window length (rows per sequence). It is reduced
            to ``max(1, n_rows - 1)`` when the data is too short to produce at
            least one window that still has a following row to use as label.

    Returns:
        ``(X_train, y_train)``. ``X_train`` has shape
        ``(n_rows - time_step, time_step, n_features)`` where ``n_rows`` is the
        row count after the optional SMOTE step; ``y_train[i]`` is the label of
        the row that immediately follows window ``i``.

    Raises:
        RuntimeError: If the sliding windows cannot be built (the original
            error is chained as ``__cause__``).
    """
    excluded_cols = ['build_failed', 'gh_build_started_at', 'gh_project_name']
    feature_cols = [col for col in dataset_train.columns
                    if col not in excluded_cols
                    and dataset_train[col].dtype in [np.float64, np.float32, np.int64, np.int32]]
    training_set = dataset_train[feature_cols].values
    y = dataset_train['build_failed'].values

    # Every window needs one *following* row as its label, so exactly
    # `time_step` rows would already yield zero samples: shrink in that case too.
    if len(training_set) <= time_step:
        adjusted_step = max(1, len(training_set) - 1)
        print(f"Adjusting time_step from {time_step} to {adjusted_step}")
        time_step = adjusted_step

    # SMOTE raises on single-class targets, so only resample when both
    # classes are present; otherwise fall back to the raw data.
    # NOTE(review): SMOTE resamples individual rows before windowing, so the
    # resampled array is no longer a purely chronological series - confirm
    # this is intended.
    y_smote = y
    if Utils.CONFIG['WITH_SMOTE']:
        if len(np.unique(y)) > 1:
            print("\nClass Distribution BEFORE SMOTE:")
            unique, counts = np.unique(y, return_counts=True)
            print(dict(zip(unique, counts / len(y))))

            print("\nApplying SMOTE...")
            smote = SMOTE(random_state=42)
            training_set, y_smote = smote.fit_resample(training_set, y)

            print("Class Distribution AFTER SMOTE:")
            unique, counts = np.unique(y_smote, return_counts=True)
            print(dict(zip(unique, counts / len(y_smote))))
        else:
            print("\nSkipping SMOTE: training labels contain fewer than two classes.")

    try:
        # The window spans all feature columns, which leaves a singleton axis
        # at position 1. The last window is dropped because no row follows it.
        X_train = np.lib.stride_tricks.sliding_window_view(
            training_set, (time_step, training_set.shape[1])
        )[:-1]
        X_train = np.squeeze(X_train, axis=1)
        y_train = y_smote[time_step:]
    except Exception as e:
        raise RuntimeError(f"Error during sliding window creation: {e}") from e
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    return X_train, y_train

def test_preprocess(dataset_train, dataset_test, time_step):
    """Build sliding-window test sequences, seeded with the tail of the training data.

    The feature columns are chosen from ``dataset_train`` (numeric dtypes,
    minus the label/metadata columns) and the same columns are read from
    ``dataset_test``. The last ``time_step`` training rows are placed in front
    of the test rows so that every test row has a full window preceding it.

    Args:
        dataset_train: DataFrame the model was trained on.
        dataset_test: DataFrame to evaluate on; must contain ``build_failed``
            and the selected feature columns.
        time_step: Window length (rows per sequence).

    Returns:
        ``(X_test, y_test)`` with ``X_test`` of shape
        ``(len(dataset_test), time_step, n_features)`` and ``y_test`` the
        ``build_failed`` values of ``dataset_test``.

    Raises:
        ValueError: If train and test together hold fewer than
            ``time_step + len(dataset_test)`` rows.
    """
    non_feature_cols = ('build_failed', 'gh_build_started_at', 'gh_project_name')
    numeric_dtypes = (np.float64, np.float32, np.int64, np.int32)

    feature_cols = []
    for name in dataset_train.columns:
        if name in non_feature_cols:
            continue
        if dataset_train[name].dtype in numeric_dtypes:
            feature_cols.append(name)

    history_rows = dataset_train[feature_cols].values
    target_rows = dataset_test[feature_cols].values
    stacked = np.vstack((history_rows, target_rows))
    y_test = dataset_test['build_failed'].values

    rows_needed = len(dataset_test) + time_step
    if len(stacked) < rows_needed:
        raise ValueError("Not enough data for test sequences")

    # Keep the test rows plus the `time_step` rows right before them.
    inputs = stacked[-rows_needed:]
    window_shape = (time_step, inputs.shape[1])
    windows = np.lib.stride_tricks.sliding_window_view(inputs, window_shape)
    # The final window has no row after it to predict, so it is discarded;
    # axis 1 is the singleton left over from windowing across all columns.
    X_test = np.squeeze(windows[:-1], axis=1)
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    return X_test, y_test

Define the model-construction and hyperparameter-tuning functions

In [None]:
def construct_lstm_model(network_params, train_set):
    """Build, train and score a stacked LSTM for one hyper-parameter setting.

    Args:
        network_params: Mapping with the keys ``time_step``, ``drop_proba``,
            ``nb_units``, ``nb_layers``, ``optimizer``, ``nb_epochs`` and
            ``nb_batch``.
        train_set: Training DataFrame, passed to ``train_preprocess``.

    Returns:
        Dict with ``validation_loss`` (lowest ``val_loss`` seen during
        training), ``model`` (the fitted Keras model) and ``entry`` (the result
        of ``Utils.predict_lstm`` on the training windows, extended with
        ``validation_loss``). If fitting fails, ``validation_loss`` is ``inf``,
        ``model`` is ``None`` and ``entry`` holds ``F1 = 0`` only.
    """
    start_time = timer()
    X_train, y_train = train_preprocess(train_set, network_params["time_step"])
    drop = network_params["drop_proba"]
    nb_units = network_params["nb_units"]
    nb_layers = network_params["nb_layers"]

    model = Sequential()
    # Declare the input explicitly instead of the deprecated `input_shape=`
    # layer argument.
    model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))

    # Stacked LSTM layers: all but the last return full sequences so the next
    # LSTM receives 3-D input. At least one layer is always created.
    for layer_idx in range(max(1, nb_layers)):
        is_last = layer_idx >= nb_layers - 1
        model.add(LSTM(units=nb_units, return_sequences=not is_last))
        model.add(Dropout(drop))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=network_params["optimizer"], loss='binary_crossentropy', metrics=["accuracy"])
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

    # Balanced class weights over the classes actually present in y_train.
    # compute_class_weight raises when a requested class is absent, so a
    # single-class (or empty) training window trains unweighted instead.
    present_classes = np.unique(y_train)
    if present_classes.size > 1:
        weights = compute_class_weight('balanced', classes=present_classes, y=y_train)
        class_weight_dict = {int(label): float(weight)
                             for label, weight in zip(present_classes, weights)}
    else:
        class_weight_dict = None

    try:
        history = model.fit(X_train, y_train, epochs=network_params["nb_epochs"],
                            batch_size=network_params["nb_batch"], validation_split=0.2,
                            verbose=0, callbacks=[es], class_weight=class_weight_dict)
        validation_loss = np.amin(history.history['val_loss'])
    except Exception as e:
        # Best effort: a failed configuration is reported as the worst
        # possible loss so the tuner can move on.
        print(f"Error during model training: {e}")
        return {"validation_loss": float('inf'), "model": None, "entry": {'F1': 0, 'validation_loss': float('inf')}}

    entry = Utils.predict_lstm(model, X_train, y_train)
    entry['validation_loss'] = validation_loss

    end_time = timer()
    print(f"\nTraining time: {end_time - start_time:.2f} seconds")

    return {'validation_loss': validation_loss, 'model': model, 'entry': entry}

def train_lstm_with_hyperopt(network_params):
    """Hyperopt objective: train one LSTM configuration and report its loss.

    The training set is read from the module-level ``data`` variable.

    Args:
        network_params: Hyper-parameter mapping forwarded to
            ``construct_lstm_model``.

    Returns:
        Dict with ``loss`` (the validation loss) and ``status``
        (``STATUS_OK``).

    Raises:
        ValueError: If the module-level ``data`` variable has not been set.
    """
    module_scope = globals()
    if 'data' not in module_scope:
        raise ValueError("Global 'data' not set. Ensure evaluate_tuner sets it correctly.")

    outcome = construct_lstm_model(network_params, module_scope['data'])
    report = {'loss': outcome['validation_loss'], 'status': STATUS_OK}
    return report

def convert_from_PSO(network_params):
    """Convert the continuous values returned by PSO to the types the model expects.

    PSO searches a continuous box, so every value comes back as a float. The
    integer-valued hyper-parameters are truncated with ``int()`` - the same
    rule ``fn_lstm_pso`` applies while evaluating candidates - and the
    numeric optimizer code is mapped to its name (1 -> 'adam', anything
    else -> 'rmsprop'). ``drop_proba`` and unknown keys are left untouched.

    Args:
        network_params: Mapping of hyper-parameter names to values. It is
            modified in place.

    Returns:
        The same ``network_params`` object, after conversion.
    """
    integer_keys = ('nb_units', 'nb_layers', 'time_step', 'nb_epochs', 'nb_batch')
    for key in integer_keys:
        if key in network_params:
            network_params[key] = int(network_params[key])

    if 'optimizer' in network_params:
        optimizer_code = network_params['optimizer']
        # An already-converted name is kept, so the function is safe to call twice.
        if not isinstance(optimizer_code, str):
            network_params['optimizer'] = 'adam' if int(optimizer_code) == 1 else 'rmsprop'
    return network_params

def fn_lstm_pso(drop_proba=0.01, nb_units=32, nb_epochs=2, nb_batch=4, nb_layers=1, optimizer=1, time_step=30):
    """Objective for the PSO tuner: train one configuration and return its score.

    The integer-valued arguments are truncated with ``int()`` and the numeric
    optimizer code is mapped to a name (1 -> 'adam', otherwise 'rmsprop').
    The training set is read from the module-level ``data`` variable.

    Returns:
        ``1 - validation_loss``, so that a lower loss gives a higher score.

    Raises:
        ValueError: If the module-level ``data`` variable has not been set.
    """
    optimizer_name = 'adam' if int(optimizer) == 1 else 'rmsprop'
    network_params = dict(
        nb_units=int(nb_units),
        nb_layers=int(nb_layers),
        optimizer=optimizer_name,
        time_step=int(time_step),
        nb_epochs=int(nb_epochs),
        nb_batch=int(nb_batch),
        drop_proba=drop_proba,
    )

    module_scope = globals()
    if 'data' not in module_scope:
        raise ValueError("Global 'data' not set.")

    outcome = construct_lstm_model(network_params, module_scope['data'])
    validation_loss = float(outcome["validation_loss"])
    return 1 - validation_loss

class LSTMWorker(Worker):
    """hpbandster worker that scores a configuration by training an LSTM on a fixed training set."""

    def __init__(self, train_set, **kwargs):
        """Store the training set; all other keyword arguments go to ``Worker``."""
        super().__init__(**kwargs)
        self.train_set = train_set

    def compute(self, config, budget, **kwargs):
        """Train the model described by ``config`` and report its validation loss.

        ``budget`` and any extra keyword arguments are accepted but not used.
        """
        outcome = construct_lstm_model(config, self.train_set)
        loss = outcome['validation_loss']
        return {'loss': loss, 'info': {}}

def evaluate_tuner(tuner_option, train_set):
    """Tune LSTM hyper-parameters with the chosen tuner and train the best model.

    Args:
        tuner_option: One of ``"ga"``, ``"tpe"``, ``"pso"``, ``"bohb"``,
            ``"rs"`` (random search) or ``"default"`` (fixed parameters).
        train_set: Training DataFrame. It is also published as the
            module-level ``data`` variable, which the hyperopt and PSO
            objective functions read.

    Returns:
        The training result entry of the best model, extended with ``time``
        (seconds spent in this call), ``params`` (best hyper-parameters) and
        ``model`` (the trained model, or ``None`` if training failed).

    Raises:
        ValueError: If ``tuner_option`` is not one of the supported names.
    """
    global data

    known_tuners = ("ga", "tpe", "pso", "bohb", "rs", "default")
    if tuner_option not in known_tuners:
        raise ValueError(
            f"Unknown tuner_option {tuner_option!r}; expected one of {known_tuners}"
        )

    data = train_set

    # Explicit (discrete) parameter space shared by GA, TPE and random search;
    # PSO derives its continuous bounds from the first/last value of each list.
    all_possible_params = {
        'drop_proba': list(np.linspace(0.01, 0.21, 20)),
        'nb_units': [32, 64],
        'nb_epochs': [4, 5, 6],
        'nb_batch': [4, 8, 16, 32, 64],  # Power of 2
        'nb_layers': [1, 2, 3, 4],
        'optimizer': ['adam', 'rmsprop'],
        'time_step': list(range(30, 61))
    }

    start = timer()

    if tuner_option == "ga":
        ga_runner = GARunner()
        best_params, best_model, entry_train = ga_runner.generate(all_possible_params, construct_lstm_model, data)
    else:
        if tuner_option in ("tpe", "rs"):
            # TPE and random search differ only in the suggestion algorithm.
            suggest = tpe.suggest if tuner_option == "tpe" else rand.suggest
            param_space = {k: hp.choice(k, v) for k, v in all_possible_params.items()}
            trials = Trials()
            best = fmin(train_lstm_with_hyperopt, param_space, algo=suggest,
                        max_evals=CONFIG.get('MAX_EVAL'), trials=trials)
            # For hp.choice, fmin reports the *index* of the chosen option.
            best_params = {k: all_possible_params[k][v] for k, v in best.items()}

        elif tuner_option == "pso":
            params_PSO = {
                key: [all_possible_params[key][0], all_possible_params[key][-1]]
                for key in ('nb_units', 'nb_layers', 'time_step', 'nb_epochs', 'nb_batch', 'drop_proba')
            }
            params_PSO['optimizer'] = [1, 2]  # 1: adam, 2: rmsprop
            best_params, _, _ = optunity.maximize_structured(fn_lstm_pso, params_PSO, num_evals=CONFIG.get('MAX_EVAL'))
            best_params = convert_from_PSO(best_params)

        elif tuner_option == "bohb":
            config_space = CS.ConfigurationSpace()
            config_space.add(CS.UniformIntegerHyperparameter('nb_units', lower=32, upper=64))
            config_space.add(CS.UniformIntegerHyperparameter('nb_layers', lower=1, upper=4))
            config_space.add(CS.CategoricalHyperparameter('optimizer', choices=['adam', 'rmsprop']))
            config_space.add(CS.UniformIntegerHyperparameter('time_step', lower=30, upper=60))
            config_space.add(CS.UniformIntegerHyperparameter('nb_epochs', lower=4, upper=6))
            config_space.add(CS.UniformIntegerHyperparameter('nb_batch', lower=4, upper=64))
            config_space.add(CS.UniformFloatHyperparameter('drop_proba', lower=0.01, upper=0.2))

            # Fall back to MAX_EVAL when the BOHB-specific settings are absent,
            # so that None is never passed as a budget or iteration count.
            # NOTE(review): the fallback value is a guess - set NBR_SOL and
            # NBR_GEN in CONFIG to the intended budget / iteration count.
            max_budget = CONFIG.get('NBR_SOL', CONFIG.get('MAX_EVAL'))
            n_iterations = CONFIG.get('NBR_GEN', CONFIG.get('MAX_EVAL'))

            import hpbandster.core.nameserver as hpns
            name_server = hpns.NameServer(run_id="LSTM", host='127.0.0.1', port=None)
            name_server.start()
            bohb = None
            try:
                worker = LSTMWorker(train_set=data, nameserver='127.0.0.1', run_id="LSTM")
                worker.run(background=True)
                bohb = BOHB(configspace=config_space, run_id="LSTM", nameserver='127.0.0.1',
                            min_budget=1, max_budget=max_budget)
                run_result = bohb.run(n_iterations=n_iterations)
            finally:
                # Always release the optimizer, its workers and the name
                # server, even when the run raises.
                if bohb is not None:
                    bohb.shutdown(shutdown_workers=True)
                name_server.shutdown()
            incumbent = run_result.get_incumbent_id()
            best_params = run_result.get_id2config_mapping()[incumbent]['config']

        else:  # "default"
            best_params = {
                'nb_units': 64, 'nb_layers': 3, 'optimizer': 'adam', 'time_step': 30,
                'nb_epochs': 10, 'nb_batch': 64, 'drop_proba': 0.1
            }

        # Every non-GA tuner only yields parameters: train the final model now.
        res = construct_lstm_model(best_params, data)
        entry_train, best_model = res["entry"], res["model"]

    end = timer()
    entry_train.update({"time": end - start, "params": best_params, "model": best_model})
    return entry_train

LSTM Model Evaluation

In [None]:
def plot_metrics(train_entries, test_entries, title):
    """Print mean train/test metrics per project and fold, and box-plot the test metrics.

    Args:
        train_entries: List of result dicts from training; each must provide
            the keys listed in ``COLUMNS_RES``.
        test_entries: List of result dicts from testing, same keys.
        title: Label used in the printed headers and subplot titles.
    """
    train_df = pd.DataFrame(train_entries)[COLUMNS_RES]
    test_df = pd.DataFrame(test_entries)[COLUMNS_RES]

    # The selection includes the string column "algo"; numeric_only=True keeps
    # the group mean from raising on it under pandas >= 2.0.
    print(f"\n{title} - Train Results:")
    print(train_df.groupby(['proj', 'exp']).mean(numeric_only=True))
    print(f"\n{title} - Test Results:")
    print(test_df.groupby(['proj', 'exp']).mean(numeric_only=True))

    # One box plot per metric, grouped by project.
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, metric in zip(axes, ['AUC', 'accuracy', 'F1']):
        test_df.boxplot(column=metric, by='proj', ax=ax)
        ax.set_title(f"{metric} ({title})")
        ax.set_xlabel("Project")
        ax.set_ylabel(metric)
        ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.show()

def run_online_validation(tuner="ga", dataset_dir=DATASET_DIR):
    """Run online validation on the 10 largest datasets and pick the bellwether.

    For every selected dataset, each (train, test) fold returned by
    ``Utils.online_validation_folds`` is tuned and trained ``CONFIG['NBR_REP']``
    times, evaluated on the fold's test set, and the model with the best test
    F1 per dataset is saved under ``MODEL_DIR``. The bellwether is the dataset
    with the highest mean test F1.

    Args:
        tuner: Tuner name forwarded to ``evaluate_tuner``.
        dataset_dir: Directory whose entries are loaded with
            ``Utils.get_dataset``.

    Returns:
        ``(bellwether_dataset, datasets)`` where ``datasets`` maps file name to
        the loaded dataset for the selected files.

    Raises:
        ValueError: If no dataset could be loaded from ``dataset_dir``.
        RuntimeError: If no fold produced a test result.
    """
    all_train_entries = []
    all_test_entries = []

    print(f"Loading datasets from {dataset_dir}...")
    # Load each file once, remembering its size so the largest can be selected.
    loaded = {}
    dataset_sizes = {}
    for f in os.listdir(dataset_dir):
        try:
            frame = Utils.get_dataset(f, dataset_dir)
            dataset_sizes[f] = len(frame)
            loaded[f] = frame
        except Exception as e:
            print(f"Error loading {f}: {e}")

    # Keep only the 10 largest datasets.
    top_10_files = sorted(dataset_sizes, key=dataset_sizes.get, reverse=True)[:10]
    datasets = {f: loaded[f] for f in top_10_files}
    del loaded  # drop the datasets that were not selected
    for f, frame in datasets.items():
        print(f"Loaded {f} with {len(frame)} samples")

    print(f"Selected top 10 datasets: {list(datasets.keys())}")
    if not datasets:
        raise ValueError(f"No datasets found in {dataset_dir}")

    for file_name, dataset in datasets.items():
        best_f1 = -1
        best_model_path = None
        train_sets, test_sets = Utils.online_validation_folds(dataset)
        for fold_idx, (train_set, test_set) in enumerate(zip(train_sets, test_sets)):
            for iteration in range(1, CONFIG['NBR_REP'] + 1):
                run_label = f"Proj {file_name} | Fold {fold_idx+1} | Iter {iteration}"
                print(f"\n[{run_label}] Training...")
                entry_train = evaluate_tuner(tuner, train_set)

                # Take the model out of the entry so the result list does not
                # keep every trained network alive.
                best_model = entry_train.pop("model")
                best_params = entry_train["params"]
                if best_model is None:
                    # Training failed for this fold; there is nothing to evaluate or save.
                    print(f"[{run_label}] Training failed; skipping this run.")
                    continue

                # Train and test entries use the same project label so the
                # two result tables line up.
                entry_train.update({
                    "iter": iteration, "proj": file_name, "exp": fold_idx + 1, "algo": MODEL_NAME
                })
                all_train_entries.append(entry_train)

                X_test, y_test = test_preprocess(train_set, test_set, best_params["time_step"])
                entry_test = Utils.predict_lstm(best_model, X_test, y_test)
                entry_test.update({
                    "iter": iteration, "proj": file_name, "exp": fold_idx + 1, "algo": MODEL_NAME
                })
                if entry_test["F1"] > best_f1:
                    best_f1 = entry_test["F1"]
                    best_model_path = os.path.join(MODEL_DIR, f"best_stacked_lstm_{file_name}.keras")
                    best_model.save(best_model_path)
                print(f"Test metrics: {entry_test}")
                all_test_entries.append(entry_test)
        print(f"Best model for {file_name} saved at: {best_model_path}, F1: {best_f1}")

    if not all_test_entries:
        raise RuntimeError("Online validation produced no test results; cannot select a bellwether")

    test_df = pd.DataFrame(all_test_entries)
    proj_scores = test_df.groupby('proj')[['F1', 'AUC', 'accuracy']].mean()
    print("\nAverage Test Metrics by Project:")
    print(proj_scores)
    # Test entries are labelled with the file name, so the winning index value
    # is directly a key of `datasets`.
    bellwether = proj_scores['F1'].idxmax()
    print(f"\nSelected Bellwether: {bellwether} (Best F1: {proj_scores.loc[bellwether, 'F1']:.4f})")

    # Plot the results
    plot_metrics(all_train_entries, all_test_entries, "Online Validation")
    return datasets[bellwether], datasets
    
def run_cross_project_validation(bellwether_dataset, all_datasets, tuner="ga"):
    """Train on the bellwether dataset and evaluate on every other dataset.

    The tuning/training step is repeated ``CONFIG['NBR_REP']`` times. After
    each repetition the model is tested on all datasets except the bellwether
    itself, and it is saved for a project whenever it improves on the best
    test F1 seen so far for that project.

    Args:
        bellwether_dataset: Dataset used for training. It is skipped during
            testing by object identity, so it must be the same object that is
            stored in ``all_datasets``.
        all_datasets: Mapping of file name to dataset.
        tuner: Tuner name forwarded to ``evaluate_tuner``.
    """
    all_train_entries = []
    all_test_entries = []
    # Best test F1 per project across all repetitions; it must live outside
    # both loops, otherwise every comparison against it trivially succeeds.
    best_f1_by_project = {}

    for iteration in range(1, CONFIG['NBR_REP'] + 1):
        print(f"[Cross-Project | Iter {iteration}] Training on Bellwether...")
        entry_train = evaluate_tuner(tuner, bellwether_dataset)
        # Take the model out of the entry so the result list does not keep it alive.
        best_model = entry_train.pop("model")
        best_params = entry_train["params"]
        if best_model is None:
            # Training failed; there is nothing to evaluate in this repetition.
            print(f"[Cross-Project | Iter {iteration}] Training failed; skipping this iteration.")
            continue
        entry_train.update({
            "iter": iteration, "proj": "bellwether", "algo": MODEL_NAME, "exp": 1
        })
        all_train_entries.append(entry_train)

        for file_name, test_set in all_datasets.items():
            if test_set is bellwether_dataset:
                continue
            print(f"Testing on {file_name}...")
            X_test, y_test = test_preprocess(bellwether_dataset, test_set, best_params["time_step"])
            entry_test = Utils.predict_lstm(best_model, X_test, y_test)
            entry_test.update({
                "iter": iteration, "proj": file_name, "exp": 1, "algo": MODEL_NAME
            })

            current_f1 = entry_test["F1"]
            if current_f1 > best_f1_by_project.get(file_name, -1):
                best_f1_by_project[file_name] = current_f1
                best_model_path = os.path.join(MODEL_DIR, f"best_stacked_lstm_{file_name}_cross_iter{iteration}.keras")
                best_model.save(best_model_path)
                print(f"Best model for {file_name} saved at: {best_model_path}, F1: {current_f1}")
            print(f"Test metrics: {entry_test}")
            all_test_entries.append(entry_test)

    if not all_test_entries:
        print("No cross-project test results were produced; nothing to plot.")
        return

    plot_metrics(all_train_entries, all_test_entries, "Cross-Project Validation")

Run the online validation and select the bellwether

In [None]:
# Run online validation with the GA tuner on the datasets under DATASET_DIR.
# The returned bellwether dataset and the dict of loaded datasets are reused
# by the cross-project validation cell.
print("Running Online Validation and Selecting Bellwether...")
bellwether_dataset, all_datasets = run_online_validation(tuner="ga", dataset_dir=DATASET_DIR)

Run cross-project validation with the selected bellwether

In [None]:
# Train on the selected bellwether (GA tuner) and test on every other loaded
# dataset; requires bellwether_dataset and all_datasets from the online
# validation cell.
print("\nRunning Cross-Project Validation with Selected Bellwether...")
run_cross_project_validation(bellwether_dataset, all_datasets, tuner="ga")