In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pickle
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score, confusion_matrix
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import load_model
from tensorflow.python.keras.layers.kernelized import RandomFourierFeatures

import time

# Load data

In [None]:
data_dir = "data/classification/dr3"
def load_data(data_dir):
    """Read every CSV in `data_dir` and stack them into a 5d and a 6d frame.

    Files are visited in sorted name order. Any file whose name contains
    "apg" is skipped without being read. A file whose name contains "5d"
    goes to the 5d frame; every other file goes to the 6d frame. The first
    CSV column is used as the index while reading and a fresh 0..n-1 index
    is assigned after concatenation.

    Returns:
        (df_5d, df_6d): the two concatenated DataFrames.
    """
    frames = {"5d": [], "6d": []}
    for name in sorted(os.listdir(data_dir)):
        if "apg" not in name:
            frame = pd.read_csv(os.path.join(data_dir, name), index_col=0)
            bucket = "5d" if "5d" in name else "6d"
            frames[bucket].append(frame)

    df_5d = pd.concat(frames["5d"], axis=0, ignore_index=True)
    df_6d = pd.concat(frames["6d"], axis=0, ignore_index=True)
    return df_5d, df_6d

In [None]:
# Load both feature sets, then attach the 6d labels to the 5d rows by host.
df_5d, df_6d = load_data(data_dir)

# Copy the 6d target columns under suffixed names so they do not collide
# with the 5d columns of the same name in the merge below.
for source_column in ("class", "gm_p_high"):
    df_6d[f"{source_column}_6d"] = df_6d[source_column]

df = df_5d.merge(df_6d[["Host", "class_6d", "gm_p_high_6d"]], on="Host")

In [None]:
# Display the merged frame (5d columns plus class_6d / gm_p_high_6d) as the cell output.
df

# EDA

In [None]:
def label_groups(row):
    """Map a row's (5d class, 6d class) pair to a combination id 0-8.

    Reads row["class"] (5d) and row["class_6d"] (6d). Ids 0-5 are the six
    disagreeing pairs and ids 6-8 are the three agreeing pairs; any pair
    that is not listed yields None.
    """
    combination_ids = {
        (0, 2): 0,
        (2, 0): 1,
        (0, 1): 2,
        (1, 0): 3,
        (2, 1): 4,
        (1, 2): 5,
        (0, 0): 6,
        (1, 1): 7,
        (2, 2): 8,
    }
    return combination_ids.get((row["class"], row["class_6d"]))

In [None]:
def pie_groups(row):
    """Collapse row["class_group"] into one of three coarse buckets.

    Returns 0 for values <= 1, 1 for values in the closed range [2, 5],
    and 2 for everything else.
    """
    group = row["class_group"]
    if group <= 1:
        return 0
    return 1 if 2 <= group <= 5 else 2

In [None]:
# Tag every row with its 5d/6d class-combination id (see label_groups).
df["class_group"] = df.apply(label_groups, axis=1)

# Human-readable names for the combination ids; position i describes id i.
groups = [
    "5d low density - 6d high density",
    "6d low density - 5d high density",
    "5d low density - 6d ambigous",
    "5d ambigous - 6d low density",
    "5d high density - 6d ambigous",
    "5d ambigous - 6d high density",
    "5d low density - 6d low density",
    "5d ambigous - 6d ambigous",
    "5d high density - 6d high density",
]

In [None]:
# Horizontal bar chart: number of hosts in each 5d/6d class combination.
fig, ax = plt.subplots(figsize=[10, 6], facecolor="w")
# Pin the category order to 0..len(groups)-1 so that bar i is always the
# combination described by groups[i]. Without an explicit order the bars
# follow whatever ids happen to be present, and the positional tick labels
# become wrong (or raise) as soon as one combination is missing.
sns.countplot(y=df["class_group"], order=list(range(len(groups))), ax=ax)
ax.set_yticks(range(len(groups)))
ax.set_yticklabels(labels=groups)
ax.set_ylabel("")
ax.set_xlabel("Count")
fig.tight_layout()
fig.savefig("report_images/density_class_prediction_breakdown.png");

In [None]:
# Collapse the class-combination ids into three coarse buckets:
#   0 -> class_group <= 1, 1 -> 2 <= class_group <= 5, 2 -> everything else.
# The buckets are computed with vectorised masks instead of calling the
# pie_groups() function: the name `pie_groups` is rebound to the label list
# at the end of this cell, so calling it here made the cell fail on re-run.
class_group = df["class_group"]
df["pie_group"] = np.select(
    [class_group <= 1, (class_group >= 2) & (class_group <= 5)],
    [0, 1],
    default=2,
)
# Hosts whose 5d and 6d predictions fall in bucket 0.
outs = df[df["pie_group"] == 0]["Host"]
pie_groups = ["Matching classes", "Missmatched as ambigous", "Mismatched density classes"]

In [None]:
pie, ax = plt.subplots(figsize=[9,6], facecolor="w")
# value_counts() orders the slices by frequency, not by bucket id, so each
# slice looks its label up by id instead of relying on list position.
# Bucket 2 holds the agreeing combinations, bucket 1 the disagreements that
# involve an ambiguous class, bucket 0 the opposite density classes.
label_by_group = {2: pie_groups[0], 1: pie_groups[1], 0: pie_groups[2]}
group_counts = df["pie_group"].value_counts()
slice_labels = [label_by_group[group] for group in group_counts.index]
plt.pie(x=group_counts, autopct="%.1f%%", labels=slice_labels, pctdistance=0.5)
plt.title("", fontsize=14)
plt.tight_layout()
plt.savefig("report_images/density_class_prediction_comparison.png", bbox_inches = "tight");

In [None]:
# Pairwise correlations of every column except the host id and the 5d class.
corr = df.drop(["Host", "class"], axis=1).corr()

plt.figure(figsize=(25, 15), facecolor="white")
# Hide the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = ["#f781bf", "#a65628", "#ffff33", "#ff7f00", "#984ea3", "#4daf4a", "#377eb8","#e41a1c"]
sns.heatmap(
    corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    cmap=cmap,
    square=True,
    linewidths=.5,
    annot=False,
)
plt.savefig("report_images/corr_graph.png", bbox_inches="tight", pad_inches=0.05);

# Helper functions for model training

### Encoding data for neural networks

In [None]:
def encode_for_nn(y_train, y_val=None):
    """One-hot encode class labels with a fixed depth of 3.

    Returns the encoded training labels alone when `y_val` is None,
    otherwise the tuple (encoded y_train, encoded y_val).
    """
    n_classes = tf.constant(3)
    encoded_train = tf.one_hot(indices=y_train, depth=n_classes)
    if y_val is not None:
        return encoded_train, tf.one_hot(indices=y_val, depth=n_classes)
    return encoded_train

### Scaling data for convolutional neural network input

In [None]:
def cnn_scale(x_train, x_val, x_test):
    """Standardise the three feature sets and add a trailing axis of size 1.

    The StandardScaler is fitted on `x_train` only and then applied to all
    three sets. Each result is reshaped from (rows, features) to
    (rows, features, 1).

    Returns:
        (x_train, x_val, x_test) as reshaped arrays.
    """
    scaler = StandardScaler().fit(x_train)

    def _scale_and_expand(features):
        scaled = scaler.transform(features)
        return np.reshape(scaled, (scaled.shape[0], scaled.shape[1], 1))

    return tuple(_scale_and_expand(part) for part in (x_train, x_val, x_test))

### Create train/test sets

In [None]:
def split_data(train_df, regression=False, test_size=20000):
    """Split a frame into train/test features and targets by row position.

    The first `test_size` rows become the test set and the remaining rows
    the training set; no shuffling is done, so the split is deterministic.

    Args:
        train_df: frame holding the six feature columns, "Host", and the
            target columns "class_6d" and "gm_p_high_6d".
        regression: when True the target is "gm_p_high_6d", otherwise
            "class_6d".
        test_size: number of leading rows held out for testing. Defaults
            to 20000, the previously hard-coded value. If it is at least
            len(train_df) the training set is empty.

    Returns:
        (x_train, x_test, y_train, y_test, hosts), where `hosts` is the
        "Host" column of the test rows.
    """
    feature_columns = [
        "target_density", "densities_max", "densities_min",
        "densities_mean", "gm_cov_high", "gm_mean_high",
    ]
    target_column = "gm_p_high_6d" if regression else "class_6d"

    # iloc keeps the split positional regardless of the frame's index labels.
    test_part = train_df.iloc[:test_size]
    train_part = train_df.iloc[test_size:]

    x_train = train_part[feature_columns]
    x_test = test_part[feature_columns]
    y_train = train_part[target_column]
    y_test = test_part[target_column]
    hosts = test_part["Host"]

    return x_train, x_test, y_train, y_test, hosts

### Return classification performance metrics of the model

In [None]:
def get_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns):
    """Predict on the test set and build a one-row classification score frame.

    Models whose name contains "Random Forest" are assumed to return class
    labels directly from predict(); for every other model the argmax over
    the last axis of predict() is taken as the class. Precision, recall and
    F1 are macro-averaged. `hosts` is accepted but not used.

    Returns:
        (score_frame, predictions): a single-row DataFrame laid out per
        `columns`, and the predicted class labels.
    """
    tic = time.perf_counter()
    raw_output = model.predict(x_test)
    if "Random Forest" in model_name:
        predictions = raw_output
    else:
        predictions = np.argmax(raw_output, axis=-1)
    pred_time = time.perf_counter() - tic

    macro = {"average": "macro"}
    row = [
        model_name,
        features,
        params,
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, **macro),
        recall_score(y_test, predictions, **macro),
        f1_score(y_test, predictions, **macro),
        train_time,
        pred_time,
    ]
    return pd.DataFrame([row], columns=columns), predictions

### Return regression performance metrics

In [None]:
def get_reg_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns):
    """Predict on the test set and build a one-row regression score frame.

    Records the mean squared error and R2 of model.predict(x_test) against
    `y_test`, together with the training and prediction durations. `hosts`
    is accepted but not used.

    Returns:
        (score_frame, predictions): a single-row DataFrame laid out per
        `columns`, and the raw predictions.
    """
    tic = time.perf_counter()
    predictions = model.predict(x_test)
    pred_time = time.perf_counter() - tic

    row = [
        model_name,
        features,
        params,
        mse(y_test, predictions),
        r2_score(y_test, predictions),
        train_time,
        pred_time,
    ]
    return pd.DataFrame([row], columns=columns), predictions

### Random Forest model setups

In [None]:
# Classification
def rfc(x_train, y_train):
    """Fit a RandomForestClassifier and report its non-default settings.

    Returns:
        (model, params, fit_seconds, "Random Forest"), where `params` holds
        only the hyper-parameters whose value differs from the estimator's
        defaults.
    """
    overrides = dict(n_estimators=100, max_depth=25, class_weight="balanced", n_jobs=-1)

    model = RandomForestClassifier()
    defaults = model.get_params()
    model.set_params(**overrides)
    configured = model.get_params()

    tic = time.perf_counter()
    model.fit(x_train, y_train)
    fit_seconds = time.perf_counter() - tic

    params = {
        name: value
        for name, value in configured.items()
        if name in defaults and value != defaults[name]
    }

    return model, params, fit_seconds, "Random Forest"

In [None]:
# Regression
def rfr(x_train, y_train):
    """Fit a RandomForestRegressor (defaults plus n_jobs=-1) and time the fit.

    Returns:
        (model, params, fit_seconds, "Random Forest"), where `params` holds
        only the hyper-parameters whose value differs from the estimator's
        defaults.
    """
    model = RandomForestRegressor()
    defaults = model.get_params()
    model.set_params(n_jobs=-1)
    configured = model.get_params()

    tic = time.perf_counter()
    model.fit(x_train, y_train)
    fit_seconds = time.perf_counter() - tic

    params = {
        name: value
        for name, value in configured.items()
        if name in defaults and value != defaults[name]
    }

    return model, params, fit_seconds, "Random Forest"

### Convolutional neural network architectures

In [None]:
# Classification
def cnn_1d(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the 1D-CNN classifier.

    Args:
        x_train, x_val: arrays of shape (rows, timesteps, features); the
            last two dimensions define the network input shape.
        y_train, y_val: one-hot targets with three classes.
        params: list of description strings already recorded by the caller
            (e.g. the validation split). It is copied, not mutated; None is
            treated as an empty list.

    Returns:
        (model, history, params, fit_seconds, "1dCNN"), where `params` is
        the caller's entries followed by one string per layer/setting.
    """
    # Previously this was reset to [], which silently dropped whatever the
    # caller had recorded (the "Validation split" entry never reached the
    # results file). Keep the caller's entries and work on a copy.
    params = list(params) if params is not None else []
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    stopper = EarlyStopping(monitor="val_loss", patience=5)
    n_timesteps, n_features, n_outputs = x_train.shape[1], x_train.shape[2], 3
    activation = "relu"
    epochs = 100

    model = Sequential()
    model.add(Input(shape=(n_timesteps, n_features)))
    params.append(f"Input: ({n_timesteps}, {n_features})")
    model.add(Conv1D(filters=256, kernel_size=4, activation=activation))
    params.append(f"Conv1D: filters=256, kernel_size=4, activation={activation}")

    model.add(MaxPooling1D(pool_size=3))
    params.append(f"MaxPooling1D: pool_size=3")
    model.add(Flatten())
    params.append(f"Flatten")
    model.add(Dense(units=128, activation=activation))
    params.append(f"Dense: units=128, activation={activation}")
    model.add(Dense(units=64, activation=activation))
    params.append(f"Dense: units=64, activation={activation}")
    model.add(Dense(units=n_outputs, activation="softmax"))
    params.append(f"Dense: units={n_outputs}, activation=softmax")
    model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["categorical_accuracy"])
    params.append(f"Compile: Optimizer=Adam(learning_rate=0.0001), loss=categorical_crossentropy, metrics=categorical_accuracy")
    params.append(f"Args: Epochs={epochs}, Callbacks=EarlyStopping(monitor=val_loss, patience=5)")

    start = time.perf_counter()
    # `callbacks` is documented as a list of callbacks, so pass one.
    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, callbacks=[stopper], verbose=0)
    end = time.perf_counter()
    t = end-start

    return model, history, params, t, "1dCNN"

In [None]:
# Regression
def cnn_1d_reg(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the 1D-CNN regressor.

    The output layer is a single sigmoid unit trained with MSE loss, so
    predictions lie in (0, 1).

    Args:
        x_train, x_val: arrays of shape (rows, timesteps, features).
        y_train, y_val: regression targets.
        params: list of description strings recorded by the caller; the
            layer/setting descriptions are appended to it in place. None is
            treated as an empty list.

    Returns:
        (model, history, params, fit_seconds, "1dCNN").
    """
    # A stray `tf.keras.callbacks.History()` used to be created and thrown
    # away here; fit() already returns the History object.
    if params is None:
        params = []
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    stopper = EarlyStopping(monitor="val_loss", patience=5)
    n_timesteps, n_features, n_outputs = x_train.shape[1], x_train.shape[2], 1
    activation = "relu"
    epochs = 100

    model = Sequential()
    model.add(Input(shape=(n_timesteps, n_features)))
    params.append(f"Input: ({n_timesteps}, {n_features})")
    model.add(Conv1D(filters=256, kernel_size=3, activation=activation))
    params.append(f"Conv1D: filters=256, kernel_size=3, activation={activation}")
    model.add(Conv1D(filters=32, kernel_size=3, activation=activation))
    params.append(f"Conv1D: filters=32, kernel_size=3, activation={activation}")
    model.add(MaxPooling1D(pool_size=2))
    params.append(f"MaxPooling1D: pool_size=2")
    model.add(Flatten())
    params.append(f"Flatten")
    model.add(Dense(units=64, activation=activation))
    params.append(f"Dense: units=64, activation={activation}")
    model.add(Dense(units=32, activation=activation))
    params.append(f"Dense: units=32, activation={activation}")

    model.add(Dense(units=n_outputs, activation="sigmoid"))
    params.append(f"Dense: units={n_outputs}, activation=sigmoid")
    model.compile(optimizer=opt, loss="mse")
    params.append(f"Compile: Optimizer=Adam(learning_rate=0.0001), loss=mse")
    params.append(f"Args: Epochs={epochs}, Callbacks=EarlyStopping(monitor=val_loss, patience=5)")

    start = time.perf_counter()
    # `callbacks` is documented as a list of callbacks, so pass one.
    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, callbacks=[stopper], verbose=0)
    end = time.perf_counter()
    t = end-start

    return model, history, params, t, "1dCNN"

### Classification neural network with RFF layer architecture

In [None]:
# Classification
def ann(x_train, y_train):
    """Build, compile and fit the dense classifier with a RandomFourierFeatures layer.

    Args:
        x_train: 2-D features; the second dimension defines the input size.
        y_train: one-hot targets with three classes.

    Returns:
        (model, history, params, fit_seconds, "ANN"), where `params` is a
        list of strings describing the layers and training settings. 10% of
        the training data is held out by Keras for validation.
    """
    params = []
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

    stopper = EarlyStopping(monitor="val_loss", patience=5)
    n_features = x_train.shape[1]
    activation = "relu"
    epochs = 100
    hidden_units = (64, 16)

    model = tf.keras.Sequential()
    model.add(Input(shape=(n_features, )))
    params.append(f"Input: ({n_features}, )")
    model.add(RandomFourierFeatures(output_dim=16384, scale=3., kernel_initializer="laplacian"))
    params.append(f"RandomFourierFeatures: output_dim=16384, scale=3., kernel_initializer=laplacian")

    # The logged sizes used to read 128 and 64 while the layers had 64 and
    # 16 units. Deriving the text from the same value keeps the results
    # file truthful about what was actually trained.
    for units in hidden_units:
        model.add(Dense(units=units, activation=activation))
        params.append(f"Dense: units={units}")

    model.add(Dense(units=3, activation="softmax"))
    params.append(f"Dense: units=3, activation=softmax")

    model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["categorical_accuracy"])

    params.append(f"Compile: Optimizer=Adam(learning_rate=0.0001), loss=categorical_crossentropy, metrics=categorical_accuracy")
    params.append(f"Args: Epochs={epochs}, Callbacks=EarlyStopping(monitor=val_loss, patience=5)")

    start = time.perf_counter()
    # `callbacks` is documented as a list of callbacks, so pass one.
    history = model.fit(x_train, y_train, validation_split=0.1, epochs=epochs, callbacks=[stopper], verbose=0)
    end = time.perf_counter()
    t = end-start

    params.append(f"Validation split: 0.1")

    return model, history, params, t, "ANN"

In [None]:
# Regression
def ann_reg(x_train, y_train):
    """Build, compile and fit the dense regressor with a RandomFourierFeatures layer.

    The output layer is a single sigmoid unit trained with MSE loss, so
    predictions lie in (0, 1).

    Args:
        x_train: 2-D features; the second dimension defines the input size.
        y_train: regression targets.

    Returns:
        (model, history, params, fit_seconds, "ANN"), where `params` is a
        list of strings describing the layers and training settings. 10% of
        the training data is held out by Keras for validation.
    """
    # A stray `tf.keras.callbacks.History()` used to be created and thrown
    # away here; fit() already returns the History object.
    params = []
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

    stopper = EarlyStopping(monitor="val_loss", patience=5)
    n_features = x_train.shape[1]
    activation = "relu"
    epochs = 100

    model = tf.keras.Sequential()
    model.add(Input(shape=(n_features, )))
    params.append(f"Input: ({n_features}, )")
    model.add(RandomFourierFeatures(output_dim=16384, scale=2., kernel_initializer="laplacian"))
    params.append(f"RandomFourierFeatures: output_dim=16384, scale=2., kernel_initializer=laplacian")

    model.add(Dense(units=64, activation=activation))
    params.append(f"Dense: units=64")
    model.add(Dense(units=16, activation=activation))
    params.append(f"Dense: units=16")

    model.add(Dense(units=1, activation="sigmoid"))
    params.append(f"Dense: units=1, activation=sigmoid")
    model.compile(optimizer=opt, loss="mse")
    params.append(f"Compile: Optimizer=Adam(learning_rate=0.0001), loss=mse")
    params.append(f"Args: Epochs={epochs}, Callbacks=EarlyStopping(monitor=val_loss, patience=5)")

    start = time.perf_counter()
    # `callbacks` is documented as a list of callbacks, so pass one.
    history = model.fit(x_train, y_train, validation_split=0.1, epochs=epochs, callbacks=[stopper], verbose=0)
    end = time.perf_counter()
    t = end-start

    params.append(f"Validation split: 0.1")

    return model, history, params, t, "ANN"

# Classification models training

### Each training session is repeated `iters` times (set in the next cell) and the mean of all results is saved

In [None]:
# Layout of one result row: three descriptive fields followed by the metrics.
columns = [
    "Classifier", "Features", "Parameters",
    "Accuracy", "Precision", "Recall", "F1-score",
    "Train Duration", "Predict Duration",
]
# The results file is named after the number of rows in the 5d data set.
set_size = len(df_5d)
results_dir = "results/classification"
results_file = f"results_{set_size}.csv"
# Number of repeated training runs per model.
iters = 1
results = pd.DataFrame(columns=columns)

### Random Forest training pipeline

In [None]:
def train_rf(results):
    """Train the Random Forest classifier `iters` times and record the mean scores.

    Reads the module-level `df`, `iters` and `columns`. The run with the
    highest macro F1 provides the confusion matrix image and the pickled
    model; the run number `j` is the count of RF models already saved.

    Args:
        results: DataFrame of earlier summary rows (may be empty).

    Returns:
        `results` with one row added: the first run's descriptive fields
        plus the mean of every metric over all runs.
    """
    score_frames = []
    # -inf so the first run always becomes the best one, even with F1 == 0.
    best_score = float("-inf")
    best_pred = None
    best_model = None

    j = sum("RF_model" in file for file in os.listdir("saved_models"))

    for _ in range(iters):
        x_train, x_test, y_train, y_test, hosts = split_data(df)
        features = x_test.columns.values
        model, params, train_time, model_name = rfc(x_train, y_train)
        model_name = model_name + f" {str(j)}"
        score, predictions = get_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns)
        score_frames.append(score)
        run_f1 = score["F1-score"].iloc[0]
        if run_f1 > best_score:
            best_score = run_f1
            best_pred = predictions
            best_model = model

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    scores = pd.concat(score_frames, ignore_index=True)

    cm = confusion_matrix(y_test, best_pred)

    fig = plt.figure(figsize=(7, 7), facecolor="w")
    ticklabels = ["Underdensity", "Ambigous", "Overdensity"]
    sns.heatmap(cm, yticklabels=ticklabels, xticklabels=ticklabels, cmap="viridis", center=0, annot=True, fmt="g", cbar=False, annot_kws={"size": 15})
    plt.savefig(f"report_images/RF_matrix_{j}.png")
    plt.close(fig)

    # Context manager so the file handle is flushed and closed.
    with open(f"saved_models/RF_model_{j}.sav", "wb") as model_file:
        pickle.dump(best_model, model_file)

    # Average only the metric columns; the first three hold text and objects.
    metric_means = scores.iloc[:, 3:].astype(float).mean()
    summary_frame = pd.concat([scores.iloc[0, :3], metric_means], axis=0).to_frame().T
    if results.empty:
        return summary_frame
    return pd.concat([results, summary_frame], ignore_index=True)

In [None]:
# Run the Random Forest pipeline defined in the previous cell; it returns `results` with one summary row added.
results = train_rf(results)

### 1dCNN training pipeline

In [None]:
def train_1dcnn(results):
    """Train the 1D-CNN classifier `iters` times and record the mean scores.

    Reads the module-level `df`, `iters` and `columns`. Each run holds out
    20% of the training rows for validation. The run with the highest macro
    F1 provides the confusion matrix, the learning curves and the saved
    model; the run number `j` is the count of models already saved.

    Args:
        results: DataFrame of earlier summary rows (may be empty).

    Returns:
        `results` with one row added: the first run's descriptive fields
        plus the mean of every metric over all runs.
    """
    score_frames = []
    # -inf so the first run always becomes the best one, even with F1 == 0.
    best_score = float("-inf")
    best_pred = None
    best_model = None
    best_history = None

    j = sum("1DCNN_model" in file for file in os.listdir("saved_models/1DCNN"))

    for _ in range(iters):
        params = []
        x_train, x_test, y_train, y_test, hosts = split_data(df)
        features = x_test.columns.values
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)
        params.append("Validation split: 0.2")
        y_train, y_val = encode_for_nn(y_train, y_val)
        x_train, x_val, x_test = cnn_scale(x_train, x_val, x_test)
        model, history, params, train_time, model_name = cnn_1d(x_train, y_train, x_val, y_val, params)
        model_name = model_name + f" {str(j)}"
        score, predictions = get_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns)
        score_frames.append(score)
        run_f1 = score["F1-score"].iloc[0]
        if run_f1 > best_score:
            best_score = run_f1
            best_pred = predictions
            best_model = model
            best_history = history

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    scores = pd.concat(score_frames, ignore_index=True)

    cm = confusion_matrix(y_test, best_pred)
    fig = plt.figure(figsize=(7, 5), facecolor="w")
    ticklabels = ["Underdensity", "Ambigous", "Overdensity"]
    sns.heatmap(cm, yticklabels=ticklabels, xticklabels=ticklabels, cmap="viridis", center=0, annot=True, fmt="g", cbar=False, annot_kws={"size": 15})
    plt.savefig(f"report_images/1DCNN/1DCNN_matrix_{j}.png")
    plt.close(fig)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    plt.plot(best_history.history["categorical_accuracy"])
    plt.plot(best_history.history["val_categorical_accuracy"])
    plt.title("model accuracy")
    plt.ylabel("accuracy")
    plt.xlabel("epoch")
    # Fixed axis window chosen for the report figures.
    plt.ylim(0.89, 0.935)
    plt.legend(["training", "validation"], loc="upper left")
    plt.savefig(f"report_images/1DCNN/1DCNN_accuracy_{j}.png")
    plt.close(fig)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    plt.plot(best_history.history["loss"])
    plt.plot(best_history.history["val_loss"])
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    # Fixed axis window chosen for the report figures.
    plt.ylim(0.16, 0.26)
    plt.legend(["training", "validation"], loc="upper left")
    plt.savefig(f"report_images/1DCNN/1DCNN_loss_{j}.png")
    plt.close(fig)

    # Save the best run, matching the plots above (previously the model of
    # the last iteration was saved regardless of its score).
    best_model.save(f"saved_models/1DCNN/1DCNN_model_{j}")

    # Average only the metric columns; the first three hold text and objects.
    metric_means = scores.iloc[:, 3:].astype(float).mean()
    summary_frame = pd.concat([scores.iloc[0, :3], metric_means], axis=0).to_frame().T
    if results.empty:
        return summary_frame
    return pd.concat([results, summary_frame], ignore_index=True)

In [None]:
# Run the 1D-CNN pipeline defined in the previous cell; it returns `results` with one summary row added.
results = train_1dcnn(results)

### ANN training pipeline

In [None]:
def train_ann(results):
    """Train the ANN classifier `iters` times and record the mean scores.

    Reads the module-level `df`, `iters` and `columns`. The run with the
    highest macro F1 provides the confusion matrix, the learning curves and
    the saved model; the run number `j` is the count of models already
    saved.

    Args:
        results: DataFrame of earlier summary rows (may be empty).

    Returns:
        `results` with one row added: the first run's descriptive fields
        plus the mean of every metric over all runs.
    """
    score_frames = []
    # -inf so the first run always becomes the best one, even with F1 == 0.
    best_score = float("-inf")
    best_pred = None
    best_model = None
    best_history = None

    j = sum("ANN_model" in file for file in os.listdir("saved_models/ANN"))

    for _ in range(iters):
        x_train, x_test, y_train, y_test, hosts = split_data(df)
        features = x_test.columns.values
        y_train = encode_for_nn(y_train)
        model, history, params, train_time, model_name = ann(x_train, y_train)
        model_name = model_name + f" {str(j)}"
        score, predictions = get_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns)
        score_frames.append(score)
        run_f1 = score["F1-score"].iloc[0]
        if run_f1 > best_score:
            best_score = run_f1
            best_pred = predictions
            best_model = model
            best_history = history

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    scores = pd.concat(score_frames, ignore_index=True)

    cm = confusion_matrix(y_test, best_pred)
    fig = plt.figure(figsize=(7, 5), facecolor="w")
    ticklabels = ["Underdensity", "Ambigous", "Overdensity"]
    sns.heatmap(cm, yticklabels=ticklabels, xticklabels=ticklabels, cmap="viridis", center=0, annot=True, fmt="g", cbar=False, annot_kws={"size": 15})
    plt.savefig(f"report_images/ANN/ANN_matrix_{j}.png")
    plt.close(fig)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    plt.plot(best_history.history["categorical_accuracy"])
    plt.plot(best_history.history["val_categorical_accuracy"])
    plt.title("model accuracy")
    plt.ylabel("accuracy")
    plt.xlabel("epoch")
    # Fixed axis window chosen for the report figures.
    plt.ylim(0.89, 0.935)
    plt.legend(["training", "validation"], loc="upper left")
    plt.savefig(f"report_images/ANN/ANN_accuracy_{j}.png")
    plt.close(fig)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    plt.plot(best_history.history["loss"])
    plt.plot(best_history.history["val_loss"])
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    # Fixed axis window chosen for the report figures.
    plt.ylim(0.16, 0.26)
    plt.legend(["training", "validation"], loc="upper left")
    plt.savefig(f"report_images/ANN/ANN_loss_{j}.png")
    plt.close(fig)

    # Save the best run, matching the plots above (previously the model of
    # the last iteration was saved regardless of its score).
    best_model.save(f"saved_models/ANN/ANN_model_{j}")

    # Average only the metric columns; the first three hold text and objects.
    metric_means = scores.iloc[:, 3:].astype(float).mean()
    summary_frame = pd.concat([scores.iloc[0, :3], metric_means], axis=0).to_frame().T
    if results.empty:
        return summary_frame
    return pd.concat([results, summary_frame], ignore_index=True)

In [None]:
# Run the ANN pipeline defined in the previous cell; it returns `results` with one summary row added.
results = train_ann(results)

### Save classification results

In [None]:
# Append to an existing results file without repeating the header;
# otherwise create the file with a header row.
results_path = Path(os.path.join(results_dir, results_file))
results_exist = results_path.is_file()
results.to_csv(
    results_path,
    mode="a" if results_exist else "w",
    header=not results_exist,
    index=False,
)

# Regression models training

In [None]:
# Layout of one result row: three descriptive fields followed by the metrics.
# Note: this rebinds `columns`, `results_dir`, `results_file`, `iters` and
# `results` from the classification section.
columns = [
    "Regressor", "Features", "Parameters",
    "MSE", "R2 score",
    "Train Duration", "Predict Duration",
]
# The results file is named after the number of rows in the 5d data set.
set_size = len(df_5d)
results_dir = "results/regression"
results_file = f"results_{set_size}.csv"
# Number of repeated training runs per model.
iters = 1
results = pd.DataFrame(columns=columns)

### Random Forest training pipeline

In [None]:
def train_rf(results):
    """Train the Random Forest regressor `iters` times and record the mean scores.

    Reads the module-level `df`, `iters` and `columns`. The run with the
    highest R2 provides the scatter plot and the pickled model; the run
    number `j` is the count of RF models already saved. This definition
    replaces the classification `train_rf` defined earlier in the notebook.

    Args:
        results: DataFrame of earlier summary rows (may be empty).

    Returns:
        `results` with one row added: the first run's descriptive fields
        plus the mean of every metric over all runs.
    """
    score_frames = []
    # R2 can be negative, so start from -inf; starting from 0 left
    # best_model as None whenever every run scored <= 0.
    best_score = float("-inf")
    best_pred = None
    best_model = None

    j = sum("RF_model" in file for file in os.listdir("saved_models/RFR"))

    for _ in range(iters):
        x_train, x_test, y_train, y_test, hosts = split_data(df, regression=True)
        features = x_test.columns.values
        model, params, train_time, model_name = rfr(x_train, y_train)
        model_name = model_name + f" {str(j)}"
        score, predictions = get_reg_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns)
        score_frames.append(score)
        run_r2 = score["R2 score"].iloc[0]
        if run_r2 > best_score:
            best_score = run_r2
            best_pred = predictions
            best_model = model

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    scores = pd.concat(score_frames, ignore_index=True)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    # Plot the best run (the one that is saved), with truth on x and
    # prediction on y; the axis labels used to be swapped.
    plt.scatter(y_test, best_pred)
    plt.xlabel("Ground truth")
    plt.ylabel("Prediction")
    plt.savefig(f"report_images/RFR/RF_loss_{j}.png")
    plt.close(fig)

    # Context manager so the file handle is flushed and closed.
    with open(f"saved_models/RFR/RF_model_{j}.sav", "wb") as model_file:
        pickle.dump(best_model, model_file)

    # Average only the metric columns; the first three hold text and objects.
    metric_means = scores.iloc[:, 3:].astype(float).mean()
    summary_frame = pd.concat([scores.iloc[0, :3], metric_means], axis=0).to_frame().T
    if results.empty:
        return summary_frame
    return pd.concat([results, summary_frame], ignore_index=True)

In [None]:
# Run the Random Forest regression pipeline defined in the previous cell; it returns `results` with one summary row added.
results = train_rf(results)

In [None]:
def train_1dcnn(results):
    """Train the 1D-CNN regressor `iters` times and record the mean scores.

    Reads the module-level `df`, `iters` and `columns`. Each run holds out
    20% of the training rows for validation. The run with the highest R2
    provides the scatter plot, the loss curve and the saved model; the run
    number `j` is the count of models already saved. This definition
    replaces the classification `train_1dcnn` defined earlier in the
    notebook.

    Args:
        results: DataFrame of earlier summary rows (may be empty).

    Returns:
        `results` with one row added: the first run's descriptive fields
        plus the mean of every metric over all runs.
    """
    score_frames = []
    # R2 can be negative, so start from -inf; starting from 0 left
    # best_history as None (and crashed below) when every run scored <= 0.
    best_score = float("-inf")
    best_pred = None
    best_model = None
    best_history = None

    j = sum("1DCNN_model" in file for file in os.listdir("saved_models/1DCNN_reg"))

    for _ in range(iters):
        params = []
        x_train, x_test, y_train, y_test, hosts = split_data(df, regression=True)
        features = x_test.columns.values
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)
        params.append("Validation split: 0.2")
        x_train, x_val, x_test = cnn_scale(x_train, x_val, x_test)
        model, history, params, train_time, model_name = cnn_1d_reg(x_train, y_train, x_val, y_val, params)
        model_name = model_name + f" {str(j)}"
        score, predictions = get_reg_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns)
        score_frames.append(score)
        run_r2 = score["R2 score"].iloc[0]
        if run_r2 > best_score:
            best_score = run_r2
            best_pred = predictions
            best_model = model
            best_history = history

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    scores = pd.concat(score_frames, ignore_index=True)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    # Plot the best run (the one that is saved), with truth on x and
    # prediction on y; the axis labels used to be swapped.
    plt.scatter(y_test, best_pred)
    plt.xlabel("Ground truth")
    plt.ylabel("Prediction")
    plt.savefig(f"report_images/1DCNN_reg/1DCNN_preds_{j}.png")
    plt.close(fig)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    plt.plot(best_history.history["loss"])
    plt.plot(best_history.history["val_loss"])
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(["training", "validation"], loc="upper left")
    plt.savefig(f"report_images/1DCNN_reg/1DCNN_loss_{j}.png")
    plt.close(fig)

    # Save the best run, matching the plots above (previously the model of
    # the last iteration was saved regardless of its score).
    best_model.save(f"saved_models/1DCNN_reg/1DCNN_model_{j}")

    # Average only the metric columns; the first three hold text and objects.
    metric_means = scores.iloc[:, 3:].astype(float).mean()
    summary_frame = pd.concat([scores.iloc[0, :3], metric_means], axis=0).to_frame().T
    if results.empty:
        return summary_frame
    return pd.concat([results, summary_frame], ignore_index=True)

In [None]:
# Run the 1D-CNN regression pipeline defined in the previous cell; it returns `results` with one summary row added.
results = train_1dcnn(results)

In [None]:
def train_ann(results):
    """Train the ANN regressor `iters` times and record the mean scores.

    Reads the module-level `df`, `iters` and `columns`. The run with the
    highest R2 provides the scatter plot, the loss curve and the saved
    model; the run number `j` is the count of models already saved. This
    definition replaces the classification `train_ann` defined earlier in
    the notebook.

    Args:
        results: DataFrame of earlier summary rows (may be empty).

    Returns:
        `results` with one row added: the first run's descriptive fields
        plus the mean of every metric over all runs.
    """
    score_frames = []
    # R2 can be negative, so start from -inf; starting from 0 left
    # best_history as None (and crashed below) when every run scored <= 0.
    best_score = float("-inf")
    best_pred = None
    best_model = None
    best_history = None

    j = sum("ANN_model" in file for file in os.listdir("saved_models/ANN_reg"))

    for _ in range(iters):
        x_train, x_test, y_train, y_test, hosts = split_data(df, regression=True)
        features = x_test.columns.values
        model, history, params, train_time, model_name = ann_reg(x_train, y_train)
        model_name = model_name + f" {str(j)}"
        score, predictions = get_reg_scores(model, model_name, features, params, x_test, y_test, hosts, train_time, columns)
        score_frames.append(score)
        run_r2 = score["R2 score"].iloc[0]
        if run_r2 > best_score:
            best_score = run_r2
            best_pred = predictions
            best_model = model
            best_history = history

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    scores = pd.concat(score_frames, ignore_index=True)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    # Plot the best run (the one that is saved), with truth on x and
    # prediction on y; the axis labels used to be swapped.
    plt.scatter(y_test, best_pred)
    plt.xlabel("Ground truth")
    plt.ylabel("Prediction")
    plt.savefig(f"report_images/ANN_reg/ANN_preds_{j}.png")
    plt.close(fig)

    fig = plt.figure(figsize=(7, 5), facecolor="w")
    plt.plot(best_history.history["loss"])
    plt.plot(best_history.history["val_loss"])
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(["training", "validation"], loc="upper left")
    plt.savefig(f"report_images/ANN_reg/ANN_loss_{j}.png")
    plt.close(fig)

    # Save the best run, matching the plots above (previously the model of
    # the last iteration was saved regardless of its score).
    best_model.save(f"saved_models/ANN_reg/ANN_model_{j}")

    # Average only the metric columns; the first three hold text and objects.
    metric_means = scores.iloc[:, 3:].astype(float).mean()
    summary_frame = pd.concat([scores.iloc[0, :3], metric_means], axis=0).to_frame().T
    if results.empty:
        return summary_frame
    return pd.concat([results, summary_frame], ignore_index=True)

In [None]:
# Run the ANN regression pipeline defined in the previous cell; it returns `results` with one summary row added.
results = train_ann(results)

### Save regression results

In [None]:
# Append to an existing results file without repeating the header;
# otherwise create the file with a header row.
results_path = Path(os.path.join(results_dir, results_file))
results_exist = results_path.is_file()
results.to_csv(
    results_path,
    mode="a" if results_exist else "w",
    header=not results_exist,
    index=False,
)