<a href="https://colab.research.google.com/github/NatePGroves/Code_Examples/blob/main/Final_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate with Kaggle before downloading anything: some of the data
# sources used by this notebook are private, so this cell has to run first.
import kagglehub
# Interactive login; the saved output of this cell shows a credentials widget
# followed by "Kaggle credentials set."
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.


In [None]:
# Download the competition data through kagglehub (run after the login cell).
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries the notebook relies on may be missing here.
# NOTE(review): the cells below read from hard-coded "/kaggle/input/..."
# paths and never use `csc_480_project_1_2024fall_path` -- confirm those
# paths exist in this environment, or build the paths from this variable.

csc_480_project_1_2024fall_path = kagglehub.competition_download('csc-480-project-1-2024fall')

print('Data source import complete.')


# MODEL 1: CNN on HMI images with a timestamp input

In [None]:
## setup

from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers
from tensorflow.keras.callbacks import EarlyStopping

def load_dataset(csv_path, hmi_folder_path, image_shape=(224, 224)):
    """Load a metadata CSV together with the HMI image arrays it references.

    Args:
        csv_path: Path to a CSV file that has an "HMI_img" column holding one
            ``.npy`` file name per row, relative to ``hmi_folder_path``.
        hmi_folder_path: Directory that contains the ``.npy`` image files.
        image_shape: Shape of a single image array. Defaults to (224, 224),
            the size that used to be hard-coded in this function.

    Returns:
        A tuple ``(df, hmi_data)``: the DataFrame read from ``csv_path`` and a
        float64 array of shape ``(len(df), *image_shape)`` whose i-th entry is
        the array loaded for the i-th CSV row.

    Raises:
        ValueError: Raised by NumPy when a loaded array cannot be broadcast
            into ``image_shape``.
    """
    df = pd.read_csv(csv_path)
    hmi_folder_path = Path(hmi_folder_path)
    hmi_data = np.zeros((len(df), *image_shape))
    # Walk the column by position so the loader never depends on index labels.
    for row, file_name in enumerate(df["HMI_img"]):
        hmi_data[row] = np.load(hmi_folder_path / file_name)
    return df, hmi_data

In [None]:
## data

def _scale_to_unit(values):
    """Shift values so the minimum is 0, then divide by the resulting maximum."""
    shifted = values - np.min(values)
    return shifted / np.max(shifted)


# training split: images are divided by 255, target is the "tsi_pert_0" column
csv_path = "/kaggle/input/csc-480-project-1-2024fall/train_df.csv"
hmi_folder_path = "/kaggle/input/csc-480-project-1-2024fall/HMI_data/hmi_data/"
df, hmi_data = load_dataset(csv_path, hmi_folder_path)
x_train = hmi_data / 255.
y_train = df[["tsi_pert_0"]].values
timestamps = _scale_to_unit(df["timestamp"].values)

# validation split: read from test_df.csv and scaled with its own min/max
csv_path2 = "/kaggle/input/csc-480-project-1-2024fall/test_df.csv"
df2, hmi_data2 = load_dataset(csv_path2, hmi_folder_path)
x_valid = hmi_data2 / 255.
y_valid = df2[["tsi_pert_0"]].values
timestamps_valid = _scale_to_unit(df2["timestamp"].values)

In [None]:
# cnn
# Image branch: convolutional feature extractor followed by a dense head.
image_input = keras.Input(shape=(224, 224, 1))  # Image input

x = image_input
# Two conv -> max-pool stages, with 16 and then 32 filters.
for n_filters in (16, 32):
    x = layers.Conv2D(n_filters, (3, 3), activation="relu", padding="same")(x)
    x = layers.MaxPooling2D((2, 2))(x)

x = layers.Flatten()(x)
x = layers.Dropout(0.5)(x)

# Dense head narrowing 300 -> 100 -> 50 units.
for n_units in (300, 100, 50):
    x = layers.Dense(n_units, activation="relu")(x)

In [None]:
# timestamp input
# Second branch: the single timestamp value passes through one 50-unit ReLU layer.
timestamp_input = keras.Input(shape=(1,))  # Timestamp input
timestamp_dense = layers.Dense(50, activation="relu")
y = timestamp_dense(timestamp_input)

# Join the image features (x) and the timestamp features (y).
merged = layers.Concatenate()([x, y])

In [None]:
# final model
# One linear unit on top of the merged features gives the regression output.
regression_head = layers.Dense(1, activation="linear")
output = regression_head(merged)

model = keras.Model(
    inputs=[image_input, timestamp_input],
    outputs=output,
)

# MAE is used both as the training loss and as the reported metric.
model.compile(loss="mae", optimizer="adam", metrics=["mae"])

In [None]:
# early stopping

# Watch validation MAE (lower is better) with a patience of 5 epochs, and
# restore the best weights when training stops.
early_stopping_options = dict(
    monitor='val_mae',
    mode='min',
    patience=5,
    restore_best_weights=True,
)
early_stopping = EarlyStopping(**early_stopping_options)

In [None]:
# training

# Each input list is ordered [images, timestamps] to match the model's inputs.
train_inputs = [x_train, timestamps]
valid_inputs = [x_valid, timestamps_valid]

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(valid_inputs, y_valid),
    epochs=50,
    callbacks=[early_stopping],
)

In [None]:
## test data

# --- public test split ---
test_csv = "/kaggle/input/csc-480-project-1-2024fall/test_df.csv"
test_df, test_images = load_dataset(test_csv, hmi_folder_path)
test_images = test_images / 255.  # normalize

# timestamps: shift to start at 0, then divide by the new maximum
test_timestamps = test_df["timestamp"].values
test_timestamps = test_timestamps - test_timestamps.min()
test_timestamps = test_timestamps / test_timestamps.max()

# --- private test split ---
private_csv = "/kaggle/input/csc-480-project-1-2024fall/test_df_private.csv"
private_df, private_images = load_dataset(private_csv, hmi_folder_path)
private_images = private_images / 255.  # normalize

# timestamps: same scaling, using this split's own min/max
private_timestamps = private_df["timestamp"].values
private_timestamps = private_timestamps - private_timestamps.min()
private_timestamps = private_timestamps / private_timestamps.max()

# MODEL 2: image-only CNN

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from itertools import product
from tensorflow import keras
from keras.activations import leaky_relu


def load_dataset(csv_path, hmi_folder_path):
    """Read the metadata CSV and load every HMI array listed in it.

    An earlier cell of this notebook defines a loader with the same name;
    this definition replaces it from here on.

    Returns:
        ``(df, hmi_data)`` -- the DataFrame read from ``csv_path`` and a
        float64 array of shape ``(len(df), 224, 224)`` filled row by row from
        the ``.npy`` files named in the "HMI_img" column.
    """
    df = pd.read_csv(csv_path)
    image_dir = Path(hmi_folder_path)
    # Full path of each image file, in CSV row order.
    image_files = df["HMI_img"].apply(lambda name: image_dir / name)
    hmi_data = np.zeros((len(image_files), 224, 224))
    for row, image_file in enumerate(image_files):
        hmi_data[row] = np.load(image_file)
    return df, hmi_data


# Training data for Model 2.
csv_path1 = "/kaggle/input/csc-480-project-1-2024fall/train_df.csv"
hmi_folder_path1 = "/kaggle/input/csc-480-project-1-2024fall/HMI_data/hmi_data/"

# Use this section's own paths and results rather than the Model 1 globals
# (csv_path, hmi_folder_path, df, hmi_data), so the section does not depend on
# variables leaked from earlier cells.
df1, hmi_data1 = load_dataset(csv_path1, hmi_folder_path1)

## build dataset
# Model 2 takes the image arrays as loaded (no division by 255).
x_train1 = hmi_data1
y_train1 = df1[["tsi_pert_0"]].values

In [None]:
# Public test split for Model 2; the image arrays are kept as loaded.
test_df1, hmi_data_test1 = load_dataset(
    csv_path="/kaggle/input/csc-480-project-1-2024fall/test_df.csv",
    hmi_folder_path=hmi_folder_path1,
)
y_test1 = test_df1[["tsi_pert_0"]].values
x_test1 = hmi_data_test1

In [None]:
from keras.layers import LeakyReLU

# Model 2: image-only CNN regressor built as a Sequential stack.
model1 = keras.models.Sequential()

# Declare the input explicitly instead of passing `input_shape` to the first
# layer; recent Keras versions warn about the `input_shape` form.
model1.add(keras.Input(shape=(224, 224, 1)))  # Grayscale, channels_last

# Four conv -> max-pool stages; every pooling layer downsamples by 2.
for n_filters in (32, 32, 64, 128):
    model1.add(keras.layers.Conv2D(filters=n_filters, kernel_size=(4, 4), activation='relu'))
    model1.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
model1.add(keras.layers.Flatten())  # Flatten into a 1D vector

# Fully connected layers, each followed by a LeakyReLU activation
for n_units in (256, 128, 64, 32):
    model1.add(keras.layers.Dense(n_units))
    model1.add(LeakyReLU(negative_slope=0.05))

# Output layer: a single linear unit for the regression target
model1.add(keras.layers.Dense(1, activation='linear'))

model1.compile(
    optimizer='adam',
    loss="mean_absolute_error",
    metrics=["mean_absolute_error"])

# Summary of *this* model (the original summarized Model 1's `model`).
model1.summary()

# Train on Model 2's own arrays. The original passed x_train / y_train, i.e.
# the images divided by 255 for Model 1, while every model1.predict call in
# the meta-model section feeds images that were not divided by 255.
# history1 now holds the fit history instead of the None returned by summary().
history1 = model1.fit(
    x_train1, y_train1,
    validation_split=0.1,
    epochs=4,
    batch_size=32
)

# META MODEL: Lasso regression stacked on the Model 1 and Model 2 predictions

In [None]:
# test data

def _rescale_timestamps(frame):
    """Return frame["timestamp"] shifted to start at 0 and divided by its new maximum."""
    raw = frame["timestamp"].values
    shifted = raw - np.min(raw)
    return shifted / np.max(shifted)


test_csv = "/kaggle/input/csc-480-project-1-2024fall/test_df.csv"
private_csv = "/kaggle/input/csc-480-project-1-2024fall/test_df_private.csv"

test_df, test_images = load_dataset(test_csv, hmi_folder_path)
private_df, private_images = load_dataset(private_csv, hmi_folder_path)

# normalize (only for first model); the unscaled arrays are kept for the second model
test_images_normal, private_images_normal = (
    images / 255. for images in (test_images, private_images)
)

# normalized timestamps, each split scaled with its own min/max
test_timestamps = _rescale_timestamps(test_df)
private_timestamps = _rescale_timestamps(private_df)


In [None]:
## predictions

# first model: takes the /255-scaled images together with the scaled timestamps
test_predictions_ts = model.predict([test_images_normal, test_timestamps]).flatten()
private_predictions_ts = model.predict([private_images_normal, private_timestamps]).flatten()
# NOTE(review): bulk1 / bulk2 (training-set predictions) are not read by any
# later cell visible in this notebook -- confirm whether they are still needed.
bulk1 = model.predict([x_train, timestamps]).flatten()

# second model: takes the images as loaded
test_predictions_model1 = model1.predict(test_images).flatten()
private_predictions_model1 = model1.predict(private_images).flatten()
bulk2 = model1.predict(x_train1).flatten()

# stack: one row per sample, one column per base model
x_train_ensemble = np.vstack([test_predictions_ts, test_predictions_model1]).T
x_test_ensemble = np.vstack([private_predictions_ts, private_predictions_model1]).T
# The meta model is fitted on the public test split, whose targets are known.
y_train_ensemble = test_df[["tsi_pert_0"]].values

# train
from sklearn.linear_model import LinearRegression

from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=0.1)  # Alpha controls the regularization strength
lasso_model.fit(x_train_ensemble, y_train_ensemble)

# The prediction lines below use the name `meta_model`, which was never
# defined (NameError); bind it to the fitted Lasso estimator.
meta_model = lasso_model

## meta model predictions
test_predictions_ensemble = meta_model.predict(x_train_ensemble)
private_predictions_ensemble = meta_model.predict(x_test_ensemble)


In [None]:
# adding predictions to dfs
# Note: this writes into the "tsi_pert_0" column in place, replacing the
# values of test_df that were read earlier as the meta-model target.
test_df['tsi_pert_0'] = test_predictions_ensemble
private_df['tsi_pert_0'] = private_predictions_ensemble

# combining: public rows first, then private rows, keeping only the two submission columns
submission_columns = ['index', 'tsi_pert_0']
submission_parts = [frame[submission_columns] for frame in (test_df, private_df)]
all_predictions = pd.concat(submission_parts)

# saving (without the DataFrame's own index)
submission_file = "submission1.csv"
all_predictions.to_csv(submission_file, index=False)