In [1]:
import json
import random
import itertools
import sys
import pandas as pd
import tensorflow as tf
import numpy as np
import keras_tuner as kt
import category_encoders
import hashlib
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from datetime import datetime
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.feature_extraction import FeatureHasher
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('future.no_silent_downcasting', True) 

In [None]:
# check that correct virtual env is being used
#print(sys.executable)
print(sys.version)

# list all installed modules
#%pip list

In [2]:
# remove all columns that don't contain any important information, since all entries have the same entry
def remove_columns_with_a_single_value(original_dataframe: pd.DataFrame) -> pd.DataFrame:
    return original_dataframe[[column for column in original_dataframe.columns if len(original_dataframe[column].unique()) > 1]]

def encode_time(original_dataframe: pd.DataFrame, columns_to_encode: list[str], period: int) -> pd.DataFrame:
    for column in columns_to_encode:
        original_dataframe[column] = original_dataframe[column].astype('float64')
        original_dataframe[column + '_sin'] = np.sin(2 * np.pi * original_dataframe[column] / period)
        original_dataframe[column + '_cos'] = np.cos(2 * np.pi * original_dataframe[column] / period)
    original_dataframe = original_dataframe.drop(columns=columns_to_encode)
    return original_dataframe

def encode_time_utc(original_dataframe: pd.DataFrame, column_to_encode: str, pattern: str) -> pd.DataFrame:
    timestamps = []
    for i in range(len(original_dataframe)):
        time_stamp = original_dataframe.loc[i, column_to_encode]
        if time_stamp.__contains__("."):
            pattern = '%Y-%m-%dT%H:%M:%S.%fZ'
        else: 
            pattern = '%Y-%m-%dT%H:%M:%SZ'
        time_stamp = datetime.strptime(original_dataframe.loc[i, column_to_encode], pattern)
        timestamps.append(time_stamp.timestamp())

    original_dataframe[column_to_encode] = timestamps
    return original_dataframe

def encode_weather_time_utc(original_dataframe: pd.DataFrame, column_to_encode: str, pattern: str) -> pd.DataFrame:
    timestamps = []
    for i in range(len(original_dataframe)):
        time_stamp = original_dataframe.loc[i, column_to_encode]
        time_stamp = datetime.strptime(original_dataframe.loc[i, column_to_encode], pattern)
        timestamps.append(time_stamp.timestamp())

    original_dataframe[column_to_encode] = timestamps
    return original_dataframe

def encode_time_local(original_dataframe: pd.DataFrame, column_to_encode: str, pattern: str) -> pd.DataFrame:
    timestamps = []
    for i in range(len(original_dataframe)):
        time_stamp = original_dataframe.loc[i, column_to_encode]
        if time_stamp.__contains__("."):
            pattern = '%Y-%m-%dT%H:%M:%S.%f'
        else: 
            pattern = '%Y-%m-%dT%H:%M:%S'
        time_stamp = datetime.strptime(original_dataframe.loc[i, column_to_encode], pattern)
        timestamps.append(time_stamp.timestamp())

    original_dataframe[column_to_encode] = timestamps
    return original_dataframe


def number_of_unique_values_per_column(dataframe: pd.DataFrame) -> list[(str, int)]:
    return [(column, dataframe[column].nunique()) for column in dataframe.columns]

def onehot_encode_column(original_dataframe: pd.DataFrame, column_to_encode: str) -> pd.DataFrame:
    encoder = OneHotEncoder()
    encoded_column = encoder.fit_transform(original_dataframe[[column_to_encode]])
    encoded_column_dataframe = pd.DataFrame(encoded_column.toarray(), columns=encoder.get_feature_names_out([column_to_encode]))
    original_dataframe = pd.concat([original_dataframe, encoded_column_dataframe], axis=1)
    if column_to_encode != "session_id":
        original_dataframe = original_dataframe.drop(columns=[column_to_encode])
    return original_dataframe

def encode_classification(original_dataframe: pd.DataFrame, column_to_encode: str) -> pd.DataFrame:
    label_encoder = LabelEncoder()
    original_dataframe[column_to_encode] = label_encoder.fit_transform(original_dataframe[column_to_encode])
    return original_dataframe

def encode_boolean(original_dataframe: pd.DataFrame, columns_to_encode: list[str]) -> pd.DataFrame:
    for index in range(0, len(columns_to_encode)):
        original_dataframe[columns_to_encode[index]] = original_dataframe[columns_to_encode[index]].astype('int')
    return original_dataframe

def category_encode_column(original_dataframe: pd.DataFrame, column_to_encode: str) -> pd.DataFrame:
    binary_encoder = category_encoders.BinaryEncoder(cols=column_to_encode)
    original_dataframe = binary_encoder.fit_transform(original_dataframe)
    return original_dataframe

def encode_str_to_enum(original_dataframe: pd.DataFrame, column_to_encode: str) -> pd.DataFrame:
    encoding_dict = dict()
    encoded_value = 0
    for index in range(len(original_dataframe)):
        value = original_dataframe.iloc[index][column_to_encode]
        if value in encoding_dict:
            original_dataframe.at[index, column_to_encode] = encoding_dict[value]
        else: 
            encoding_dict[value] = encoded_value
            original_dataframe.at[index, column_to_encode] = encoded_value
            encoded_value += 1
    return original_dataframe

def move_column_to_the_front(original_dataframe: pd.DataFrame, column_to_move: str) -> pd.DataFrame:
    first_column = original_dataframe.pop(column_to_move)
    original_dataframe.insert(0, column_to_move, first_column)
    return original_dataframe

def get_weather_data(original_dataframe: pd.DataFrame, column_to_encode: str, json_subarray_name: str) -> pd.DataFrame:
    weather_data_list = []
    
    for _, row in original_dataframe.iterrows():
        weather_day_id = row[column_to_encode]
        weather_day_data = parsed_json[json_subarray_name][str(weather_day_id)]
        weather_day_data = {f"{column_to_encode} {k}": v for k, v in weather_day_data.items() if k != 'id'}
        weather_data_list.append(weather_day_data)
    weather_data_df = pd.DataFrame(weather_data_list)
    weather_data_df = remove_columns_with_a_single_value(weather_data_df)
    
    if column_to_encode == "weather_day_id":
        weather_data_df = encode_weather_time_utc(weather_data_df, f'{column_to_encode} sun_set', "%Y-%m-%dT%H:%M:%S%z")
        weather_data_df = encode_weather_time_utc(weather_data_df, f'{column_to_encode} sun_rise', "%Y-%m-%dT%H:%M:%S%z")
        weather_data_df = encode_weather_time_utc(weather_data_df, f'{column_to_encode} created_at', "%Y-%m-%dT%H:%M:%S.%f%z")
        weather_data_df = encode_weather_time_utc(weather_data_df, f'{column_to_encode} calculated_at', "%Y-%m-%dT%H:%M:%S%z")

    elif column_to_encode == "weather_hour_id":
        weather_data_df = encode_weather_time_utc(weather_data_df, f'{column_to_encode} created_at', "%Y-%m-%dT%H:%M:%S.%f%z")
        weather_data_df = encode_weather_time_utc(weather_data_df, f'{column_to_encode} calculated_at', "%Y-%m-%dT%H:%M:%S%z")
        weather_data_df = encode_weather_time_utc(weather_data_df, f'{column_to_encode} forecast_time', "%Y-%m-%dT%H:%M:%S%z")

    original_dataframe = pd.concat([original_dataframe, weather_data_df], axis=1)
    return original_dataframe

def get_content_map(original_dataframe: pd.DataFrame, column_to_encode: str, json_subarray_name: str) -> pd.DataFrame:
    content_data_list = []
    
    for _, row in original_dataframe.iterrows():
        content_id = row[column_to_encode]
        content_id_data = parsed_json[json_subarray_name][str(content_id)]
        content_id_data = {f"{column_to_encode} {k}": v for k, v in content_id_data.items()}
        content_data_list.append(content_id_data)
    content_data_df = pd.DataFrame(content_data_list)
    original_dataframe = pd.concat([original_dataframe, content_data_df], axis=1)
    return original_dataframe

def encode_date(original_dataframe: pd.DataFrame, column_to_encode: str) -> pd.DataFrame:
    for _, row in original_dataframe.iterrows():
        date_object = datetime.strptime(row[column_to_encode], "%Y-%m-%d")
        original_dataframe[column_to_encode] = date_object.toordinal()
    return original_dataframe

def encode_weather_enums(original_dataframe: pd.DataFrame, json_subarray_name: str, json_key: str) -> pd.DataFrame:
    weather_day_data = parsed_json[json_subarray_name][json_key]
    for index in range(0, len(original_dataframe)):
        for key, value in weather_day_data.items():
            if original_dataframe.loc[index, f'weather_hour_id {json_key}'] == value:
                original_dataframe.loc[index, f'weather_hour_id {json_key}'] = key     
    return original_dataframe



In [3]:
# read json data
with open("datasets\\transfer\\larger_dataset.json") as file:
    parsed_json = json.load(file)

# build dataframe 
total_amount_of_rows = 0
all_sessions = pd.DataFrame()
for i in pd.json_normalize(parsed_json['traces']):
    total_amount_of_rows += len(pd.json_normalize(parsed_json['traces'][i]))
    single_session = pd.json_normalize(parsed_json['traces'][i])
    single_session_filtered = single_session.dropna(how='all', axis=1)
    #dropped_columns = set(single_session.columns) - set(single_session_filtered.columns)
    #print("Dropped columns:", dropped_columns)
    all_sessions = pd.concat([all_sessions, single_session_filtered], ignore_index=True)

all_sessions = all_sessions.drop(columns=['weather_future_day_id', 'weather_future_hour_id'])
all_sessions = all_sessions.drop(columns=['event_data.for_date', 'event_data.for_date_days_in_future'])

all_sessions = all_sessions.infer_objects(copy=False)
null_columns = all_sessions.columns[all_sessions.isnull().any()]
print(null_columns)

null_columns = all_sessions.columns[all_sessions.isnull().any()]
print(null_columns)
for column in null_columns:
    all_sessions[column] = all_sessions[column].fillna(all_sessions[column].mode()[0])

null_columns = all_sessions.columns[all_sessions.isnull().any()]

all_sessions = encode_time(all_sessions, ['time_hod'], 24)
all_sessions = encode_time(all_sessions, ['time_dow'], 7)
all_sessions = encode_time_local(all_sessions, 'time_local', '%Y-%m-%dT%H:%M:%S.%f')
all_sessions = encode_time_utc(all_sessions, 'time_utc', '%Y-%m-%dT%H:%M:%SZ')
all_sessions = encode_boolean(all_sessions, ['device_online'])
all_sessions = encode_str_to_enum(all_sessions, 'content_portal')

# get more in-depth weather data
all_sessions = get_weather_data(all_sessions, 'weather_day_id', 'weather_day_map')
all_sessions = get_weather_data(all_sessions, 'weather_hour_id', 'weather_hour_map')

# too many null values
all_sessions = all_sessions.drop(columns=['weather_day_id moon_set', 'weather_day_id moon_rise'])
all_sessions = all_sessions.drop(columns=['weather_day_id', 'weather_hour_id'])

# encode non int values from the weather data
all_sessions = encode_date(all_sessions, 'weather_day_id forecast_date')
all_sessions = encode_weather_enums(all_sessions, 'weather_enums', 'wind_direction')
all_sessions = encode_weather_enums(all_sessions, 'weather_enums', 'thunderstorm_prob')

ohe_features = ['content_portal', 'device_online', 'device_class', 'device_orientation', 'oha_language_iso2', 'oha_layout', 'weather_hour_id thunderstorm_prob', 'weather_day_id thunderstorm_prob']
numerical_features = ['time_utc', 'time_local', 'device_height_px', 'device_width_px']
numerical_features2 = ['weather_day_id sun_set', 'weather_day_id sun_rise', 'weather_day_id created_at', 'weather_day_id sunshine_h', 'weather_day_id temp_max_c', 'weather_day_id temp_min_c', 'weather_day_id calculated_at', 'weather_day_id forecast_date', 'weather_day_id prec_prob_pct', 'weather_day_id prec_rain_mm_h', 'weather_day_id prec_snow_mm_h', 'weather_day_id wind_speed_kmh', 'weather_day_id prec_total_mm_h', 'weather_day_id temp_felt_max_c', 'weather_day_id temp_felt_min_c', 'weather_day_id humidity_mean_pct', 'weather_day_id wind_speed_max_kmh', 'weather_day_id cloud_cover_max_pct', 'weather_day_id cloud_cover_min_pct', 'weather_day_id cloud_cover_mean_pct', 'weather_hour_id temp_c', 'weather_hour_id created_at', 'weather_hour_id sunshine_h', 'weather_hour_id temp_felt_c', 'weather_hour_id humidity_pct', 'weather_hour_id calculated_at', 'weather_hour_id forecast_time', 'weather_hour_id prec_rain_mm_h', 'weather_hour_id prec_snow_mm_h', 'weather_hour_id wind_speed_kmh', 'weather_hour_id cloud_cover_pct', 'weather_hour_id prec_total_mm_h', 'weather_hour_id forecast_distance_h']

binary_features = ['device_country_iso2', 'device_language_iso2', 'event_type', 'device_platform', 'weather_hour_id wind_direction', 'weather_day_id wind_direction', 'weather_day_id moon_phase']
embedded_features = ['content_id', 'device_id', 'session_id']

for column in embedded_features:
    all_sessions = encode_classification(all_sessions, column)
    all_sessions = move_column_to_the_front(all_sessions, column)

scaler = StandardScaler()
for column in numerical_features:
    all_sessions[column] = scaler.fit_transform(np.array(all_sessions[column]).reshape(-1, 1))
for column in numerical_features2:
    all_sessions[column] = scaler.fit_transform(np.array(all_sessions[column]).reshape(-1, 1))

one_hot_encoder = OneHotEncoder()
for column in ohe_features:
    all_sessions = onehot_encode_column(all_sessions, column)

for column in binary_features:
    #all_sessions = onehot_encode_column(all_sessions, column)
    all_sessions = category_encode_column(all_sessions, column)

print(all_sessions.isnull().sum().sum())

all_sessions[f'prev_content_id'] = all_sessions['content_id'].shift(1)
all_sessions[f'prev_prev_content_id'] = all_sessions['content_id'].shift(2)

all_sessions = move_column_to_the_front(all_sessions, 'prev_prev_content_id')
all_sessions = move_column_to_the_front(all_sessions, 'prev_content_id')
all_sessions.fillna({'prev_content_id': 104}, inplace=True)
all_sessions.fillna({'prev_prev_content_id': 104}, inplace=True)

'''
print(all_sessions.iloc[0])
all_sessions.fillna({'prev_prev_event': 39}, inplace=True)
all_sessions = move_column_to_the_front(all_sessions, 'prev_prev_event')
'''

unique_classifications = all_sessions['content_id'].nunique()

# build features vectors
feature_vectors = []
for i in range(0, len(all_sessions)):
    if i < len(all_sessions) - 1:
        if all_sessions.iloc[i]['session_id'] == all_sessions.iloc[i+1]['session_id']:
            feature_vector = (all_sessions.iloc[i], all_sessions.iloc[i+1]['content_id']) 
            feature_vectors.append(feature_vector)

# shuffle dataset
#random.shuffle(feature_vectors)

# build train / test sets
input_features = []
classification_labels = []

for input_feature, classification in feature_vectors:
    input_features.append(input_feature)
    classification_labels.append(classification)

input_features_array = np.array(input_features)
classification_labels_array = np.array(classification_labels)

print(len(input_features_array))

ros = RandomOverSampler(random_state=42)
input_resampled, classification_resampled = ros.fit_resample(input_features_array, classification_labels_array)

X_train, X_test, y_train, y_test = train_test_split(input_resampled, classification_resampled, test_size=0.2, random_state=42)


print(X_train.shape[0] + X_test.shape[0])
print(number_of_unique_values_per_column(all_sessions))
print(unique_classifications)

Index(['device_country_iso2'], dtype='object')
Index(['device_country_iso2'], dtype='object')
0
337
2790
[('prev_content_id', 105), ('prev_prev_content_id', 105), ('session_id', 76), ('device_id', 41), ('content_id', 104), ('time_utc', 404), ('event_type_0', 2), ('event_type_1', 2), ('event_type_2', 2), ('time_local', 404), ('device_platform_0', 2), ('device_platform_1', 2), ('device_platform_2', 2), ('device_width_px', 18), ('device_height_px', 26), ('device_country_iso2_0', 2), ('device_country_iso2_1', 2), ('device_country_iso2_2', 2), ('device_language_iso2_0', 2), ('device_language_iso2_1', 2), ('device_language_iso2_2', 2), ('time_hod_sin', 19), ('time_hod_cos', 18), ('time_dow_sin', 7), ('time_dow_cos', 7), ('weather_day_id sun_set', 35), ('weather_day_id sun_rise', 35), ('weather_day_id created_at', 35), ('weather_day_id moon_phase_0', 2), ('weather_day_id moon_phase_1', 2), ('weather_day_id moon_phase_2', 2), ('weather_day_id moon_phase_3', 2), ('weather_day_id sunshine_h', 26

In [5]:
print(X_train.shape)

X_train_prev_event, X_train = X_train[:, :1], X_train[:, 1:]
X_train_prev_prev_event, X_train = X_train[:, :1], X_train[:, 1:]
X_train_session_id, X_train = X_train[:, :1], X_train[:, 1:]
X_train_device_id, X_train = X_train[:, :1], X_train[:, 1:]
X_train_content_id, X_train = X_train[:, :1], X_train[:, 1:]

X_train_prev_event_input_dim = len(set(X_train_prev_event.flatten()))
X_train_prev_prev_event_input_dim = len(set(X_train_prev_event.flatten()))

print(X_train.shape)

X_test_prev_event, X_test = X_test[:, :1], X_test[:, 1:]
X_test_prev_prev_event, X_test = X_test[:, :1], X_test[:, 1:]
X_test_session_id, X_test = X_test[:, :1], X_test[:, 1:]
X_test_device_id, X_test = X_test[:, :1], X_test[:, 1:]
X_test_content_id, X_test = X_test[:, :1], X_test[:, 1:]

(2232, 84)
(2232, 79)


In [6]:
def model_builder(hp):
    hp_output1 = hp.Int('output_dim_prev_event', min_value=24, max_value=44, step=2)
    hp_output2 = hp.Int('output_dim_prev_prev_event', min_value=24, max_value=44, step=2)
    hp_output3 = hp.Int('output_dim_session_id', min_value=24, max_value=44, step=2)
    hp_output4 = hp.Int('output_dim_device_id', min_value=24, max_value=44, step=2)
    hp_output5 = hp.Int('output_dim_output', min_value=24, max_value=44, step=2)

    embedding_input_prev_event = tf.keras.layers.Input(shape=(1,), dtype='int32') 
    embedding_layer_prev_event = tf.keras.layers.Embedding(input_dim=X_train_prev_event_input_dim, output_dim=hp_output1)(embedding_input_prev_event)
    flattened_prev_event = tf.keras.layers.Flatten()(embedding_layer_prev_event)

    embedding_input_prev_prev_event = tf.keras.layers.Input(shape=(1,), dtype='int32') 
    embedding_layer_prev_prev_event = tf.keras.layers.Embedding(input_dim=X_train_prev_prev_event_input_dim, output_dim=hp_output2)(embedding_input_prev_prev_event)
    flattened_prev_prev_event = tf.keras.layers.Flatten()(embedding_layer_prev_prev_event)
    
    embedding_input_session_id = tf.keras.layers.Input(shape=(1,), dtype='int32') 
    embedding_layer_session_id = tf.keras.layers.Embedding(input_dim=76, output_dim=hp_output3)(embedding_input_session_id)
    flattened_session_id = tf.keras.layers.Flatten()(embedding_layer_session_id)

    embedding_input_device_id = tf.keras.layers.Input(shape=(1,), dtype='int32') 
    embedding_layer_device_id = tf.keras.layers.Embedding(input_dim=41, output_dim=hp_output4)(embedding_input_device_id)
    flattened_device_id = tf.keras.layers.Flatten()(embedding_layer_device_id)

    concatenated_embeddings = tf.keras.layers.Concatenate()([flattened_prev_event, flattened_prev_prev_event, flattened_session_id, flattened_device_id])
    #concatenated_embeddings = tf.keras.layers.Concatenate()([flattened_prev_event, flattened_session_id, flattened_device_id])
    #concatenated_embeddings = tf.keras.layers.Concatenate()([flattened_session_id, flattened_device_id])

    flattened_input = tf.keras.layers.Input(shape=(X_train.shape[1],))
    flattened = tf.keras.layers.Flatten()(flattened_input)
    concatenated = tf.keras.layers.Concatenate()([concatenated_embeddings, flattened])
    
    hp_units = hp.Int('units', min_value=16, max_value=1024, step=32)
    hp_layers = hp.Int('layers', min_value=2, max_value=5, step=1)
    hp_learning_rate = hp.Choice('learning_rate', values=[0.1, 0.01])
    hp_optimizer = hp.Choice('optimizer', values=['adam'])
    hp_regularization = hp.Choice('regularization', values=['l1', 'l2', 'l1_l2'])
    hp_lambda = hp.Float('lambda', min_value=0.5, max_value=1.0, step=0.01)
    
    for i in range(hp_layers):
        if hp_regularization == 'l1':
            regularizer = tf.keras.regularizers.l1(hp_lambda)
        elif hp_regularization == 'l2':
            regularizer = tf.keras.regularizers.l2(hp_lambda)
        else: # l1_l2
            regularizer = tf.keras.regularizers.l1_l2(l1=hp_lambda, l2=hp_lambda)
        
        concatenated = tf.keras.layers.Dense(units=hp_units, activation='relu', kernel_regularizer=regularizer)(concatenated)
        concatenated = tf.keras.layers.BatchNormalization()(concatenated)
        concatenated = tf.keras.layers.Dropout(0.3)(concatenated)    
     
    
    
    output_embedding_input = tf.keras.layers.Input(shape=(1,), dtype='int32')
    output_embedding_layer = tf.keras.layers.Embedding(input_dim=unique_classifications, output_dim=hp_output5)(output_embedding_input)
    flattened_output_embedding = tf.keras.layers.Flatten()(output_embedding_layer)
    
    final_concatenated = tf.keras.layers.Concatenate()([concatenated, flattened_output_embedding])
    
    output = tf.keras.layers.Dense(unique_classifications, activation='softmax')(final_concatenated)

    model = tf.keras.models.Model(inputs=[embedding_input_prev_event, embedding_input_prev_prev_event, embedding_input_session_id, embedding_input_device_id, flattened_input, output_embedding_input], outputs=output)
    #model = tf.keras.models.Model(inputs=[embedding_input_prev_event, embedding_input_session_id, embedding_input_device_id, flattened_input, output_embedding_input], outputs=output)
    #model = tf.keras.models.Model(inputs=[embedding_input_session_id, embedding_input_device_id, flattened_input, output_embedding_input], outputs=output)
        
    optimizer = hp_optimizer
    if optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    elif optimizer == 'rmsprop':
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=hp_learning_rate)
    else:
        optimizer = tf.keras.optimizers.SGD(learning_rate=hp_learning_rate)

    model.compile(optimizer=optimizer,
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

    return model

best_performance = 0
best_model_path = r'trained_models\best_model\best_model.keras'

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

performance_metrics = []

for train_index, val_index in kfold.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    X_train_prev_event_fold, X_val_prev_event_fold = X_train_prev_event[train_index], X_train_prev_event[val_index]
    X_train_prev_prev_event_fold, X_val_prev_prev_event_fold = X_train_prev_prev_event[train_index], X_train_prev_prev_event[val_index]
    X_train_session_fold, X_val_session_fold = X_train_session_id[train_index], X_train_session_id[val_index]
    X_train_device_fold, X_val_device_fold = X_train_device_id[train_index], X_train_device_id[val_index]
    X_train_output_fold, X_val_output_fold = X_train_content_id[train_index], X_train_content_id[val_index]
    
    X_train_fold_inputs = [X_train_prev_event_fold, X_train_prev_prev_event_fold, X_train_session_fold, X_train_device_fold, X_train_fold, X_train_output_fold]
    X_val_fold_inputs = [X_val_prev_event_fold, X_val_prev_prev_event_fold, X_val_session_fold, X_val_device_fold, X_val_fold, X_val_output_fold]

    #X_train_fold_inputs = [X_train_prev_event_fold, X_train_session_fold, X_train_device_fold, X_train_fold, X_train_output_fold]
    #X_val_fold_inputs = [X_val_prev_event_fold, X_val_session_fold, X_val_device_fold, X_val_fold, X_val_output_fold]

    #X_train_fold_inputs = [X_train_session_fold, X_train_device_fold, X_train_fold, X_train_output_fold]
    #X_val_fold_inputs = [X_val_session_fold, X_val_device_fold, X_val_fold, X_val_output_fold]


    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    tuner = kt.BayesianOptimization(
        model_builder,
        objective='val_accuracy', 
        max_trials=25,
        executions_per_trial=3,
        directory='trained_models',
        project_name='DI_9' 
    )

    tuner.search(X_train_fold_inputs, y_train_fold, epochs=50, validation_data=(X_val_fold_inputs, y_val_fold))
    
    best_model = tuner.get_best_models(num_models=1)[0]

    eval_result = best_model.evaluate(X_val_fold_inputs, y_val_fold)
    print("[test loss, test accuracy]:", eval_result)

    performance_metrics.append(eval_result[1])

    if eval_result[1] > best_performance:
        best_performance = eval_result[1]
        best_model.save(best_model_path)

X_train_inputs = [X_train_prev_event, X_train_prev_prev_event, X_train_session_id, X_train_device_id, X_train,  X_train_content_id]
X_test_inputs = [X_test_prev_event, X_test_prev_prev_event, X_test_session_id, X_test_device_id, X_test, X_test_content_id]

#X_train_inputs = [X_train_prev_event, X_train_session_id, X_train_device_id, X_train,  X_train_content_id]
#X_test_inputs = [X_test_prev_event, X_test_session_id, X_test_device_id, X_test, X_test_content_id]

#X_train_inputs = [X_train_session_id, X_train_device_id, X_train,  X_train_content_id]
#X_test_inputs = [X_test_session_id, X_test_device_id, X_test, X_test_content_id]

average_performance = np.mean(performance_metrics)
print(f"Average validation accuracy across all folds: {average_performance}")

best_model = keras.models.load_model(best_model_path)
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

history_test = []
for i in range(5):
    history = best_model.fit(X_train_inputs, y_train, epochs=50, batch_size=32, validation_split=0.2)

    test_loss, test_accuracy = best_model.evaluate(X_test_inputs, y_test)
    history_test.append((test_loss, test_accuracy))
    print(f"Test loss: {test_loss}, Test accuracy: {test_accuracy}")


print(history_test)


Trial 25 Complete [00h 18m 14s]
val_accuracy: 0.5645041267077128

Best val_accuracy So Far: 0.8970917463302612
Total elapsed time: 12h 54m 20s


  trackable.load_own_variables(weights_store.get(inner_path))


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9207 - loss: 4.1239
[test loss, test accuracy]: [4.26212739944458, 0.899328887462616]
Reloading Tuner from trained_models\DI_9\tuner0.json
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9094 - loss: 4.1005
[test loss, test accuracy]: [4.064127445220947, 0.9082773923873901]
Reloading Tuner from trained_models\DI_9\tuner0.json
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9438 - loss: 3.9304
[test loss, test accuracy]: [3.9562718868255615, 0.9237667918205261]
Reloading Tuner from trained_models\DI_9\tuner0.json
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9303 - loss: 3.9706
[test loss, test accuracy]: [3.9952571392059326, 0.9237667918205261]
Reloading Tuner from trained_models\DI_9\tuner0.json
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 

  trackable.load_own_variables(weights_store.get(inner_path))


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 57ms/step - accuracy: 0.8128 - loss: 7.8420 - val_accuracy: 0.8658 - val_loss: 4.9667
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.8643 - loss: 4.5161 - val_accuracy: 0.8568 - val_loss: 4.4842
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.8710 - loss: 4.3052 - val_accuracy: 0.8591 - val_loss: 4.8803
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.8709 - loss: 4.4991 - val_accuracy: 0.8591 - val_loss: 5.3552
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.8646 - loss: 4.9709 - val_accuracy: 0.8389 - val_loss: 5.5139
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.8704 - loss: 5.1409 - val_accuracy: 0.8501 - val_loss: 5.1927
Epoch 7/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━

In [None]:
def plot_history(history):
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

    ax1.plot(history.history['accuracy'], label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_ylabel('Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.legend(loc='lower right')

    ax2.plot(history.history['loss'], label='Training Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_ylabel('Loss')
    ax2.set_xlabel('Epoch')

    plt.show()


print("val_accuracy: ", history.history['val_accuracy'])
print("accuracy: ", history.history['accuracy'])
print("val_loss: ", history.history['val_loss'])
print("loss: ", history.history['loss'])

plot_history(history)



In [None]:
for name, value in best_hps.values.items():
    print(f"{name}: {value}")


tuner.results_summary(num_trials=10)