In [16]:
import json
import random
import itertools
import sys
import pandas as pd
import tensorflow as tf
import numpy as np
import keras_tuner as kt
import category_encoders

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras import layers
from datetime import datetime

# Lift pandas' display limits so DataFrames are shown with all rows and all columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [23]:
# check that correct virtual env is being used
#print(sys.executable)
# Print the interpreter version string so the output records which Python ran this notebook.
print(sys.version)

# list all installed modules
#%pip list

3.12.2 | packaged by Anaconda, Inc. | (main, Feb 27 2024, 17:28:07) [MSC v.1916 64 bit (AMD64)]


In [18]:
def number_of_unique_values_per_column(dataframe: pd.DataFrame) -> None:
    """Print a list of ``(column, number of distinct values)`` pairs.

    The pairs appear in the frame's column order and the count for each
    column comes from ``Series.nunique()``. Nothing is returned.
    """
    unique_counts = []
    for column in dataframe.columns:
        unique_counts.append((column, dataframe[column].nunique()))
    print(unique_counts)

# remove all columns that don't contain any important information, since all entries have the same entry
def remove_columns_with_a_single_value(original_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the frame without columns that hold one single value.

    A column is kept when ``Series.unique()`` reports more than one distinct
    entry (missing values count as an entry of their own).

    Args:
        original_dataframe: Frame to filter; it is not modified.

    Returns:
        A new DataFrame containing only the columns with more than one
        distinct value, in their original order.
    """
    # Inspect the frame that was passed in rather than a module-level global,
    # so the function works for any DataFrame.
    informative_columns = [
        column
        for column in original_dataframe.columns
        if len(original_dataframe[column].unique()) > 1
    ]
    # Copy so that later in-place column assignments do not write into a
    # slice of the caller's frame.
    return original_dataframe[informative_columns].copy()

def move_column_to_the_front(original_dataframe: pd.DataFrame, column_to_move: str) -> pd.DataFrame:
    """Make ``column_to_move`` the first column of the frame.

    The column is popped and re-inserted at position 0, so the frame that was
    passed in is modified in place; that same object is returned.
    """
    moved_values = original_dataframe.pop(column_to_move)
    original_dataframe.insert(loc=0, column=column_to_move, value=moved_values)
    return original_dataframe

def encode_time(original_dataframe: pd.DataFrame, columns_to_encode: list[str], period: float = 24.0) -> pd.DataFrame:
    """Replace cyclic time columns with ``sin(2 * pi * value / period)``.

    Each listed column is cast to ``float64`` and overwritten in place with
    its sine encoding.

    Args:
        original_dataframe: Frame holding the columns; modified in place.
        columns_to_encode: Names of the columns to encode.
        period: Length of one full cycle in the column's unit. Defaults to
            24.0, the value that was previously hard-coded.
            NOTE(review): 24 suits an hour-of-day column; a day-of-week
            column would normally use ``period=7`` — confirm at the call site.

    Returns:
        The same DataFrame object with the listed columns encoded.

    Raises:
        ValueError: If ``period`` is zero.
    """
    if period == 0:
        raise ValueError("period must be non-zero")

    for column in columns_to_encode:
        numeric_values = original_dataframe[column].astype('float64')
        original_dataframe[column] = np.sin(2 * np.pi * numeric_values / period)

    return original_dataframe


def encode_utc(original_dataframe: pd.DataFrame, columns_to_encode: list[str]) -> pd.DataFrame:
    """Replace ISO-like timestamp strings with epoch seconds (float).

    Accepted formats are ``%Y-%m-%dT%H:%M:%S.%fZ`` and the same pattern
    without the trailing ``Z``. The format is chosen per value from the
    presence of the ``Z`` suffix, not from the column's position in
    ``columns_to_encode``.

    Every parsed value is interpreted as UTC before conversion, so the result
    does not depend on the time zone of the machine running the notebook. For
    values without a ``Z`` this yields the wall-clock time expressed as
    seconds since 1970-01-01T00:00:00.

    Args:
        original_dataframe: Frame holding the columns; modified in place.
            Any index is supported.
        columns_to_encode: Names of the string timestamp columns.

    Returns:
        The same DataFrame object with the listed columns converted.

    Raises:
        ValueError: If a value matches neither accepted format.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    def _to_epoch_seconds(raw_value: str) -> float:
        """Parse one timestamp string and return its epoch seconds."""
        if raw_value.endswith("Z"):
            pattern = "%Y-%m-%dT%H:%M:%S.%fZ"
        else:
            pattern = "%Y-%m-%dT%H:%M:%S.%f"
        parsed = datetime.strptime(raw_value, pattern)
        # A naive datetime's .timestamp() would use the local system zone.
        return parsed.replace(tzinfo=timezone.utc).timestamp()

    for column in columns_to_encode:
        # map() keeps the frame's own index, so no 0..n-1 index is assumed.
        original_dataframe[column] = original_dataframe[column].map(_to_epoch_seconds)

    return original_dataframe


def binary_encode_ids(original_dataframe: pd.DataFrame, columns_to_encode: list[str]) -> pd.DataFrame:
    """Binary-encode the given columns with ``category_encoders.BinaryEncoder``.

    The encoder is fitted on ``original_dataframe`` and applied to it in one
    step; the transformed frame is returned (``return_df=True``).
    """
    encoder = category_encoders.BinaryEncoder(cols=columns_to_encode, return_df=True)
    return encoder.fit_transform(original_dataframe)

def encode_boolean(original_dataframe: pd.DataFrame, columns_to_encode: list[str]) -> pd.DataFrame:
    """Cast each listed column to ``int`` in place and return the same frame."""
    for column in columns_to_encode:
        converted = original_dataframe[column].astype('int')
        original_dataframe[column] = converted
    return original_dataframe

def encode_classification(original_dataframe: pd.DataFrame, column_to_encode: str) -> pd.DataFrame:
    """Replace a label column with integer codes from ``LabelEncoder``.

    Args:
        original_dataframe: Frame holding the column; modified in place.
        column_to_encode: Name of the column to encode.

    Returns:
        The same DataFrame object with the column replaced by the encoder's
        integer codes.
    """
    label_encoder = LabelEncoder()
    # Fit on the frame that was passed in rather than a module-level global,
    # so codes and rows always come from the same data.
    original_dataframe[column_to_encode] = label_encoder.fit_transform(original_dataframe[column_to_encode])
    return original_dataframe

# read json data
# Forward slashes are accepted by open() on Windows as well as on POSIX systems,
# and the explicit encoding avoids decoding the JSON with a platform-specific default.
with open("datasets/transfer/smaller_dataset.json", encoding="utf-8") as file:
    parsed_json = json.load(file)

# build dataframe
# Iterating a DataFrame yields its column labels; each label is then used as a
# key into parsed_json['traces'] to fetch the records of one session.
# NOTE(review): this presumes 'traces' maps a key to a list of event records — confirm.
session_frames = []
for i in pd.json_normalize(parsed_json['traces']):
    # Normalize each session once and reuse the result for both the row count and the concat.
    single_session = pd.json_normalize(parsed_json['traces'][i])
    session_frames.append(single_session)

total_amount_of_rows = sum(len(session_frame) for session_frame in session_frames)

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
all_sessions = pd.concat(session_frames, ignore_index=True) if session_frames else pd.DataFrame()

# Run the preprocessing steps in order; every step receives the frame returned by the previous one.
all_sessions = (
    all_sessions
    # drop columns whose entries are all identical
    .pipe(remove_columns_with_a_single_value)
    # encode 24h time format
    .pipe(encode_time, ['time_dow', 'time_hod'])
    # encode UTC Timestamp
    .pipe(encode_utc, ['time_utc', 'time_local'])
    # encode boolean values to int values
    .pipe(encode_boolean, ['device_online'])
    # binary encode session_id and device_id
    .pipe(binary_encode_ids, ['session_id', 'device_id'])
    # encode the classification column
    .pipe(encode_classification, 'content_id')
    # move classification column to the front
    .pipe(move_column_to_the_front, 'content_id')
)

# group all entries by session_id
# The number of session_id_<n> columns produced by the binary encoding depends on how many
# distinct sessions the data holds, so the group keys are discovered instead of listing a fixed seven.
session_id_columns = [column for column in all_sessions.columns if str(column).startswith('session_id_')]
if not session_id_columns:
    raise ValueError("no binary-encoded 'session_id_*' columns found in all_sessions")
grouped = all_sessions.groupby(session_id_columns)

# number of columns
number_of_columns = all_sessions.shape[1]

# Each sample is ([row two steps back, previous row, current row], label of the NEXT row).
# Rows missing at the start of a session are padded with 'UNKNOWN' vectors, and the last
# row of a session gets the label 'UNKNOWN' because no next event exists.
feature_vectors = []
for name, group in grouped:
    session_rows = list(group.itertuples(index=False))

    prev_prev_feature_vector = ['UNKNOWN'] * number_of_columns
    prev_feature_vector = ['UNKNOWN'] * number_of_columns

    for position, row in enumerate(session_rows):
        # build current feature_vector
        feature_vector = [str(element) for element in row]

        # the classifier is the first column (index 0) of the next event in the same session
        if position + 1 < len(session_rows):
            classification = session_rows[position + 1][0]
        else:
            classification = 'UNKNOWN'

        # build feature_vector with prev_prev and prev_feature_vector
        vector_to_add = [prev_prev_feature_vector, prev_feature_vector, feature_vector]

        # append feature_vector to all feature_vectors
        feature_vectors.append((vector_to_add, classification))

        # slide the window one step for the next row
        prev_prev_feature_vector = prev_feature_vector
        prev_feature_vector = feature_vector

# shuffle the dataset
# A dedicated, seeded generator makes the shuffle reproducible across kernel restarts
# without touching the global random state.
random.Random(42).shuffle(feature_vectors)

# Wrap every sample in a one-element list so np.concatenate can stack the samples along axis 0.
data = [([input_features], label) for input_features, label in feature_vectors]

input_features = [features for features, _ in data]
labels = [label for _, label in data]
concatened_input_features = np.concatenate(input_features)

# Split the stacked array, which has one (3, number_of_columns) window per sample and therefore
# matches the per-sample input shape declared for the model; the wrapped nested lists do not.
X_train, X_test, y_train, y_test = train_test_split(concatened_input_features, labels, test_size=0.2, random_state=42)


''' 
#print dataframe row by row
for i in range(len(all_sessions)):
    print(all_sessions.loc[i])
    break
'''

' \n#print dataframe row by row\nfor i in range(len(all_sessions)):\n    print(all_sessions.loc[i])\n    break\n'

In [None]:
def model_builder(hp):
    """Build and compile a feed-forward classifier for Keras Tuner.

    Tuned hyperparameters:
        units: width shared by all hidden layers (16 to 512, step 32).
        layers: number of hidden ReLU layers (2 to 7).
        learning_rate: Adam learning rate, one of 1e-2, 1e-3, 1e-4.

    The input is flattened from shape (3, 18). The final Dense layer has
    ``highest_value`` units and no activation, matching the loss configured
    with ``from_logits=True``. ``highest_value`` is not defined inside this
    function; it must exist at module level before the tuner calls the builder.

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    units_per_layer = hp.Int('units', min_value=16, max_value=512, step=32)
    hidden_layer_count = hp.Int('layers', min_value=2, max_value=7, step=1)
    adam_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    network_layers = [tf.keras.layers.Flatten(input_shape=(3, 18))]
    network_layers += [
        tf.keras.layers.Dense(units=units_per_layer, activation='relu')
        for _ in range(hidden_layer_count)
    ]
    network_layers.append(tf.keras.layers.Dense(highest_value))

    model = tf.keras.Sequential(network_layers)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=adam_learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    return model

# Training settings shared by the search, the epoch-selection run and the final fit.
MAX_EPOCHS = 50
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2

# Hyperband search over model_builder's hyperparameters, optimizing validation accuracy.
# A forward slash in project_name resolves to the same folder on Windows and also works on POSIX systems.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=MAX_EPOCHS,
                     directory='with_Unknown',
                     project_name='trained_models/first_FNN')

tuner.search(X_train, y_train, epochs=MAX_EPOCHS, validation_split=VALIDATION_SPLIT)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Train once with the best hyperparameters to find the epoch with the highest validation accuracy.
model = tuner.hypermodel.build(best_hps)

history = model.fit(X_train, y_train, epochs=MAX_EPOCHS, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1  # epochs are counted from 1
print('Best epoch: %d' % (best_epoch,))

# Rebuild from scratch and train for exactly best_epoch epochs, then score on the held-out test split.
hypermodel = tuner.hypermodel.build(best_hps)

hypermodel.fit(X_train, y_train, epochs=best_epoch, batch_size=BATCH_SIZE, validation_split=VALIDATION_SPLIT)

eval_result = hypermodel.evaluate(X_test, y_test)
print("[test loss, test accuracy]:", eval_result)