In [None]:
# Reproducibility setup: the same seed is pushed into every random number
# source used by this notebook, and TensorFlow is limited to one thread.
seed_value= 1

import os
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here may only affect child processes - confirm this has the
# intended effect for the running kernel.
os.environ['PYTHONHASHSEED']=str(seed_value)

# Seed Python's built-in generator.
import random
random.seed(seed_value)

# Seed NumPy's legacy global generator.
import numpy as np
np.random.seed(seed_value)

# Seed TensorFlow's global generator.
import tensorflow as tf
tf.random.set_seed(seed_value)# TF2 spelling; the compat.v1 spelling is kept below for reference:
# tf.compat.v1.set_random_seed(seed_value)

# NOTE(review): `K` is not referenced in the cells visible here.
from keras import backend as K
# One intra-op and one inter-op thread - presumably to avoid run-to-run
# differences caused by parallel execution order (assumption, confirm).
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
# Register the single-threaded session as the Keras backend session.
tf.compat.v1.keras.backend.set_session(sess)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from keras.layers import Input, Reshape, Embedding, Concatenate, Dropout, Dense
from keras.models import Model
from keras.callbacks import EarlyStopping

In [None]:
# Read the prepared feature table and discard its first column
# (presumably a saved row index - confirm against the CSV).
df = pd.read_csv('cleaned_data/data_with_features.csv')
first_column = df.columns[0]
df = df.drop(columns=[first_column])

In [None]:
#add right target
# For every user, pair each row with the `target` value of that user's
# following row (stored as `real_target`).
# Assumes the rows of each user are already in chronological order - TODO confirm.
df_lst = []

for user in np.unique(df['EthicaID']):
    df_single_user = df.loc[df['EthicaID'] == user].reset_index(drop=True)
    # Skip the first target so the remaining values line up one row earlier;
    # the user's last row has no successor and is left as NaN by the concat.
    real_target = (
        df_single_user[['target']]
        .iloc[1:]
        .rename(columns={'target': 'real_target'})
        .reset_index(drop=True)
    )
    df_single_user = pd.concat([df_single_user, real_target], axis=1)
    df_lst.append(df_single_user)

In [None]:
# Stack the per-user frames row-wise; each frame keeps its own 0..n-1 index.
df_full = pd.concat(df_lst)

In [None]:
# Show the last rows of the combined frame.
df_full.tail(5)

In [None]:
# Count missing values per column.
df_full.isnull().sum()

In [None]:
# Discard every row that contains at least one missing value.
df_full = df_full.dropna(axis=0, how='any')

In [None]:
# List the available column names before picking the feature subset.
df_full.columns

In [None]:
#select feature subset
# .copy() detaches the subset from df_full: the label-encoding cell writes into
# df_model with .loc, which on a plain slice raises SettingWithCopyWarning and
# leaves it ambiguous whether df_full is modified as well.
df_model = df_full[['use_duration', 'duration_ongoing_session',
       'time_to_next_app', 'age_category', 'sex',
       'notification', 'battery', 'category', 'hours',
       'weekday', 'geohash', 'location_cluster', 'target']].copy()

In [None]:
# Categorical features; note that the current row's `target` is itself used as
# an input feature.
cat_col = ['age_category', 'sex', 'notification', 'battery', 'category', 'hours',
           'weekday', 'geohash', 'location_cluster', 'target']
# Continuous features.
num_col_lst =  ['use_duration', 'duration_ongoing_session', 'time_to_next_app']
# Name of the label column in df_full (was 'real target', which matches no column).
target = 'real_target'

In [None]:
#make encoders
# One LabelEncoder per categorical column, stored in `encoders` keyed by
# column name; the column in df_model is replaced by its integer codes.
encoders = {}
for col in cat_col:
    encoder = LabelEncoder().fit(df_model[col].values)
    encoders[col] = encoder
    df_model.loc[:, col] = encoder.transform(df_model[col].values)
    print('{0}: {1}'.format(col, encoder.classes_))

In [None]:
#test train split
# 80 % training data; the remaining 20 % is halved into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_model, df_full['real_target'], test_size=0.2, random_state=1
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

In [None]:
# Put the categorical columns first and the numeric ones last in every split;
# later cells address the columns by position.
feature_cols = cat_col + num_col_lst
X_train, X_val, X_test = (
    part[feature_cols].copy() for part in (X_train, X_valid, X_test)
)

In [None]:
# Turn each label Series into an (n_samples, 1) array of strings.
y, y_val, y_test = (
    labels.astype(str).values.reshape(-1, 1)
    for labels in (y_train, y_valid, y_test)
)

In [None]:
def prepare_targets(y_train, y_valid, y_test):
    """Binarise the label arrays of all three splits.

    A MultiLabelBinarizer is fitted on ``y_train`` only and then applied to
    every split.

    Returns:
        The transformed training, validation and test labels, in that order.
    """
    binarizer = MultiLabelBinarizer().fit(y_train)
    return tuple(binarizer.transform(part) for part in (y_train, y_valid, y_test))

In [None]:
#encode targets
# Replace the string label arrays with the matrices returned by prepare_targets.
# NOTE(review): y, y_val and y_test are overwritten, so re-running this cell
# passes the already-encoded arrays back in; re-run the label-reshaping cell first.
y, y_val, y_test =  prepare_targets(y, y_val, y_test)

In [None]:
#prepare features for transformation
# Categorical columns become ordered integer categories, numeric columns float32.
for frame in (X_train, X_val, X_test):
    for col in cat_col:
        frame[col] = frame[col].astype('int').astype('category').cat.as_ordered()
    for col in num_col_lst:
        frame[col] = frame[col].astype('float32')

In [None]:
# Categorical view of the encoded feature columns (same columns as `cat_col`),
# used to count the distinct values per feature.
df_cat = df_model[cat_col].astype('category')

In [None]:
#define embeddings size
# (column name, number of distinct categories) for each categorical feature
cat_sizes = [(col, len(df_cat[col].cat.categories)) for col in cat_col]
cat_sizes
# (number of categories, embedding width): half the cardinality rounded up, capped at 50
embedding_sizes = [
    (n_categories, min(50, (n_categories + 1) // 2))
    for _, n_categories in cat_sizes
]
embedding_sizes

In [None]:
# Model inputs per split: one Series for each categorical column, followed by
# a single frame holding all numeric columns.
n_cat = len(cat_col)

X_array = [X_train.iloc[:, pos] for pos in range(n_cat)]
X_val_array = [X_val.iloc[:, pos] for pos in range(n_cat)]
X_test_array = [X_test.iloc[:, pos] for pos in range(n_cat)]

X_array.append(X_train.iloc[:, n_cat:])
X_val_array.append(X_val.iloc[:, n_cat:])
X_test_array.append(X_test.iloc[:, n_cat:])

len(X_array), len(X_val_array), len(X_test_array)

In [None]:
#define model structure
def fit_model(neurons, n_classes=3, dropout_rate=0.2):
    """Build and compile the embedding-based classifier.

    Despite its name this function does not train anything; it only
    constructs the network and compiles it.

    The model has one single-value input plus an Embedding layer per entry of
    the global ``cat_col`` (sizes taken from the global ``embedding_sizes``),
    and one further input for the ``num_col_lst`` numeric features. The
    embeddings and numeric features are concatenated and passed through one
    hidden layer, dropout and a softmax output.

    Args:
        neurons: Width of the hidden Dense layer.
        n_classes: Width of the softmax output layer. Must equal the number
            of columns of the encoded targets. Defaults to 3, the value that
            was previously hard-coded.
        dropout_rate: Dropout rate applied after the hidden layer. Defaults
            to 0.2, the value that was previously hard-coded.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    inputs = []
    embed_layers = []
    for (c, (in_size, out_size)) in zip(cat_col, embedding_sizes):
        i = Input(shape=(1, ))  # one integer code per sample
        o = Embedding(in_size, out_size, name=c)(i)
        # Drop the length-1 axis so the embedding can be concatenated with the others.
        o = Reshape(target_shape=(out_size,))(o)
        inputs.append(i)
        embed_layers.append(o)
    embed = Concatenate()(embed_layers)
    cont_input = Input(shape=(len(num_col_lst),))
    inputs.append(cont_input)
    x = Concatenate()([embed, cont_input])
    # NOTE(review): he_normal is usually paired with ReLU-type activations;
    # kept as in the original to leave the trained behavior unchanged.
    dense = Dense(neurons, activation= 'sigmoid', kernel_initializer= 'he_normal')(x)
    d = Dropout(dropout_rate)(dense)

    output = Dense(n_classes, activation= 'softmax')(d)
    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Stop when validation accuracy has not improved for 5 epochs and restore
# the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt

In [None]:
#hyperparameter optimization loop
import os
from datetime import datetime

# Log file, saved weights and accuracy plots of this experiment share one directory.
results_dir = 'model/Baseline/Baseline_all_features_excl_special+Dropout/'
# Create it before training so a missing folder cannot make open()/save()/
# savefig() fail after the (expensive) fit has already finished.
os.makedirs(results_dir, exist_ok=True)

start_time = datetime.now()
for num_neurons in [256]:
    for batch_size in [256]:
        model = fit_model(num_neurons)
        history = model.fit(X_array, y, epochs=50, validation_data=(X_val_array, y_val),
                            callbacks=[es], verbose=1, batch_size=batch_size)
        _, accuracy = model.evaluate(X_val_array, y_val, verbose=1)

        # Common file stem for the weights and the plot of this configuration.
        run_prefix = (results_dir + 'model_neurons_' + str(num_neurons) +
                      'batch_size' + str(batch_size) + '_seed1')

        with open(results_dir + 'Baseline_all_features_excl_special+Dropout.txt', 'a') as fp:
            fp.writelines(['Number of Neurons: ' + str(num_neurons) +
                           '\n', 'Batch Size: ' + str(batch_size) + '\n', 'seed1\n',
                           'Accuracy: ' + str(accuracy) + '\n\n'])
        model.save(run_prefix + '.h5')

        # Training vs. validation accuracy per epoch.
        fig = plt.figure(figsize=(10, 6))
        plt.plot(history.history['accuracy'], color = 'dodgerblue')
        plt.plot(history.history['val_accuracy'], color = 'rebeccapurple')
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.xticks(range(0, len(history.history['accuracy'])+1, 2))
        plt.legend(['train', 'valid'], loc='upper left')
        plt.savefig(run_prefix + '.png')
        # Display the figure, then release it so figures do not pile up when
        # the grid contains more than one configuration.
        plt.show()
        plt.close(fig)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

In [None]:
import keras

In [None]:
#insert best hyperparameters to obtain predictive accuracy
# Reload the saved model and report its accuracy on the test, validation and
# training splits (in that order), printing each value and appending all three
# to the experiment log.
model = keras.models.load_model('model/Baseline/Baseline_all_features_excl_special+Dropout/model_neurons_256batch_size256_seed1.h5')

evaluations = [
    ('Best model accuracy on test set: ', X_test_array, y_test),
    ('Best model accuracy on valid set: ', X_val_array, y_val),
    ('Best model accuracy on train set: ', X_array, y),
]

report_lines = []
split_accuracies = []
for label, features, labels in evaluations:
    _, split_accuracy = model.evaluate(features, labels, verbose=1)
    print(split_accuracy)
    split_accuracies.append(split_accuracy)
    report_lines.append(label + str(split_accuracy) + '\n\n')

accuracy_test, accuracy_valid, accuracy_train = split_accuracies

with open('model/Baseline/Baseline_all_features_excl_special+Dropout/Baseline_all_features_excl_special+Dropout.txt', 'a') as fp:
    fp.writelines(report_lines)

In [None]:
#write training time to file
# The file is opened in append mode, so the entry ends with a blank line;
# otherwise the next run's first entry would continue on the same line.
with open('model/Baseline/Baseline_all_features_excl_special+Dropout/Baseline_all_features_excl_special+Dropout.txt', 'a') as fp:
    fp.writelines(['Time needed for training: ', str(end_time - start_time), '\n\n'])