## New Prediction strategy (Task 3)

Import the utility libraries

In [8]:
import pandas as pd
import numpy as np
import random
import statistics
import tensorflow

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LSTM, Dense

from bayes_opt import BayesianOptimization

In [9]:
path = ""
#insert the path of the directory in which the CVSs are stored
dataset_df = pd.read_csv(path+'balanced_df.csv')

In [10]:
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label
0,1,5,1,149,1213920000.0,66,1
1,1,5,1,149,1377562000.0,71,1
2,1,5,1,126,1388448000.0,71,1
3,1,5,1,159,1388448000.0,71,1
4,1,5,1,16,1106006000.0,63,1


## 1.1 Create visits

Took account of the date (and the order) of the ”composite event” but not the order of the ”sub-events”.<br>
So we encoded the visits, then we decided to choose a representative for the visit <br>
and we decided to store the number of ”sub-events” that took place during the visit.

In [11]:
dataset_df = dataset_df.groupby(['idcentro', 'idana', 'data', 'sesso', 'etaevento', 'label'], as_index=False).aggregate(list)

In [12]:
dataset_df = dataset_df[['idcentro', 'idana', 'sesso', 'data', 'codici', 'etaevento', 'label']]

A row represents a **visita** in which there is a column **codici** , <br>
with a list of all the codes of the events that happened that day.

In [13]:
dataset_df = dataset_df.rename(columns={'data' : 'datavisita', 'etaevento' : 'etavisita'})
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,datavisita,codici,etavisita,label
0,1,5,1,315532800.0,[161],38,1
1,1,5,1,504921600.0,[161],44,1
2,1,5,1,536457600.0,"[125, 161]",45,1
3,1,5,1,880934400.0,"[161, 96]",55,1
4,1,5,1,959817600.0,"[161, 161, 166]",58,1


In [14]:
dataset_df['numeroeventi'] = dataset_df['codici'].apply(len)

In [15]:
#choose a class representative on the code
dataset_df['rappresentantevisita'] = dataset_df['codici'].apply(lambda x : statistics.mode(x))
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,datavisita,codici,etavisita,label,numeroeventi,rappresentantevisita
0,1,5,1,315532800.0,[161],38,1,1,161
1,1,5,1,504921600.0,[161],44,1,1,161
2,1,5,1,536457600.0,"[125, 161]",45,1,2,125
3,1,5,1,880934400.0,"[161, 96]",55,1,2,161
4,1,5,1,959817600.0,"[161, 161, 166]",58,1,3,161


In [16]:
dataset_df = dataset_df.drop(columns='codici', axis=1)

In [17]:
dataset_df = dataset_df[['idcentro', 'idana', 'sesso', 'datavisita', 'rappresentantevisita', 'numeroeventi', 'etavisita', 'label']]
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,datavisita,rappresentantevisita,numeroeventi,etavisita,label
0,1,5,1,315532800.0,161,1,38,1
1,1,5,1,504921600.0,161,1,44,1
2,1,5,1,536457600.0,125,2,45,1
3,1,5,1,880934400.0,161,2,55,1
4,1,5,1,959817600.0,161,3,58,1


In [18]:
#save the dataframe of task3 in a new file csv
dataset_df.to_csv(path+'t3_df.csv', mode='w', header=True, index=False)

## 1.2 Evaluation of the dataset

Evaluate the dataset over a T-LSTM

In [19]:
features = dataset_df.drop("label", axis=1)
label = dataset_df["label"]

In [20]:
scaler = StandardScaler()
#Calculates normalisation parameters (mean and standard deviation) on training features,
#Transforms training and test features according to these parameters.
features[features.columns[:]] = scaler.fit_transform(features[features.columns[:]])

In [21]:
#20% of the data will be used as a test set, where X are sets of features and y are sets of label with random state equal to 17
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=17)

In [22]:
#transform the training and test data into a structure that can be used as input for an LSTM neural network model
X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))
print("number of test set:", X_test.shape[0])
print("number of train set:", X_train.shape[0])

number of test set: 467519
number of train set: 1870075


In [23]:
model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])))
#adds a dense (fully connected) structure to the model which has only one unit and uses the sigmoid as activation function
model.add(Dense(1, activation="sigmoid"))

In [24]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [25]:
history = model.fit(X_train, y_train, epochs=1, batch_size=16, validation_data=(X_test, y_test))

2023-02-09 19:55:06.604896: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [26]:
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 86.92%


In [27]:
model.save(path+'Model_Task3_LSTM.h5')

### test prediction

In [43]:
t3_model = load_model(path+'Model_Task3_LSTM.h5')
# do this if you wanto to load the previously trained model

In [44]:
y_pred = t3_model.predict(X_test)



In [45]:
y_pred_binary = (y_pred > 0.5).astype(int)

In [46]:
f_score = f1_score(y_test, y_pred_binary)
print("F-Score: {:.2f}".format(f_score))

F-Score: 0.85


In [47]:
conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[227719    676]
 [ 60481 178643]]


## 1.3 Bayesian Optimization

Optimisation of neurons and learning rate

In [34]:
def create_model(neurons=32, learning_rate=0.001):
    """
    The function creates a sequential model, then adds an LSTM layer 
    and a dense layer with a single neuron and sigmoid as activation 
    function to it. The layer has 'neurons' number of neurons. The model
    is then compiled using the binary cross-entropy loss function,
    the Adam optimizer and accuracy as metrics.

    Parameters
    ----------
    neurons : int
      represents the number of neurons in layer
    learning_rate : float
      represents the learning rate of optimizer

    Returns
    -------
    model
    """
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model


def hyperparameter_tuning(neurons, learning_rate):
    """
    The function performs hyperparameter tuning on the model created by
    the function 'create_model'. The model is fit to the training data.

    Parameters
    ----------
    neurons : int
      represents the number of neurons in layer
    learning_rate : float
      represents the learning rate of optimizer

    Returns
    -------
    Accuracy of the model 
      given the test set evaluate the accuracy
    """
    neurons = int(neurons)
    model = create_model(neurons, learning_rate)
    model.fit(X_train, y_train)
    return model.evaluate(X_test, y_test, verbose=0)[1]

#perform Bayesian Optimization on 'hyperparameter_tuning' function, the optimization is performed
#with respect to hyperparameter, neurons and learning rate
optimizer = BayesianOptimization(f=hyperparameter_tuning, pbounds={"neurons": (30, 40), "learning_rate": (0.001, 0.005)})
#init_points specifies the number of initial random points to sample, n_iter specifies 
#number of Bayesian optimization iteration to perform after the initial random points have been evaluated.
optimizer.maximize(init_points=5, n_iter=10)
#run 'hyperparameter_tuning' function with different combinations of hyperparameters to find the optimal 
#values that give the best accuracy on the test data.

|   iter    |  target   | learni... |  neurons  |
-------------------------------------------------
| [0m1        [0m | [0m0.8693   [0m | [0m0.004272 [0m | [0m35.67    [0m |
| [95m2        [0m | [95m0.8694   [0m | [95m0.004551 [0m | [95m30.62    [0m |
| [0m3        [0m | [0m0.8693   [0m | [0m0.004712 [0m | [0m31.26    [0m |
| [95m4        [0m | [95m0.8694   [0m | [95m0.001292 [0m | [95m37.47    [0m |
| [0m5        [0m | [0m0.8692   [0m | [0m0.004234 [0m | [0m36.17    [0m |
| [0m6        [0m | [0m0.8693   [0m | [0m0.001071 [0m | [0m37.83    [0m |
| [0m7        [0m | [0m0.8692   [0m | [0m0.00188  [0m | [0m30.6     [0m |
| [0m8        [0m | [0m0.8693   [0m | [0m0.001858 [0m | [0m38.71    [0m |
| [0m9        [0m | [0m0.8693   [0m | [0m0.004692 [0m | [0m33.84    [0m |
| [0m10       [0m | [0m0.8692   [0m | [0m0.003205 [0m | [0m39.02    [0m |
| [0m11       [0m | [0m0.8693   [0m | [0m0.002602 [0m | [0m36.9  

In [35]:
print('best parameters settings: {}'.format(optimizer.max['params']))

best parameters settings: {'learning_rate': 0.0012919876196129017, 'neurons': 37.473873835762035}


In [36]:
params = optimizer.max['params']
neurons = int(params['neurons'])
lr = params['learning_rate']
best_model = create_model(neurons)

In [37]:
best_model.save(path+'Best_Model_nlr.h5') #save model with neurons and learning_rate

### test prediction

In [38]:
best_model = load_model(path+'Best_Model_nlr.h5')
#if you want to load the trained model

In [39]:
y_pred = best_model.predict(X_test)



In [40]:
y_pred_binary = (y_pred > 0.5).astype(int)

In [41]:
f_score = f1_score(y_test, y_pred_binary)
print("F-Score: {:.2f}".format(f_score))

F-Score: 0.51


In [42]:
'''
[[true negative, false positive]
[false negative, true positive]]
'''
conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[ 95807 132588]
 [112081 127043]]
