In [None]:
# Mount Google Drive at /gdrive so the result files referenced later in the
# notebook are reachable. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
%cd /gdrive/My Drive/Tesi Notebooks

In [None]:
#Data quality libraries
from imblearn.under_sampling import RandomUnderSampler

import tensorflow as tf
import numpy as np
import os
import random
import json
import datetime
import time
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import absolute
from numpy import mean
from numpy import std
import warnings
import logging
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# NOTE(review): this is set after `import tensorflow`; presumably it still takes
# effect because no GPU work has run yet - confirm.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# Short aliases used by the model-building code further down.
tfk = tf.keras
tfkl = tf.keras.layers
# Record the TensorFlow version in the cell output.
print(tf.__version__)

In [None]:
# Random seed for reproducibility
seed = 42

# Seed each random number source the notebook relies on: Python, NumPy, TensorFlow.
random.seed(seed)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably affects only child processes, not this running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
# TF1-compat seeding call, kept alongside the TF2 call above.
tf.compat.v1.set_random_seed(seed)

In [None]:
def _residual_block(inputs, filters, project_shortcut=True):
    """Apply one residual block made of three Conv1D stages (kernels 8, 5, 3).

    Layers are created in the same order as in the original hand-unrolled code
    (main path first, then the shortcut).

    Args:
        inputs: Keras tensor feeding the block.
        filters: number of feature maps produced by every convolution.
        project_shortcut: when True the shortcut goes through a 1x1 Conv1D so
            its channel count matches `filters`; when False the input is only
            batch-normalized (use it when the channel counts already agree).

    Returns:
        The Keras tensor produced by the block after the final ReLU.
    """
    conv_x = tfkl.Conv1D(filters=filters, kernel_size=8, padding='same')(inputs)
    conv_x = tfkl.BatchNormalization()(conv_x)
    conv_x = tfkl.Activation('relu')(conv_x)

    conv_y = tfkl.Conv1D(filters=filters, kernel_size=5, padding='same')(conv_x)
    conv_y = tfkl.BatchNormalization()(conv_y)
    conv_y = tfkl.Activation('relu')(conv_y)

    # No activation here: the ReLU is applied after the sum with the shortcut.
    conv_z = tfkl.Conv1D(filters=filters, kernel_size=3, padding='same')(conv_y)
    conv_z = tfkl.BatchNormalization()(conv_z)

    shortcut_y = inputs
    if project_shortcut:
        # expand channels for the sum
        shortcut_y = tfkl.Conv1D(filters=filters, kernel_size=1, padding='same')(shortcut_y)
    shortcut_y = tfkl.BatchNormalization()(shortcut_y)

    block_output = tfkl.add([shortcut_y, conv_z])
    return tfkl.Activation('relu')(block_output)


def build_ResNet(input_shape, classes, n_feature_maps=64):
    """Build and compile a three-block 1D ResNet classifier.

    The network stacks three residual blocks (`n_feature_maps`, then twice
    `2 * n_feature_maps` filters), followed by global average pooling and a
    softmax layer with `classes` units.

    Args:
        input_shape: shape of one sample, passed to `Input`
            (presumably `(timesteps, channels)` since the layers are Conv1D -
            confirm against the caller).
        classes: number of output units of the softmax layer.
        n_feature_maps: filters of the first block; blocks 2 and 3 use twice
            this value. Defaults to 64, the value that was previously hard-coded.

    Returns:
        A compiled `tf.keras` Model using categorical cross-entropy, the Adam
        optimizer with default settings, and the accuracy metric.
    """
    input_layer = tfkl.Input(input_shape)

    # BLOCK 1 and BLOCK 2 change the channel count, so the shortcut is projected.
    output_block_1 = _residual_block(input_layer, n_feature_maps)
    output_block_2 = _residual_block(output_block_1, n_feature_maps * 2)

    # BLOCK 3 keeps the channel count: no need to expand channels on the shortcut.
    output_block_3 = _residual_block(output_block_2, n_feature_maps * 2, project_shortcut=False)

    # FINAL
    gap_layer = tfkl.GlobalAveragePooling1D()(output_block_3)
    output_layer = tfkl.Dense(classes, activation='softmax')(gap_layer)

    model = tfk.models.Model(inputs=input_layer, outputs=output_layer)

    # Compile the model. `metrics` is given as a list: that is the documented
    # form, and a bare string is not accepted by every Keras version.
    model.compile(
        loss=tfk.losses.CategoricalCrossentropy(),
        optimizer=tfk.optimizers.Adam(),
        metrics=['accuracy'],
    )

    # Return the model
    return model

In [None]:
def create_values(x_values_v, y_values_v):
  """Fit `y = C1 + C2 * ln(x)` to the samples and evaluate the curve on a fixed grid.

  Args:
    x_values_v: sequence of x coordinates; their natural logarithm is the
      single regression feature.
    y_values_v: sequence of observed values, one per x coordinate.

  Returns:
    The fitted curve evaluated at x = 0.2, 0.3, ..., 1.0 (nine points).
  """
  # One feature column holding ln(x) for every sample.
  log_feature = np.log(np.array(x_values_v).reshape(-1, 1))

  regressor = LinearRegression()
  regressor.fit(log_feature, y_values_v)
  intercept, slope = regressor.intercept_, regressor.coef_[0]

  # Evaluation grid: 0.2 to 1.0 in steps of 0.1, rounded to avoid float drift.
  grid = []
  for step in range(2, 11):
    grid.append(round(0.1 * step, 1))

  return intercept + slope * np.log(grid)

In [None]:
def get_values(file_path):
  """Load experiment results and return the fitted accuracy curve.

  The JSON file must contain a list of objects with an "Accuracy_6" entry and,
  depending on the experiment type inferred from `file_path`:
    * 'horizontal' in the path: x is the object's "data_volume";
    * 'vertical' in the path: x is 0.1 * the object's "num_clients".
  'horizontal' takes precedence when both words appear in the path.

  Args:
    file_path: path of the JSON results file.

  Returns:
    The values returned by `create_values` for the collected (x, accuracy) pairs.

  Raises:
    ValueError: if the path names neither experiment type, or the file holds
      no experiments (previously this surfaced as an unrelated fitting error).
  """
  with open(file_path, "r") as json_file:
    data = json.load(json_file)

  if 'horizontal' in file_path:
    x_values = [object_experiment["data_volume"] for object_experiment in data]
  elif 'vertical' in file_path:
    x_values = [0.1 * object_experiment["num_clients"] for object_experiment in data]
  else:
    raise ValueError(
        "Cannot infer the experiment type from %r: expected 'horizontal' or "
        "'vertical' in the path" % (file_path,))

  if not x_values:
    raise ValueError("No experiments found in %r" % (file_path,))

  # The target is the same for both experiment types.
  y_values = [object_experiment["Accuracy_6"] for object_experiment in data]

  # It retrieves the list of values from the fitted curve
  return create_values(x_values, y_values)

In [None]:
def create_dataframe(file_path, meta_data, dataframe, extend):
  """Append the experiments stored in a JSON results file to `dataframe`.

  Every appended row is `meta_data` followed by accuracy, number of clients
  and data volume, so `dataframe` must have `len(meta_data) + 3` columns.

  Args:
    file_path: path of the JSON results file (a list of objects with
      "Accuracy_6", "num_clients" and "data_volume" entries).
    meta_data: list of leading column values shared by all rows; not modified.
    dataframe: pandas DataFrame that receives the rows. It is modified in place.
    extend: when true, nine extra rows are appended from the curve returned by
      `get_values`, one per grid step 2..10:
        * 'horizontal' path: 10 clients, data volume 0.2 ... 1.0;
        * 'vertical' path: 2 ... 10 clients, data volume 1.0.
      'horizontal' takes precedence when both words are in the path, which is
      the same precedence `get_values` uses when it fits the curve.

  Returns:
    The same `dataframe` object that was passed in.

  Raises:
    ValueError: if `extend` is true and the path names neither experiment
      type. The check runs before any row is appended.
  """
  with open(file_path, "r") as json_file:
    data = json.load(json_file)

  is_horizontal = 'horizontal' in file_path
  is_vertical = 'vertical' in file_path
  if extend and not (is_horizontal or is_vertical):
    raise ValueError(
        "Cannot infer the experiment type from %r: expected 'horizontal' or "
        "'vertical' in the path" % (file_path,))

  # Measured experiments.
  for object_exp in data:
    row = list(meta_data)
    row.extend([object_exp['Accuracy_6'], object_exp['num_clients'], object_exp['data_volume']])
    dataframe.loc[len(dataframe)] = row

  if extend:
    values = get_values(file_path)
    # Grid steps 2, 3, ..., 10 matching the nine points of the fitted curve.
    for object_val, q_volume in zip(values, range(2, 11)):
      # Extrapolated values above 1.0 are replaced by 0.99.
      if object_val > 1.0:
        object_val = 0.99
      if is_horizontal:
        items_to_append = [object_val, 10, round(0.1 * q_volume, 1)]
      else:
        items_to_append = [object_val, q_volume, 1.0]
      row = list(meta_data)
      row.extend(items_to_append)
      dataframe.loc[len(dataframe)] = row

  return dataframe

In [None]:
# Train regressors (decision tree, random forest, chained linear SVR) that predict
# the number of clients and the data volume per client from the dataset/model
# metadata and a threshold accuracy value.


# Create an empty dataframe with named columns; the last two columns are the targets.
columns = ['dataset_type', 'dataset_length', 'length_sequence', 't_model', 'num_param', 'tot_clients', 'accuracy', 'num_clients', 'data_volume']
dataframe = pd.DataFrame(columns=columns)

# Metadata rows for ChlorineConcentration (data_chlo) and StarLightCurves (data_star),
# in the order of the first six entries of `columns`.
# FIXME: `model_info` is not defined in the visible cells, so this cell raises
# NameError on a fresh kernel; the same parameter count is used for both datasets.
data_chlo = ['Sensor', 3740, 166, 'Resnet', model_info.count_params(), 10] # original note: model_info.count_params should be fixed
data_star = ['Sensor', 7236, 1024, 'Resnet', model_info.count_params(), 10]

# Result files to load: horizontal and vertical experiments for each dataset.
files = ['/gdrive/My Drive/Tesi Notebooks/ChlorineConcentration/results/Data Volume/horizontal_results.json', '/gdrive/My Drive/Tesi Notebooks/ChlorineConcentration/results/Data Volume/vertical_results.json', '/gdrive/My Drive/Tesi Notebooks/StarLightCurves/results/Data Volume/results_horizontal.json', '/gdrive/My Drive/Tesi Notebooks/StarLightCurves/results/Data Volume/results_vertical.json']

# Append measured and extrapolated rows (extend=True) with the matching dataset metadata.
for file_path in files:
  if 'ChlorineConcentration' in file_path:
    dataframe = create_dataframe(file_path, data_chlo, dataframe, True)
  elif 'StarLightCurves' in file_path:
    dataframe = create_dataframe(file_path, data_star, dataframe, True)


# Targets: 'num_clients' and 'data_volume'; inputs: every other column, including 'accuracy'.
target_columns = columns[-2:]
input_feature = columns[:-2]
X_train_dataframe = dataframe.drop(columns=target_columns)
Y_train_dataframe = dataframe.drop(columns=input_feature)

# One-hot encode the two string-valued input columns.
categorical_columns = ['dataset_type', 't_model']
X_train_dataframe = pd.get_dummies(X_train_dataframe, columns=categorical_columns, prefix=categorical_columns)

# Hold out 10% of the rows for testing.
X_training, X_testing, Y_training, Y_testing = train_test_split(X_train_dataframe, Y_train_dataframe, test_size=0.1, random_state=42)

# Decision tree: hold-out MSE.
model = DecisionTreeRegressor()
model.fit(X_training, Y_training)
print("MSE DecisionTreeRegressor: %.3f" % mean_squared_error(Y_testing, model.predict(X_testing)))

# Decision tree: repeated 10-fold cross-validation (3 repeats) on the training split.
# NOTE(review): cross_val_score presumably fits clones of the estimator, so the
# explicit fit below looks redundant - confirm before removing.
model = DecisionTreeRegressor()
model.fit(X_training, Y_training)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_training, Y_training, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# The scorer is the negated MSE; take the absolute value to report a positive MSE.
n_scores = absolute(n_scores)
# summarize performance
print('Kfold Cross Validation DecisionTree MSE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Random forest: hold-out MSE.
model = RandomForestRegressor()
model.fit(X_training, Y_training)
print("MSE RandomForestRegressor: %.3f" % mean_squared_error(Y_testing, model.predict(X_testing)))

# Random forest: same repeated 10-fold cross-validation as for the decision tree.
# NOTE(review): the explicit fit below looks redundant here as well - confirm.
model = RandomForestRegressor()
model.fit(X_training, Y_training)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_training, Y_training, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# The scorer is the negated MSE; take the absolute value to report a positive MSE.
n_scores = absolute(n_scores)
# summarize performance
print('Kfold Cross Validation RandomForest MSE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Linear SVR wrapped in a RegressorChain (presumably because there are two target
# columns - confirm); hold-out MSE.
model = LinearSVR()
wrapper = RegressorChain(model)
wrapper.fit(X_training, Y_training)
print('RegressionChain SVR MSE: %.3f' % mean_squared_error(Y_testing, wrapper.predict(X_testing)))