# Experimental Setup

This notebook uses the data from the [GNFUV Unmanned Surface Vehicles Sensor Data Set 2 Data Set](https://archive.ics.uci.edu/ml/datasets/GNFUV+Unmanned+Surface+Vehicles+Sensor+Data+Set+2). The collection contains eight (2x4) data sets of mobile sensor readings (humidity, temperature) gathered by a swarm of four Unmanned Surface Vehicles (USVs) in a test-bed in Athens, Greece.

**Dataset characteristics**: Multivariate and Sequential 

**Attributes**:
* 'device' = USV ID (String)
* 'humidity' = sensed humidity value from the USV sensor (real value)
* 'temperature' = sensed temperature value from the USV sensor (real value)
* 'experiment' = 1 (constant real value)
* 'time' = the sensing and reporting time (real value)
* 'pi' = Raspberry Pi ID



In [None]:
# Global flag: when True, verbose diagnostic output is printed.
debug = False

In [None]:
import os
import json
from datetime import datetime
from json import JSONDecodeError
import pandas as pd
import numpy as np

def generate_dataset(home_dir,dataset_name):
    """generate_dataset: Load every sensor reading of a dataset into a dataframe.

    The dataset folder (``home_dir/dataset_name``) is scanned for
    sub-folders whose name contains 'pi'. For each of them the first file
    (in sorted name order) is read line by line; every non-blank line is a
    JSON-like record written with single quotes.

    Arguments:
        home_dir [string] -- parent directory
        dataset_name [string] -- target folder name inside the parent directory
    Returns:
        {data_df} -- Pandas dataframe with all the records joined, without
                    the 'experiment', 'device', 'time' and 'pi' columns and
                    without the rows that contain a missing value.
    Raises:
        KeyError -- if one of the columns to remove is missing (for instance
                    because no record could be read at all).
    """
    import warnings

    dataset_dir = os.path.join(home_dir, dataset_name)
    records = []
    skipped = 0

    # Sorted so the row order (and everything derived from it) is reproducible.
    for pi_dir in sorted(os.listdir(dataset_dir)):
        curr_dir = os.path.join(dataset_dir, pi_dir)
        if 'pi' not in pi_dir or not os.path.isdir(curr_dir):
            continue
        data_files = sorted(os.listdir(curr_dir))
        if not data_files:
            continue
        with open(os.path.join(curr_dir, data_files[0]), 'r') as f:
            for raw_line in f:
                # The files use single quotes, which is not valid JSON.
                line = raw_line.strip().replace("'", '"')
                if not line:
                    # A blank line must not end the file early.
                    continue
                try:
                    record = json.loads(line)
                except JSONDecodeError:
                    # Best effort: ignore malformed lines, but keep track of them.
                    skipped += 1
                    continue
                record['time'] = datetime.fromtimestamp(record['time'])
                record['pi'] = pi_dir
                records.append(record)

    if skipped:
        warnings.warn(
            "generate_dataset: skipped {0} malformed line(s)".format(skipped)
        )

    # Building from dicts aligns the values by key, whatever their order is.
    data_df = pd.DataFrame(records)

    #@TODO: Include an argument to decide the target columns we want in the final dataset.
    data_df = data_df.drop(columns=['experiment', 'device', 'time', 'pi'])
    # Missing sensor values are stored as the string ' None'.
    data_df = data_df.replace(to_replace=' None', value=np.nan).dropna()
    return data_df

In [None]:
# Location of the raw data on disk.
home_dir = '../datasets'
dataset = 'GNFUV-USV-Dataset'

# Names of the columns (the variables) we want to study.
target_columns = ['temperature', 'humidity']

# D is the global dataset (all the points).
D = generate_dataset(home_dir, dataset)

if debug:
    for column in target_columns:
        print(D[column].value_counts())

### Distribution of D

In [None]:
import seaborn as sns
# Plot the pairwise (2D) distribution of the columns of D.
# NOTE: the trailing ';' is presumably there to suppress the cell's text output.
sns.pairplot(D, height=2.5);

In [None]:
# Positions of each part inside the list returned by split_data().
Xtrain, Xtest, Ytrain, Ytest = range(4)

def split_data(x,y, test_size=0.2, random_state=42):
    """split_data: Split features and target into train and test subsets.

    Arguments:
        x -- features
        y -- target values, aligned with x
        test_size -- proportion of the samples kept for testing (default 0.2)
        random_state -- seed of the shuffling, so the split is reproducible
                        (default 42)
    Returns:
        [x_train, x_test, y_train, y_test] -- list in the order expected by
        the Xtrain, Xtest, Ytrain and Ytest indices.
    """
    from sklearn.model_selection import train_test_split
    return list(
        train_test_split(x, y, test_size=test_size, random_state=random_state)
    )

def plot_SVR(tx,ty,py,name='', x_name='humidity',y_name='temperature'):
    """plot_SVR: Plot the expected values against the predicted ones and print the MSE.

    Arguments:
        tx -- queries (values of the x axis)
        ty -- expected values for the queries
        py -- values predicted by the model for the queries
        name -- text appended to the title of the figure
        x_name -- label of the x axis
        y_name -- label of the y axis
    """
    from sklearn.metrics import mean_squared_error
    import matplotlib.pyplot as plt

    if debug:
        print("X (queries): " + str(tx))
        print("Y (expected): " + str(ty))
        print("Y (predicted): " + str(py))

    plt.scatter(tx, ty, c='tab:green', label='original')

    queries = np.asarray(tx)
    predicted = np.asarray(py)
    if queries.size == predicted.size:
        # The queries come shuffled from the train/test split: sort them so
        # the prediction is drawn as a curve and not as a zigzag.
        order = np.argsort(queries.ravel(), kind='stable')
        plt.plot(queries.ravel()[order], predicted.ravel()[order],
                 c='tab:red', label='predicted')
    else:
        # More than one value per query: nothing sensible to sort by.
        plt.plot(tx, py, c='tab:red', label='predicted')

    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title('Support Vector Regression ' +str(name))
    plt.legend()
    plt.show()

    print("MSE:", mean_squared_error(ty, py))
    

In [None]:
# _D holds [x_train, x_test, y_train, y_test] built from the global dataset D
X = D.drop(columns=["temperature"])
y = D["temperature"].copy()
_D = split_data(X, y, test_size=0.2)

from sklearn.base import clone
from sklearn.svm import SVR

# `model` is the SVR configuration shared by all the experiments.
model = SVR(kernel='rbf', C=1, gamma=0.1, epsilon=0.1)

# For this experiment X represents humidity and y represents temperature
X = _D[Xtrain].values.reshape(-1, 1)
y = _D[Ytrain].values

# M represents the big brother model with a full vision of all the data.
# It is fitted on a clone: `model.fit` returns `model` itself, so fitting
# `model` directly would let every later `model.fit(...)` overwrite M.
M = clone(model).fit(X, y)

x = _D[Xtest].values.reshape(-1, 1)
py = M.predict(x)

plot_SVR(x, _D[Ytest], py, x_name='humidity', y_name='temperature')


### Data generation
This section prepares the dataset we will work with. The idea is to split the dataset into groups, each of which represents a node in our edge layer.

So, we want to represent our edge layer as a set of nodes ($n \in N$).
Each node $n$ in the edge layer will contain a unique subset of points from the full dataset $D$. Therefore, $D_{n}\subset D \;,\; \forall n \in N\;|\; D_n\not\subset D_m\;, \; \forall m \in N \;, \;m \neq n$.

To this end, we use **KMeans** to generate the clusters and assign each cluster to a different node. *Note*: $K=|N|$

In [None]:
# Needed for generating classification, regression and clustering datasets
import sklearn.datasets as dt
from sklearn.cluster import KMeans

def generate_data(D,K,x_name,y_name):
    """generate_data: Split the dataset into K groups (one per node) with KMeans.

    Arguments:
        D -- dataframe with all the points; KMeans is fitted on all its columns
        K -- number of clusters (nodes) to generate
        x_name -- name of the feature column. Currently not used: every
                  column except y_name is kept as a feature.
        y_name -- name of the target column
    Returns:
        centroids -- the cluster centers found by KMeans
        D_ -- dictionary with one entry per cluster, keyed by the first
              coordinate of the cluster center. Each entry contains:
              "full"  -- the rows of D assigned to the cluster
              "split" -- [x_train, x_test, y_train, y_test] for those rows
    """
    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=K).fit(D)
    centroids = kmeans.cluster_centers_

    D_ = {}
    for i in range(kmeans.n_clusters):
        # NOTE: two centers sharing their first coordinate would share the key.
        key = centroids[i, 0]
        # Positions (not labels) of the rows assigned to cluster i.
        rows = np.where(kmeans.labels_ == i)[0]
        d = D.iloc[rows, :]
        X = d.drop(columns=[y_name])
        y = d[y_name].copy()
        D_[key] = {"full": d, "split": split_data(X, y, test_size=0.2)}

    return centroids, D_
    

def plot_nodes(D,centroids,x_name,y_name,cluster_labels=None):
    """plot_nodes: Scatter plot of the points coloured by cluster, with the centers annotated.

    Arguments:
        D -- dataframe with the points
        centroids -- cluster centers; the first two coordinates are drawn as
                     (x, y), so they are assumed to follow (x_name, y_name)
        x_name -- column of D drawn on the x axis
        y_name -- column of D drawn on the y axis
        cluster_labels -- optional cluster index of every row of D. When it
                          is not given, each point is assigned to its
                          nearest center in the (x_name, y_name) plane.
    """
    import matplotlib.pyplot as plt

    centroids = np.asarray(centroids)
    if cluster_labels is None:
        points = D[[x_name, y_name]].to_numpy(dtype=float)
        # Distance of every point to every center: shape (points, centers).
        distances = np.linalg.norm(
            points[:, None, :] - centroids[None, :, :2], axis=2
        )
        cluster_labels = distances.argmin(axis=1)

    plt.scatter(D[x_name], D[y_name], c=np.asarray(cluster_labels).astype(float))
    plt.colorbar()
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x')
    plt.title('K-means clustering')
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    for i in range(len(centroids)):
        xy = (centroids[i, 0], centroids[i, 1])
        plt.annotate('Node{0}'.format(i), xy,
                     horizontalalignment='right', verticalalignment='top',
                     bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3),
                     arrowprops=dict(arrowstyle='->',
                                     connectionstyle='arc3,rad=0.95',
                                     color='b'))

    plt.show()

In [None]:
# Number of clusters, i.e. number of nodes in the edge layer.
K = 10
centroids, D_ = generate_data(
    D, K, x_name=target_columns[1], y_name=target_columns[0]
)

Let us check the different distributions of each node concerning the two variables studied, humidity and temperature.

In [None]:
# One pairplot per node, titled with the position of the node in D_.
for node_idx, node_key in enumerate(D_.keys()):
    grid = sns.pairplot(D_[node_key]["full"], height=2.5)
    grid.fig.suptitle("Node " + str(node_idx))

In [None]:
from sklearn.base import clone

# m maps each node key to the model trained with the data of that node only.
m = {}
for k in D_.keys():
    # For this experiment X represents humidity and y represents temperature
    X = D_[k]["split"][Xtrain].values.reshape(-1, 1)
    y = D_[k]["split"][Ytrain].values
    # Fit a fresh clone per node: `model.fit` returns `model` itself, so
    # storing its result would make every entry of m the same object,
    # holding only the last fit.
    m[k] = clone(model).fit(X, y)
    x = D_[k]["split"][Xtest].values.reshape(-1, 1)
    py = m[k].predict(x)
    plot_SVR(x, D_[k]["split"][Ytest].values, py,
             x_name='humidity', y_name='temperature')

In [None]:
def generate_outsiders(input_Ds, strategy, outer):
    """generate_outsiders: Add to the training data of each node points taken from the other nodes.

    For every node, `outer` points are taken from each one of the other
    nodes and appended to the training split of the node. The input is not
    modified: the returned entries hold new training sets (with a fresh
    0..n-1 index) and share the remaining data with the input.

    Arguments:
        input_Ds -- dictionary of nodes; each entry contains "full" (the
                    dataframe of the node) and "split"
                    ([x_train, x_test, y_train, y_test])
        strategy [string] -- how the points are chosen from the other node:
                    'percentile' -- the quantiles p, 2p, ..., outer*p of its
                                    data, with p = 1/outer - 0.01
                    'random'     -- rows sampled at random from its data
        outer [int] -- number of points taken from each one of the other nodes
    Returns:
        {output_Ds} -- dictionary with the same keys and layout as input_Ds
    Raises:
        ValueError -- if strategy is not 'percentile' or 'random'
    """
    if strategy not in ('percentile', 'random'):
        raise ValueError("Wrong strategy. Strategy must be ['random','percentile']")

    percentile = ((100/outer)/100) - 0.01
    output_Ds = {}

    for k, node in input_Ds.items():
        x_train = node["split"][Xtrain]
        y_train = node["split"][Ytrain]
        new_x = [x_train]
        new_y = [y_train]

        for k1, other in input_Ds.items():
            if k == k1:
                continue
            # The outsiders of node k come from the data of the OTHER node.
            dk1 = other["full"]
            for o in range(outer):
                if strategy == 'percentile':
                    v = percentile + (percentile*o)
                    vDf = pd.DataFrame(dk1.quantile(v)).T
                else:
                    vDf = dk1.sample()
                # Select by name, so features and target cannot be swapped.
                new_x.append(vDf[x_train.columns])
                new_y.append(vDf[y_train.name])

        # Build new objects instead of writing into the input ones; the
        # index is rebuilt so an outsider never replaces an existing row.
        split = list(node["split"])
        split[Xtrain] = pd.concat(new_x, ignore_index=True)
        split[Ytrain] = pd.concat(new_y, ignore_index=True)

        output_Ds[k] = dict(node)
        output_Ds[k]["split"] = split

    return output_Ds

def get_MSE(model,y_expected,x):
    """get_MSE: Mean squared error of the predictions of a model.

    Arguments:
        model -- fitted model; its predict method is called with x
        y_expected -- expected values; reshaped to a single column
        x -- queries to predict
    Returns:
        the mean squared error between y_expected and the predictions
    """
    from sklearn.metrics import mean_squared_error
    predictions = model.predict(x)
    expected_column = y_expected.reshape(-1, 1)
    return mean_squared_error(expected_column, predictions)


def _fit_node_models(datasets, model):
    """Fit an independent clone of `model` on the training split of every node."""
    from sklearn.base import clone

    fitted = {}
    for key, node in datasets.items():
        X = node["split"][Xtrain].values.reshape(-1, 1)
        y = node["split"][Ytrain].values
        # `fit` returns the estimator itself: without the clone every entry
        # would be the same object, holding only the last fit.
        fitted[key] = clone(model).fit(X, y)
    return fitted


# Sensitivity analysis to evaluate the impact of the size in the outer points set.
def do_sa_outer(outersList,ds,model,M):
    """do_sa_outer: Evaluate the models of the nodes for several sizes of the outsiders set.

    Arguments:
        outersList -- values of `outer` (points taken from each other node)
                      to evaluate
        ds -- dictionary of nodes; each entry contains "full" and "split"
              ([x_train, x_test, y_train, y_test])
        model -- estimator used as a template; it is cloned, never fitted
        M -- fitted model trained with the whole dataset (big brother)
    Returns:
        {experiments} -- experiments[outer][k] describes the results over
        the test data of node k:
            'full'       -- MSE of M
            'percentile' -- MSE of the model of every node trained with
                            outsiders chosen with the percentile strategy
            'random'     -- same, with the random strategy
            'node'       -- MSE of the model of every node trained with its
                            own data only
        The lists follow the order of ds.keys().
    """
    from copy import deepcopy

    # Models without outsiders: they do not depend on `outer` nor on the
    # node under test, so they are trained only once.
    nodes_model = _fit_node_models(ds, model)

    experiments = {}
    for outer in outersList:
        # Step 1: Generate the outsiders using the outer value at each iteration.
        # generate_outsiders may modify the splits it is given, so it works
        # on a private copy and `ds` stays untouched.
        # Step 1.1: Using the percentile strategy
        nodes_dataset_with_outsiders = generate_outsiders(
            deepcopy(ds), 'percentile', outer)
        # Step 1.2: Using the random strategy
        nodes_dataset_with_outsiders_random = generate_outsiders(
            deepcopy(ds), 'random', outer)

        # Step 2: Train the models of the nodes with the outsiders
        nodes_with_outsiders_model = _fit_node_models(
            nodes_dataset_with_outsiders, model)
        nodes_with_outsiders_random = _fit_node_models(
            nodes_dataset_with_outsiders_random, model)

        experiments[outer] = {}

        # Step 3: Evaluate all the models over the test data of each node
        for k in ds.keys():
            # Test data from node k (variable x) and expected data (variable y)
            x = ds[k]["split"][Xtest].values.reshape(-1, 1)
            y_expected = ds[k]["split"][Ytest].values

            experiments[outer][k] = {
                # 3.1 Big brother model
                'full': get_MSE(M, y_expected, x),
                # 3.2 percentile strategy
                'percentile': [
                    get_MSE(nodes_with_outsiders_model[k1], y_expected, x)
                    for k1 in ds.keys()
                ],
                # 3.3 random strategy
                'random': [
                    get_MSE(nodes_with_outsiders_random[k1], y_expected, x)
                    for k1 in ds.keys()
                ],
                # 3.4 using node data only
                'node': [
                    get_MSE(nodes_model[k1], y_expected, x)
                    for k1 in ds.keys()
                ],
            }

    return experiments

In [None]:
import pprint

# Sizes of the outsiders set to evaluate in the sensitivity analysis.
outersList = [5, 7, 10, 15, 17, 20, 25, 30]
experiment1 = do_sa_outer(outersList, D_, model, M)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(experiment1)