In [1]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRegressor

# "from ../utils import ..." is not valid Python syntax. The original path suggests the
# helper module sits one directory above this notebook (TODO confirm), so make that
# directory importable and import the module by name instead.
import sys

if ".." not in sys.path:
    sys.path.append("..")

from utils import (
    get_samples_by_points_num,
    split_data, split_data_regression, show_scores,
    visualize_errors, calc_mean_df, show_scores_per_point, add_coordinates,
)

import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE(review): TensorFlow keeps its own random state, which this call does not seed.
np.random.seed(42)

In [3]:
# Load the fingerprinting samples; the stored index column is consumed on read and
# then replaced by a fresh 0..n-1 index.
df = (
    pd.read_csv('../data/fingerprinting_2.csv', index_col='Unnamed: 0')
    .reset_index(drop=True)
)

In [4]:
# Show the loaded frame: five RSSI readings plus Square, Point and Orientation labels.
df

Unnamed: 0,Server-RSSI-1,Server-RSSI-2,Server-RSSI-3,Server-RSSI-4,Server-RSSI-5,Square,Point,Orientation
0,-55,-71,-69,-57,-58,s0,0,0.0
1,-57,-64,-58,-48,-58,s0,0,0.0
2,-57,-64,-58,-48,-58,s0,0,0.0
3,-57,-64,-57,-48,-58,s0,0,0.0
4,-57,-65,-57,-48,-58,s0,0,0.0
...,...,...,...,...,...,...,...,...
47795,-63,-51,-58,-52,-51,s7,39,3.0
47796,-63,-51,-58,-52,-51,s7,39,3.0
47797,-62,-51,-58,-52,-51,s7,39,3.0
47798,-63,-51,-58,-52,-51,s7,39,3.0


Because the data was collected manually, we lost 1 point in the s1 square and 1 point in the s2 square. This doesn't affect our dataset much, so we simply duplicate the last point: it was in the middle of the square, so duplicating it can barely make the representation of the squares worse.

In [5]:
# Print how many samples were recorded for each of the 12 squares (s0 .. s11).
# Group by the plain column name rather than a one-element list: with a list key,
# recent pandas versions expect a tuple in get_group() and deprecate the scalar form.
grouped = df.groupby("Square")

for i in range(12):
    print(len(grouped.get_group(f's{i}')["Point"]))

4000
3900
3900
4000
4000
4000
4000
4000
4000
4000
4000
4000


In [7]:
# 3-D scatter of three of the RSSI channels, coloured by square.
marker_style = {
    'size': 6,
    'line': {'width': 1, 'color': 'DarkSlateGrey'},
}

fig = px.scatter_3d(
    df,
    x="Server-RSSI-1",
    y="Server-RSSI-2",
    z="Server-RSSI-5",
    color="Square",
)

# Assign the result back so the figure is not echoed as a cell output before show().
fig = fig.update_traces(marker=marker_style, selector={'mode': 'markers'})

fig.show()

# Simple Classification

In [8]:
# Turn the square labels ("s0" .. "s11") into integer class ids.
# Going through str makes the cell safe to re-run: values that are already integers
# pass through unchanged instead of failing on x[1:].
df["Square"] = df["Square"].astype(str).str.lstrip("s").astype("int64")

In [9]:
# Split the raw samples into train / validation / test parts (70% / 15% / 15%).
# NOTE(review): split_data is a project helper; points_num=40 presumably is the number
# of measurement points per square — confirm against utils.
X_train, y_train, X_val, y_val, X_test, y_test = split_data(
    df, points_num=40, train_part=0.7, validation_part=0.15, test_part=0.15)

In [10]:
# Min-max scale the five RSSI channels. The scaler is fitted on the training split
# only and then applied unchanged to every split.
numeric = ['Server-RSSI-1', 'Server-RSSI-2', 'Server-RSSI-3', 'Server-RSSI-4', 'Server-RSSI-5']
scaler = MinMaxScaler().fit(X_train[numeric])

for split_frame in (X_train, X_test, X_val):
    split_frame[numeric] = scaler.transform(split_frame[numeric])

## Support Vector Machine

In [16]:
# RBF-kernel SVM on the raw samples, scored on the validation split.
model_svc = SVC(
    kernel="rbf",
    C=10,
    probability=True,
    random_state=42,
)
model_svc = model_svc.fit(X_train, y_train)

show_scores(model_svc, X_val, y_val)
show_scores_per_point(model_svc, X_val, y_val)

Number of mislabeled points out of a total 7200 points : 5398
Number of points not in top 5 predicted probabilities total points: 7200; failed: 1867
Accuracy of classifier for each square: 
0.34 0.28 0.31 
0.00 0.27 0.44 
0.33 0.09 0.57 
0.00 0.19 0.17 


## Random Forest Classifier

In [17]:
# Depth-limited random forest on the raw samples, scored on the validation split.
model_rfc = RandomForestClassifier(
    max_depth=10,
    random_state=42,
)
model_rfc = model_rfc.fit(X_train, y_train)

show_scores(model_rfc, X_val, y_val)
show_scores_per_point(model_rfc, X_val, y_val)

Number of mislabeled points out of a total 7200 points : 5134
Number of points not in top 5 predicted probabilities total points: 7200; failed: 1364
Accuracy of classifier for each square: 
0.49 0.65 0.31 
0.00 0.19 0.46 
0.20 0.03 0.32 
0.00 0.45 0.33 


## XGBoost Classifier

In [18]:
# Gradient-boosted trees on the raw samples, scored on the validation split.
model_xgb = XGBClassifier(
    max_depth=10,
    random_state=42,
)
model_xgb = model_xgb.fit(X_train, y_train)

show_scores(model_xgb, X_val, y_val)
show_scores_per_point(model_xgb, X_val, y_val)

Number of mislabeled points out of a total 7200 points : 5002
Number of points not in top 5 predicted probabilities total points: 7200; failed: 1594
Accuracy of classifier for each square: 
0.50 0.25 0.21 
0.04 0.14 0.63 
0.32 0.12 0.54 
0.17 0.41 0.33 


## Fully-connected Neural Network

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2 

In [20]:
# Two hidden layers of 128 ReLU units (L2-regularised, each followed by dropout),
# feeding a 12-way softmax. The network takes the 5 RSSI features as input.
model = Sequential([
    Dense(128, activation='relu', kernel_regularizer=l2(0.001), input_dim=5),
    Dropout(0.2),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.2),
    Dense(12, activation='softmax'),
])

In [21]:
# Train with Nesterov-momentum SGD. `learning_rate` replaces the deprecated `lr` alias.
# NOTE(review): `decay` is kept as in the original; newer Keras optimizers no longer
# accept it — confirm against the TensorFlow version in use.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

# Stop training once the validation loss stops improving (callback defaults are used).
es = EarlyStopping(monitor='val_loss',
                   mode='auto')

# Fix the one-hot width to the 12 output units, so a split that happens to lack the
# highest class id still produces targets of the right shape.
model.fit(X_train, to_categorical(y_train, num_classes=12),
          validation_data=(X_val, to_categorical(y_val, num_classes=12)),
          epochs=40,
          batch_size=120,
          callbacks=[es])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40


<tensorflow.python.keras.callbacks.History at 0x1b8d90a8880>

In [22]:
# Evaluate on the validation split. The one-hot width is pinned to the 12 output
# units so the targets match the softmax layer even if a class is absent from y_val.
score = model.evaluate(X_val, to_categorical(y_val, num_classes=12), batch_size=128)



# Classification with sample means

As our data has some noise, we wanted to reduce it by averaging. But we also want to keep more data for training/testing, so we decided to take a middle ground (the *golden mean*). We divide the data for each point (100 samples) into 10 groups, then take the mean of each Server-RSSI-X over the 10 samples in a group. So each point is now represented by 10 averaged samples instead of 100 raw ones, but the representation is more descriptive and consistent. This was considered a better option than filtering methods (Bayesian/Kalman), since our system is static and the noise is minor.

In [23]:
# Build the averaged dataset.
# NOTE(review): calc_mean_df is a project helper; with merge_points_num=10 it presumably
# averages each group of 10 samples per point, as the preceding text describes — confirm.
mean_df = calc_mean_df(df, merge_points_num=10)
mean_df

Unnamed: 0,Server-RSSI-1,Server-RSSI-2,Server-RSSI-3,Server-RSSI-4,Server-RSSI-5,Square,Point,Orientation
0,-55.0,-71.0,-69.0,-57.0,-58.0,0.0,0.0,0.0
1,-57.0,-66.0,-57.0,-48.0,-57.0,0.0,0.0,0.0
2,-57.0,-67.0,-57.0,-48.0,-58.0,0.0,0.0,0.0
3,-57.0,-65.0,-57.0,-48.0,-58.0,0.0,0.0,0.0
4,-57.0,-65.0,-57.0,-48.0,-59.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
5,-64.0,-63.0,-48.0,-39.0,-55.0,11.0,39.0,3.0
6,-63.0,-61.0,-48.0,-39.0,-54.0,11.0,39.0,3.0
7,-63.0,-62.0,-48.0,-39.0,-55.0,11.0,39.0,3.0
8,-63.0,-61.0,-48.0,-39.0,-55.0,11.0,39.0,3.0


In [24]:
# Re-split using the averaged samples (70% / 15% / 15%). This overwrites the
# X_*/y_* variables that held the raw-sample split.
X_train, y_train, X_val, y_val, X_test, y_test = split_data(
    mean_df, points_num=40, train_part=0.7, validation_part=0.15, test_part=0.15)

In [25]:
# Min-max scale the five RSSI channels of the averaged data. The scaler is fitted on
# the training split only and then applied unchanged to every split.
numeric = ['Server-RSSI-1', 'Server-RSSI-2', 'Server-RSSI-3', 'Server-RSSI-4', 'Server-RSSI-5']
scaler = MinMaxScaler().fit(X_train[numeric])

for split_frame in (X_train, X_test, X_val):
    split_frame[numeric] = scaler.transform(split_frame[numeric])

## Support Vector Machine

In [26]:
# SVM (default kernel) on the averaged samples, scored on the validation split.
model_svc = SVC(
    C=30,
    probability=True,
    random_state=42,
)
model_svc = model_svc.fit(X_train, y_train)

show_scores(model_svc, X_val, y_val)
show_scores_per_point(model_svc, X_val, y_val)

Number of mislabeled points out of a total 700 points : 543
Number of points not in top 5 predicted probabilities total points: 700; failed: 129
Accuracy of classifier for each square: 
0.33 0.20 0.16 
0.50 0.02 0.35 
0.33 0.15 0.15 
0.32 0.08 0.08 


## Random Forest Classifier

In [27]:
# Depth-limited random forest on the averaged samples, scored on the validation split.
model_rfc = RandomForestClassifier(
    max_depth=10,
    random_state=42,
)
model_rfc = model_rfc.fit(X_train, y_train)

show_scores(model_rfc, X_val, y_val)
show_scores_per_point(model_rfc, X_val, y_val)

Number of mislabeled points out of a total 700 points : 496
Number of points not in top 5 predicted probabilities total points: 700; failed: 135
Accuracy of classifier for each square: 
0.25 0.06 0.08 
0.25 0.18 0.62 
0.33 0.20 0.00 
0.63 0.30 0.52 


In [190]:
# With tuning
# NOTE(review): apart from max_depth=10 and random_state=42 (already used for model_rfc),
# these values look like scikit-learn's defaults, which would explain why the scores
# below match the untuned forest exactly — confirm the tuning result.
rfc_hyperparams = {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100, 'random_state': 42}
tuned_rfc = RandomForestClassifier(**rfc_hyperparams).fit(X_train,y_train)
show_scores(tuned_rfc,X_val,y_val)
show_scores_per_point(tuned_rfc,X_val,y_val)

Number of mislabeled points out of a total 700 points : 496
Number of points not in top 5 predicted probabilities total points: 700; failed: 135
Accuracy of classifier for each square: 
0.25 0.06 0.08 
0.25 0.18 0.62 
0.33 0.20 0.00 
0.63 0.30 0.52 


## XGBoost Classifier

In [28]:
# Gradient-boosted trees on the averaged samples, scored on the validation split.
model_xgb = XGBClassifier(
    max_depth=10,
    random_state=42,
)
model_xgb = model_xgb.fit(X_train, y_train)

show_scores(model_xgb, X_val, y_val)
show_scores_per_point(model_xgb, X_val, y_val)

Number of mislabeled points out of a total 700 points : 472
Number of points not in top 5 predicted probabilities total points: 700; failed: 195
Accuracy of classifier for each square: 
0.35 0.36 0.36 
0.17 0.23 0.57 
0.20 0.05 0.18 
0.80 0.17 0.48 


In [29]:
# XGBoost with hand-picked hyperparameters, scored on the validation split.
xgb_hyperparams = dict(
    booster='gbtree',
    colsample_bytree=0.9,
    max_depth=10,
    n_estimators=20,
    objective='multi:softmax',
    random_state=42,
)
tuned_xgb = XGBClassifier(**xgb_hyperparams)
tuned_xgb = tuned_xgb.fit(X_train, y_train)

show_scores(tuned_xgb, X_val, y_val)
show_scores_per_point(tuned_xgb, X_val, y_val)

Number of mislabeled points out of a total 700 points : 500
Number of points not in top 5 predicted probabilities total points: 700; failed: 172
Accuracy of classifier for each square: 
0.33 0.14 0.20 
0.18 0.28 0.58 
0.20 0.05 0.15 
0.67 0.17 0.43 


## Fully connected Neural Network

In [30]:
# Fresh network for the averaged data: two hidden layers of 128 ReLU units
# (L2-regularised, each followed by dropout) feeding a 12-way softmax over 5 inputs.
model = Sequential([
    Dense(128, activation='relu', kernel_regularizer=l2(0.001), input_dim=5),
    Dropout(0.2),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.2),
    Dense(12, activation='softmax'),
])

In [None]:
# Train with Nesterov-momentum SGD. `learning_rate` replaces the deprecated `lr` alias.
# NOTE(review): `decay` is kept as in the original; newer Keras optimizers no longer
# accept it — confirm against the TensorFlow version in use.
sgd = SGD(learning_rate=0.001, decay=1e-6, momentum=0.2, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

# Stop training once the validation loss stops improving (callback defaults are used).
es = EarlyStopping(monitor='val_loss', mode='auto')

# Fix the one-hot width to the 12 output units, so a split that happens to lack the
# highest class id still produces targets of the right shape.
model.fit(X_train, to_categorical(y_train, num_classes=12),
          validation_data=(X_val, to_categorical(y_val, num_classes=12)),
          epochs=200,
          batch_size=128,
          callbacks=[es],
          verbose=0)


In [34]:
# Evaluate on the validation split. The one-hot width is pinned to the 12 output
# units so the targets match the softmax layer even if a class is absent from y_val.
score = model.evaluate(X_val, to_categorical(y_val, num_classes=12), batch_size=128)



# Summary

As we can see, the results are not the best even with this amount of data. The *golden mean* approach didn't bring any improvements. It was even harmful to the Neural Network, since it had less data for training. The Neural Network handles the noise better than we do with our approach.  
Also, the experiment shows that our metric isn't informative: we only know the accuracy of the square classification, which tells us nothing about how far the predicted point was from the real one. So in future work, we need to estimate the real error in distance units (meters/cm).

Fingerprinting is very convenient for data gathering, but in future work we plan to focus on predicting the exact position, since it will give us more information about accuracy, and the results should be easier to explain (including the current results). Also, for the end user it makes much more sense to get a specific position rather than an abstract representation of squares.