# Import data

Keep on working on classification problems.

Steps to do: 

* Transform the non-detected WAPs to a small value (replace the placeholder 100 dBm with -105 dBm)


In [83]:
import pandas as pd

In [84]:
# Locations of the two UJIndoorLoc CSV files, relative to this notebook
training_csv = '../../data/raw/UJIndoorLoc/trainingData.csv'
validation_csv = '../../data/raw/UJIndoorLoc/validationData.csv'

# `data` is the frame the models are trained on; `valid` is kept aside for evaluation
data = pd.read_csv(training_csv)
valid = pd.read_csv(validation_csv)

In [63]:
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

def postResample_class(y_true, y_preds):
    """
    Print and collect the evaluation metrics of a classifier.

    Prints, in this order: the confusion matrix (rows 'Real', columns 'Pred'),
    the accuracy, Cohen's kappa and the scikit-learn classification report
    (precision, recall, f1-score and support per class).

    Parameters
    ----------
    y_true : the observed class labels.
    y_preds : the labels predicted by the model, in the same order as y_true.

    Returns
    -------
    list
        [confusion_matrix, accuracy, kappa, report]
    """
    # cross-tabulate observed labels (rows) against predicted labels (columns)
    confusion_matrix = pd.crosstab(y_true, y_preds, rownames=['Real'], colnames=['Pred'])
    print(confusion_matrix)
    print('')

    # accuracy: share of positions where the prediction equals the observed label
    hits = 0
    for predicted, observed in zip(y_preds, y_true):
        if predicted == observed:
            hits += 1
    accuracy = hits / len(y_true)
    print("The accuracy of that model is: ", round(accuracy,4))

    # Cohen's kappa
    kappa = cohen_kappa_score(y_true, y_preds)
    print('The kappa of that model is: ', round(kappa,4))
    print('')

    # per-class precision and recall, as a text report
    report = classification_report(y_true, y_preds)
    print(report)

    return [confusion_matrix, accuracy, kappa, report]


###############################################################################################
import matplotlib.pyplot as plt
import seaborn as sns

def plot_errors_building(df, y_true, y_pred):
    """
    Scatter-plot the observations by position, highlighting misclassified ones.

    Each row of `df` is drawn at (LONGITUDE, LATITUDE). Rows where `y_true`
    differs from `y_pred` are drawn in red, the rest in light grey.

    Parameters
    ----------
    df : DataFrame with 'LONGITUDE' and 'LATITUDE' columns.
    y_true : the observed labels for the rows of `df`.
    y_pred : the predicted labels for the rows of `df`.

    Returns
    -------
    None. The figure is displayed with plt.show().
    """
    errors = y_true != y_pred
    data_plot = pd.DataFrame({
        'LONG': df['LONGITUDE'],
        'LAT': df['LATITUDE'],
        'err': errors
    })

    # Size the figure when it is created: changing rcParams after drawing does
    # not resize the current figure and would leak into every later plot.
    fig, ax = plt.subplots(figsize=(20, 10))

    # A dict palette pins each colour to its hue level, so the colours stay
    # correct even when only one of False/True is present in `errors`.
    # x_jitter / y_jitter are not passed: seaborn documents them as
    # non-functional for scatterplot.
    sns.scatterplot(x='LONG', y='LAT', hue='err', data=data_plot,
                    palette={False: 'lightgrey', True: 'red'}, ax=ax)
    ax.set_title('Plotting building errors')
    # plt.show must be called; the bare attribute reference did nothing
    plt.show()

In [85]:
# Names of the first 520 columns; these are used as the WAP feature columns below
wap_names = data.columns[:520]

In [76]:
# Recode the placeholder value 100 (WAP not detected) as -105 in every WAP
# column, applying exactly the same transformation to both data sets
for frame in (data, valid):
    frame[wap_names] = frame[wap_names].replace(100, -105)

## BUILDINGID: Check the change in performance 

In [77]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [87]:
# Hold out 20% of the training data as a test split (seeded for reproducibility)
train, test = train_test_split(data, train_size=0.80, random_state=42)
# Decision tree predicting the building from the WAP columns. The tree is seeded
# as well: ties between equally good splits are otherwise broken at random, so
# the metrics could change from one run to the next.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(train[wap_names], train['BUILDINGID'])
# metrics on the held-out test split
results = postResample_class(
    y_preds=tree.predict(test[wap_names]), y_true=test['BUILDINGID']
)
# metrics on the separate validation set
results = postResample_class(
    y_preds=tree.predict(valid[wap_names]), y_true=valid['BUILDINGID']
)

Pred     0     1     2
Real                  
0     1078     0     0
1        0  1001     0
2        0    10  1899

The accuracy of that model is:  0.9975
The kappa of that model is:  0.9961

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       0.99      1.00      1.00      1001
           2       1.00      0.99      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     534    2    0
1       1  304    2
2       0    3  265

The accuracy of that model is:  0.9928
The kappa of that model is:  0.9886

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       0.98      0.99      0.99       307
           2       0.99      0.99      0.99       268

    accuracy                    

In [88]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the library defaults, fitted on the
# training split to predict the building from the WAP columns
KNN = KNeighborsClassifier().fit(train[wap_names], train['BUILDINGID'])
# metrics on the held-out test split
results = postResample_class(
    y_true=test['BUILDINGID'], y_preds=KNN.predict(test[wap_names])
)
# metrics on the separate validation set
results = postResample_class(
    y_true=valid['BUILDINGID'], y_preds=KNN.predict(valid[wap_names])
)

Pred     0    1     2
Real                 
0     1078    0     0
1        0  995     6
2        0    1  1908

The accuracy of that model is:  0.9982
The kappa of that model is:  0.9972

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       1.00      0.99      1.00      1001
           2       1.00      1.00      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     534    0    2
1       0  298    9
2       0    0  268

The accuracy of that model is:  0.9901
The kappa of that model is:  0.9844

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      0.97      0.99       307
           2       0.96      1.00      0.98       268

    accuracy                         