# Import data

Keep on working on classification problems.

Steps to do: 

* Transform the non detected waps to small values (100 dBm to -110dBm)


In [124]:
import pandas as pd

In [137]:
data = pd.read_csv('../../data/raw/UJIndoorLoc/trainingData.csv')
valid = pd.read_csv('../../data/raw/UJIndoorLoc/validationData.csv')

In [138]:
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

def postResample_class(y_true, y_preds):
    """
    Given a vector with true results and the predictions of the model, 
    returns the confusion matrix, accuracy, kappa and a report(recall and recap) as a list. 
    """    
    # check the metrics with a confusion matrix
    confusion_matrix = pd.crosstab(y_true, y_preds, rownames=['Real'], colnames=['Pred'])
    print(confusion_matrix)
    print('')

    # print the accuracy
    accuracy = sum(1 for x,y in zip(y_preds, y_true) if x == y) / len(y_true)
    print("The accuracy of that model is: ", round(accuracy,4))

    # kappa 
    kappa = cohen_kappa_score(y1 = y_true, y2 = y_preds)
    print('The kappa of that model is: ', round(kappa,4))
    print('')

    # recall and recap
    report = classification_report(y_true=y_true, y_pred=y_preds) 
    print(report)
    
    results = [confusion_matrix, accuracy, kappa, report]
    return results


###############################################################################################
import matplotlib.pyplot as plt
import seaborn as sns

def plot_errors_building(df, y_true, y_pred):
    """
    Given a dataframe, the true values and the predictions for the building, 
    return a scatter plot highlighting the errors
    """
    errors = y_true != y_pred
    data_plot = pd.DataFrame({
        'LONG': df['LONGITUDE'],
        'LAT': df['LATITUDE'],
        'err': errors
    })

    sns.scatterplot(x='LONG', y='LAT', hue='err', data=data_plot, 
                    palette=['lightgrey','red'], x_jitter=True, y_jitter=True)
    plt.title('Plotting building errors')
    plt.rcParams['figure.figsize'] = [20, 10]
    plt.show

In [139]:
wap_names = data.iloc[:,:520].columns

In [140]:
data[wap_names] = data[wap_names].replace(to_replace=100, value=-105)
valid[wap_names] = valid[wap_names].replace(to_replace=100, value=-105)

## BUILDINGID: Check the change in performance 

In [129]:
from sklearn.model_selection import train_test_split

# train and test
train, test = train_test_split(data, train_size=0.80, random_state=42)

In [130]:
from sklearn.linear_model import LogisticRegression
# tree creation
log_reg = LogisticRegression()
log_reg.fit(train[wap_names], train['BUILDINGID'])
# check the results
results = postResample_class(
    y_preds=log_reg.predict(test[wap_names]), y_true=test['BUILDINGID']
)
# check the results
results = postResample_class(
    y_preds=log_reg.predict(valid[wap_names]), y_true=valid['BUILDINGID']
)

Pred     0     1     2
Real                  
0     1078     0     0
1        0  1001     0
2        0     8  1901

The accuracy of that model is:  0.998
The kappa of that model is:  0.9968

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       0.99      1.00      1.00      1001
           2       1.00      1.00      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     535    1    0
1       0  307    0
2       0    0  268

The accuracy of that model is:  0.9991
The kappa of that model is:  0.9986

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      1.00      1.00       307
           2       1.00      1.00      1.00       268

    accuracy                     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [114]:
from sklearn.tree import DecisionTreeClassifier
# tree creation
tree = DecisionTreeClassifier()
tree.fit(train[wap_names], train['BUILDINGID'])
# check the results
results = postResample_class(
    y_preds=tree.predict(test[wap_names]), y_true=test['BUILDINGID']
)
# check the results
results = postResample_class(
    y_preds=tree.predict(valid[wap_names]), y_true=valid['BUILDINGID']
)

Pred     0     1     2
Real                  
0     1078     0     0
1        0  1000     1
2        0    11  1898

The accuracy of that model is:  0.997
The kappa of that model is:  0.9953

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       0.99      1.00      0.99      1001
           2       1.00      0.99      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     534    2    0
1       0  304    3
2       0    9  259

The accuracy of that model is:  0.9874
The kappa of that model is:  0.9801

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       0.97      0.99      0.98       307
           2       0.99      0.97      0.98       268

    accuracy                     

In [88]:
from sklearn.neighbors import KNeighborsClassifier

# tree creation
KNN = KNeighborsClassifier()
KNN.fit(train[wap_names], train['BUILDINGID'])
# check the results
results = postResample_class(
    y_preds=KNN.predict(test[wap_names]), y_true=test['BUILDINGID']
)
# check the results
results = postResample_class(
    y_preds=KNN.predict(valid[wap_names]), y_true=valid['BUILDINGID']
)

Pred     0    1     2
Real                 
0     1078    0     0
1        0  995     6
2        0    1  1908

The accuracy of that model is:  0.9982
The kappa of that model is:  0.9972

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       1.00      0.99      1.00      1001
           2       1.00      1.00      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     534    0    2
1       0  298    9
2       0    0  268

The accuracy of that model is:  0.9901
The kappa of that model is:  0.9844

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      0.97      0.99       307
           2       0.96      1.00      0.98       268

    accuracy                         

## FLOOR

In [141]:
# train and test
train, test = train_test_split(data, train_size=0.80, random_state=42)

In [142]:
# tree creation
log_reg = LogisticRegression()
log_reg.fit(train[wap_names], train['FLOOR'])
# check the results
results = postResample_class(
    y_preds=log_reg.predict(test[wap_names]), y_true=test['FLOOR']
)
# check the results
results = postResample_class(
    y_preds=log_reg.predict(valid[wap_names]), y_true=valid['FLOOR']
)

Pred    0    1    2    3    4
Real                         
0     850   20    8    8    0
1       6  994   10    0    0
2       3    7  862   10    0
3       0    1   15  978    4
4       0    0    0    6  206

The accuracy of that model is:  0.9754
The kappa of that model is:  0.9682

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       886
           1       0.97      0.98      0.98      1010
           2       0.96      0.98      0.97       882
           3       0.98      0.98      0.98       998
           4       0.98      0.97      0.98       212

    accuracy                           0.98      3988
   macro avg       0.98      0.97      0.98      3988
weighted avg       0.98      0.98      0.98      3988

Pred    0    1    2    3   4
Real                        
0     124    7    1    0   0
1      22  405   35    0   0
2       1    7  281   16   1
3       0    0    6  163   3
4       1    0    0    9  29

The accuracy of that

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [104]:
# tree creation
tree = DecisionTreeClassifier()
tree.fit(train[wap_names], train['FLOOR'])
# check the results
results = postResample_class(
    y_preds=tree.predict(test[wap_names]), y_true=test['FLOOR']
)
# check the results
results = postResample_class(
    y_preds=tree.predict(valid[wap_names]), y_true=valid['FLOOR']
)

Pred    0    1    2    3    4
Real                         
0     850   22    2   11    1
1      26  936   36   12    0
2       6   27  820   29    0
3       3    9   27  958    1
4       0    0    0    7  205

The accuracy of that model is:  0.9451
The kappa of that model is:  0.9289

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       886
           1       0.94      0.93      0.93      1010
           2       0.93      0.93      0.93       882
           3       0.94      0.96      0.95       998
           4       0.99      0.97      0.98       212

    accuracy                           0.95      3988
   macro avg       0.95      0.95      0.95      3988
weighted avg       0.95      0.95      0.95      3988

Pred    0    1    2    3   4
Real                        
0     109   13   10    0   0
1      51  306   88   17   0
2       6   23  209   68   0
3       0    3   30  135   4
4       1    0    2   25  11

The accuracy of that

In [106]:
# tree creation
KNN = KNeighborsClassifier()
KNN.fit(train[wap_names], train['FLOOR'])
# check the results
results = postResample_class(
    y_preds=KNN.predict(test[wap_names]), y_true=test['FLOOR']
)
# check the results
results = postResample_class(
    y_preds=KNN.predict(valid[wap_names]), y_true=valid['FLOOR']
)

Pred    0    1    2    3    4
Real                         
0     883    3    0    0    0
1      45  958    7    0    0
2       3   17  831   31    0
3       6    1    8  980    3
4       0    0    0    0  212

The accuracy of that model is:  0.9689
The kappa of that model is:  0.9597

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       886
           1       0.98      0.95      0.96      1010
           2       0.98      0.94      0.96       882
           3       0.97      0.98      0.98       998
           4       0.99      1.00      0.99       212

    accuracy                           0.97      3988
   macro avg       0.97      0.97      0.97      3988
weighted avg       0.97      0.97      0.97      3988

Pred    0    1    2    3   4
Real                        
0     116    9    6    1   0
1      80  318   63    1   0
2      12   17  192   85   0
3       3    0   13  155   1
4       7    0    0    9  23

The accuracy of that