# BUILDING

In [318]:
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

def postResample_class(y_true, y_preds):
    """
    Given a vector with true results and the predictions of the model, 
    returns the confusion matrix, accuracy, kappa and a report(recall and recap) as a list. 
    """    
    # check the metrics with a confusion matrix
    confusion_matrix = pd.crosstab(y_true, y_preds, rownames=['Real'], colnames=['Pred'])
    print(confusion_matrix)
    print('')

    # print the accuracy
    accuracy = sum(1 for x,y in zip(y_preds, y_true) if x == y) / len(y_true)
    print("The accuracy of that model is: ", round(accuracy,4))

    # kappa 
    kappa = cohen_kappa_score(y1 = y_true, y2 = y_preds)
    print('The kappa of that model is: ', round(kappa,4))
    print('')

    # recall and recap
    report = classification_report(y_true=y_true, y_pred=y_preds) 
    print(report)

In [319]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

data = pd.read_csv('../../data/raw/UJIndoorLoc/trainingData.csv')
valid = pd.read_csv('../../data/raw/UJIndoorLoc/validationData.csv')

In [320]:
wap_names = data.iloc[:,:520].columns

In [321]:
def model_check(temp_data, temp_valid, predict):

    # train and test
    train, test = train_test_split(temp_data, train_size=0.80, random_state=42)

    logReg = LogisticRegression()
    logReg.fit(train[wap_names], train[predict])

    # check the results
    results = postResample_class(
        y_preds=logReg.predict(test[wap_names]), y_true=test[predict]
    )
    # check the results
    results = postResample_class(
        y_preds=logReg.predict(temp_valid[wap_names]), y_true=temp_valid[predict]
    )
    
    return logReg

## No pre process

In [322]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# train and test
train, test = train_test_split(data, train_size=0.80, random_state=42)

logReg = LogisticRegression()
logReg.fit(train[wap_names], train['BUILDINGID'])

# check the results
results = postResample_class(
    y_preds=logReg.predict(test[wap_names]), y_true=test['BUILDINGID']
)
# check the results
results = postResample_class(
    y_preds=logReg.predict(valid[wap_names]), y_true=valid['BUILDINGID']
)

Pred     0     1     2
Real                  
0     1078     0     0
1        0  1001     0
2        0     8  1901

The accuracy of that model is:  0.998
The kappa of that model is:  0.9968

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       0.99      1.00      1.00      1001
           2       1.00      1.00      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     536    0    0
1       0  306    1
2       0    0  268

The accuracy of that model is:  0.9991
The kappa of that model is:  0.9986

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      1.00      1.00       307
           2       1.00      1.00      1.00       268

    accuracy                     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## dbm value changes

In [323]:
data[wap_names] = data[wap_names].replace(to_replace=100, value=-105)
valid[wap_names] = valid[wap_names].replace(to_replace=100, value=-105)

In [324]:
model_check(temp_data=data, temp_valid=valid)

TypeError: model_check() missing 1 required positional argument: 'predict'

## Min and max normalization

In [None]:
norm_df = data.copy()
norm_valid = valid.copy()

norm_df[wap_names] = norm_df[wap_names].apply(lambda x: (x - x.min())/(x.max() - x.min()), axis='columns', result_type='expand')
norm_valid[wap_names] = norm_valid[wap_names].apply(lambda x: (x - x.min())/(x.max() - x.min()), axis='columns', result_type='expand')

In [None]:
# removing missing values, by finding out the missing values
norm_df['id'] = norm_df.index + 1

In [None]:
norm_df_clean = norm_df.dropna()
norm_valid_clean = norm_valid.dropna()

In [None]:
len(norm_valid) - len(norm_valid_clean)

In [None]:
len(norm_df) - len(norm_df_clean)

In [None]:
model_check(temp_data=norm_df_clean, temp_valid=norm_valid_clean, predict='BUILDINGID')

## FLOOR

In [None]:
model_check(temp_data=norm_df_clean, temp_valid=norm_valid_clean, predict='FLOOR')

## Duplicated values?

In [None]:
dupl_df = norm_df_clean.copy()
dupl_val = norm_valid_clean.copy()

In [None]:
# select observations based on unique values for LATITUDE and LONGITUDE
dupl_df = dupl_df.drop_duplicates(['LATITUDE', 'LONGITUDE'])

In [None]:
model_check(temp_data=dupl_df, temp_valid=dupl_val, predict='BUILDINGID')

In [None]:
model_check(temp_data=dupl_df, temp_valid=dupl_val, predict='FLOOR')

## Merge building and floor to create a new feature 

In [None]:
norm_df_clean['BUILDINGID_FLOOR'] = norm_df_clean['BUILDINGID'].astype(str) + "_" + norm_df_clean['FLOOR'].astype(str)
norm_valid_clean['BUILDINGID_FLOOR'] = norm_valid_clean['BUILDINGID'].astype(str) + "_" + norm_valid_clean['FLOOR'].astype(str)

In [None]:
model_check(temp_data=norm_df_clean, temp_valid=norm_valid_clean, predict='BUILDINGID_FLOOR')

In [None]:
dupl_df['BUILDINGID_FLOOR'] = dupl_df['BUILDINGID'].astype(str) + "_" + dupl_df['FLOOR'].astype(str)
dupl_val['BUILDINGID_FLOOR'] = dupl_val['BUILDINGID'].astype(str) + "_" + dupl_val['FLOOR'].astype(str)

In [325]:
model = model_check(temp_data=dupl_df, temp_valid=dupl_val, predict='BUILDINGID_FLOOR')

Pred  0_0  0_1  0_2  0_3  1_0  1_1  1_2  1_3  2_0  2_1  2_2  2_3  2_4
Real                                                                 
0_0     5    0    0    0    0    0    0    0    0    0    0    0    0
0_1     1    8    0    0    0    0    0    0    0    0    0    0    0
0_2     0    1    5    0    0    0    0    0    0    0    0    0    0
0_3     0    0    0    4    0    0    0    0    0    0    0    0    0
1_0     0    0    0    0   17    1    0    0    0    0    0    0    0
1_1     0    0    0    0    0   10    0    0    0    0    0    0    0
1_2     0    0    0    0    0    0   14    0    0    0    0    0    0
1_3     0    0    0    0    0    0    0    9    0    0    0    0    0
2_0     0    0    0    0    0    0    0    0   13    0    0    0    0
2_1     0    0    0    0    0    0    0    0    0    9    0    0    0
2_2     0    0    0    0    0    0    0    0    0    0    8    0    0
2_3     0    0    0    0    0    0    0    0    0    0    0   29    0
2_4     0    0    0 

In [283]:
preds = model.predict(dupl_val[wap_names])

In [284]:
valid_check = dupl_val[['BUILDINGID','FLOOR']].copy()

In [285]:
valid_check['preds'] = preds

In [286]:
preds_col = valid_check['preds'].str.split('_', n=2, expand=True).rename(columns={0: 'preds_b', 1: 'preds_f'})

In [291]:
valid_check = pd.concat([valid_check, preds_col], axis=1, ignore_index=False)

In [293]:
postResample_class(y_preds=valid_check['preds_b'].astype(float), y_true=valid_check['BUILDINGID'])

Pred  0.0  1.0  2.0
Real               
0     536    0    0
1       0  307    0
2       0    1  267

The accuracy of that model is:  0.9991
The kappa of that model is:  0.9986

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      1.00      1.00       307
           2       1.00      1.00      1.00       268

    accuracy                           1.00      1111
   macro avg       1.00      1.00      1.00      1111
weighted avg       1.00      1.00      1.00      1111



In [295]:
postResample_class(y_preds=valid_check['preds_f'].astype(float), y_true=valid_check['FLOOR'])

Pred  0.0  1.0  2.0  3.0  4.0
Real                         
0     122   10    0    0    0
1      19  427   16    0    0
2       2   23  275    5    1
3       0    0   11  161    0
4       1    0    0    9   29

The accuracy of that model is:  0.9127
The kappa of that model is:  0.8774

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       132
           1       0.93      0.92      0.93       462
           2       0.91      0.90      0.90       306
           3       0.92      0.94      0.93       172
           4       0.97      0.74      0.84        39

    accuracy                           0.91      1111
   macro avg       0.91      0.89      0.90      1111
weighted avg       0.91      0.91      0.91      1111



Based on the previous analysis, we can use the logistic regression to predict the building and then, create a different model by building

## Summarising the best steps and store the result

In [308]:
# summarise all the steps done since now
# 1. load the data
import pandas as pd

data = pd.read_csv('../../data/raw/UJIndoorLoc/trainingData.csv')
valid = pd.read_csv('../../data/raw/UJIndoorLoc/validationData.csv')
wap_names = data.iloc[:,:520].columns.tolist()

# 2. transform the values from 100dBm to -105dBm
data[wap_names] = data[wap_names].replace(to_replace=100, value=-105)
valid[wap_names] = valid[wap_names].replace(to_replace=100, value=-105)


# 3. Normalize the waps by row by min and max
norm_df = data.copy()
norm_valid = valid.copy()

norm_df[wap_names] = norm_df[wap_names].apply(lambda x: (x - x.min())/(x.max() - x.min()), axis='columns', result_type='expand')
norm_df['id'] = str(norm_df.index + 1)
norm_df_clean = norm_df.dropna() # missing values are created in 76 rows, as they have 0 variance (further exploration)
norm_valid[wap_names] = norm_valid[wap_names].apply(lambda x: (x - x.min())/(x.max() - x.min()), axis='columns', result_type='expand')


# 4. Store results
norm_df_clean.to_csv('../../data/clean/norm_training.csv')
norm_valid.to_csv('../../data/clean/norm_validation.csv')

## Deal with bias

In [217]:
# main problems are placed in:
#     * building 1 floor 0. Accuracy 60%
#     * building 2 floor 3 Accuracy 75%
dupl_df.groupby(['BUILDINGID_FLOOR']).size()

BUILDINGID_FLOOR
0_0     54
0_1     63
0_2     26
0_3     20
1_0     75
1_1     48
1_2     75
1_3     40
2_0     58
2_1     59
2_2     36
2_3    110
2_4     28
dtype: int64