# BUILDING

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

def postResample_class(y_true, y_preds):
    """
    Print and return the classification metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        The true class labels.
    y_preds : array-like
        The predicted class labels, in the same order as ``y_true``.

    Returns
    -------
    list
        ``[confusion_matrix, accuracy, kappa, report]`` where
        ``confusion_matrix`` is the ``pd.crosstab`` of real vs. predicted
        labels, ``accuracy`` is the fraction of positions where both agree,
        ``kappa`` is the value of ``cohen_kappa_score`` and ``report`` is the
        text produced by ``classification_report``.
    """
    # confusion matrix: rows are the real labels, columns the predicted ones
    confusion_matrix = pd.crosstab(y_true, y_preds, rownames=['Real'], colnames=['Pred'])
    print(confusion_matrix)
    print('')

    # accuracy: share of positions where prediction and truth coincide
    hits = sum(1 for pred, real in zip(y_preds, y_true) if pred == real)
    accuracy = hits / len(y_true)
    print("The accuracy of that model is: ", round(accuracy,4))

    # kappa
    kappa = cohen_kappa_score(y1 = y_true, y2 = y_preds)
    print('The kappa of that model is: ', round(kappa,4))
    print('')

    # per-class precision, recall and f1
    report = classification_report(y_true=y_true, y_pred=y_preds)
    print(report)

    # Hand the metrics back so callers that assign the result get the values
    # the docstring promises instead of None.
    return [confusion_matrix, accuracy, kappa, report]

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the UJIndoorLoc training and validation CSV files.
# The paths are relative, so they resolve against the notebook's working directory.
data = pd.read_csv('../../data/raw/UJIndoorLoc/trainingData.csv')
valid = pd.read_csv('../../data/raw/UJIndoorLoc/validationData.csv')

In [20]:
# Names of the first 520 columns; these are the feature columns fed to the models.
wap_names = data.columns[:520]

In [21]:
def model_check(temp_data, temp_valid, train_size=0.80, random_state=42, max_iter=100):
    """
    Fit a logistic regression for BUILDINGID and print its metrics.

    The model is trained on a random subset of ``temp_data`` using the
    module-level ``wap_names`` columns as features, then evaluated twice with
    ``postResample_class``: first on the held-out remainder of ``temp_data``,
    then on ``temp_valid``.

    Parameters
    ----------
    temp_data : pandas.DataFrame
        Data to split into train and hold-out parts. Must contain the
        ``wap_names`` columns and ``'BUILDINGID'``.
    temp_valid : pandas.DataFrame
        Separate validation data with the same columns.
    train_size : float, default 0.80
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.
    max_iter : int, default 100
        Passed to ``LogisticRegression``. 100 is scikit-learn's own default,
        so existing calls behave as before; raise it if the solver reports
        that the iteration limit was reached.

    Returns
    -------
    None
        The metrics are printed by ``postResample_class``.
    """
    # train and test
    train, test = train_test_split(
        temp_data, train_size=train_size, random_state=random_state
    )

    logReg = LogisticRegression(max_iter=max_iter)
    logReg.fit(train[wap_names], train['BUILDINGID'])

    # metrics on the held-out part of temp_data
    postResample_class(
        y_preds=logReg.predict(test[wap_names]), y_true=test['BUILDINGID']
    )
    # metrics on the separate validation data
    postResample_class(
        y_preds=logReg.predict(temp_valid[wap_names]), y_true=temp_valid['BUILDINGID']
    )

## No preprocessing

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as an internal test split (fixed seed).
train, test = train_test_split(data, train_size=0.80, random_state=42)

# Baseline: logistic regression with default settings on the wap_names columns.
# NOTE(review): this cell's output ends with "TOTAL NO. of ITERATIONS REACHED LIMIT",
# i.e. the solver stopped at its iteration limit with these defaults.
logReg = LogisticRegression()
logReg.fit(train[wap_names], train['BUILDINGID'])

# Metrics on the internal hold-out split.
results = postResample_class(
    y_preds=logReg.predict(test[wap_names]), y_true=test['BUILDINGID']
)
# Metrics on the separate validation file (this reassigns `results`).
results = postResample_class(
    y_preds=logReg.predict(valid[wap_names]), y_true=valid['BUILDINGID']
)

Pred     0     1     2
Real                  
0     1078     0     0
1        0  1001     0
2        0     8  1901

The accuracy of that model is:  0.998
The kappa of that model is:  0.9968

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       0.99      1.00      1.00      1001
           2       1.00      1.00      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     536    0    0
1       0  306    1
2       0    0  268

The accuracy of that model is:  0.9991
The kappa of that model is:  0.9986

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      1.00      1.00       307
           2       1.00      1.00      1.00       268

    accuracy                     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## dBm value changes

In [19]:
# Recode the value 100 in the wap_names columns as -105, in both data sets.
# NOTE(review): presumably 100 is the dataset's "WAP not detected" marker and -105
# was chosen to sit below the weakest real reading — confirm against the
# UJIndoorLoc documentation.
# This overwrites `data` and `valid` in place, so every later cell sees the recoded values.
data[wap_names] = data[wap_names].replace(to_replace=100, value=-105)
valid[wap_names] = valid[wap_names].replace(to_replace=100, value=-105)

In [22]:
# Fit and evaluate the model on `data` / `valid` as they stand at this point.
model_check(temp_data=data, temp_valid=valid)

Pred     0     1     2
Real                  
0     1078     0     0
1        0  1001     0
2        0     8  1901

The accuracy of that model is:  0.998
The kappa of that model is:  0.9968

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1078
           1       0.99      1.00      1.00      1001
           2       1.00      1.00      1.00      1909

    accuracy                           1.00      3988
   macro avg       1.00      1.00      1.00      3988
weighted avg       1.00      1.00      1.00      3988

Pred    0    1    2
Real               
0     535    1    0
1       0  307    0
2       0    0  268

The accuracy of that model is:  0.9991
The kappa of that model is:  0.9986

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      1.00      1.00       307
           2       1.00      1.00      1.00       268

    accuracy                     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Min and max normalization

In [174]:
def _row_min_max(frame):
    """
    Scale every row of ``frame`` with that row's own minimum and maximum.

    Each value becomes ``(value - row_min) / (row_max - row_min)``. A row whose
    values are all equal has a zero range and therefore comes out as NaN (0/0).

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``frame``.
    """
    row_min = frame.min(axis='columns')
    row_range = frame.max(axis='columns') - row_min
    # Vectorized equivalent of applying the formula row by row.
    return frame.sub(row_min, axis='index').div(row_range, axis='index')


# Work on copies so `data` and `valid` keep their unscaled values.
norm_df = data.copy()
norm_valid = valid.copy()

norm_df[wap_names] = _row_min_max(norm_df[wap_names])
norm_valid[wap_names] = _row_min_max(norm_valid[wap_names])


In [169]:
# Add an `id` column equal to the row index plus one.
# NOTE(review): presumably meant to identify the rows that end up with missing
# values after normalization — confirm it is still needed.
norm_df['id'] = norm_df.index + 1

In [176]:
# Drop every row that contains a NaN in any column.
# The row-wise scaling divides by (max - min), which is 0/0 when all values of a
# row are equal, so such rows come out of the normalization as NaN.
norm_df_clean = norm_df.dropna()
norm_valid_clean = norm_valid.dropna()

In [177]:
# Number of validation rows removed by dropna().
len(norm_valid) - len(norm_valid_clean)

0

In [178]:
# Number of training rows removed by dropna().
len(norm_df) - len(norm_df_clean)

76

In [179]:
# Fit and evaluate the model on the row-normalized data with NaN rows removed.
model_check(temp_data=norm_df_clean, temp_valid=norm_valid_clean)

Pred     0     1     2
Real                  
0     1049     0     0
1        0  1021     0
2        0     0  1903

The accuracy of that model is:  1.0
The kappa of that model is:  1.0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1049
           1       1.00      1.00      1.00      1021
           2       1.00      1.00      1.00      1903

    accuracy                           1.00      3973
   macro avg       1.00      1.00      1.00      3973
weighted avg       1.00      1.00      1.00      3973

Pred    0    1    2
Real               
0     536    0    0
1       0  307    0
2       0    0  268

The accuracy of that model is:  1.0
The kappa of that model is:  1.0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       536
           1       1.00      1.00      1.00       307
           2       1.00      1.00      1.00       268

    accuracy                           1.00 