# Wifi project

## loading packages 

In [3]:
library(readr)                                                                       # Reading data set
library(caret)                                                                       # Prediction
library(dplyr)                                                                       # Data manipulation
library(ggplot2)                                                                     # Plotting
library(plotly)                                                                      # Plotting
library(learningr)                                                                    # calculate Hypotenuse

## Training data

### reading data

In [4]:
# Load the training CSV; the first row holds the column names
rawdata <- read.csv(
  file = "C:/Users/spxt6/OneDrive/Dokumente/R_projects/Test.Proj/WIFI/trainingData.csv",
  header = TRUE,
  sep = ","
)
# Work on a second binding so the untouched import stays available as `rawdata`
WifiTrain <- rawdata

In [5]:
# Show the current working directory (the CSV paths in this notebook are
# absolute, so reading and writing does not depend on it)
getwd()

In [7]:
# Preview the first rows of the training data (WAP columns followed by the
# location, user, phone and timestamp columns)
head(WifiTrain)

WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
100,100,100,100,100,100,100,100,100,100,...,100,-7541.264,4864921,2,1,106,2,2,23,1371713733
100,100,100,100,100,100,100,100,100,100,...,100,-7536.621,4864934,2,1,106,2,2,23,1371713691
100,100,100,100,100,100,100,-97,100,100,...,100,-7519.152,4864950,2,1,103,2,2,23,1371714095
100,100,100,100,100,100,100,100,100,100,...,100,-7524.57,4864934,2,1,102,2,2,23,1371713807
100,100,100,100,100,100,100,100,100,100,...,100,-7632.144,4864982,0,0,122,2,11,13,1369909710
100,100,100,100,100,100,100,100,100,100,...,100,-7533.896,4864939,2,1,105,2,2,23,1371713841


### Preprocessing the training data set

In [8]:
# Creating dataset only with WAPs in Training data: keep every column except
# the nine location / user / phone / time columns
WifiTrainOnlyWAPs <- as.data.frame(WifiTrain)
WifiTrainOnlyWAPs <- WifiTrainOnlyWAPs[
  setdiff(
    names(WifiTrainOnlyWAPs),
    c("LATITUDE", "FLOOR", "LONGITUDE", "RELATIVEPOSITION", "SPACEID",
      "USERID", "BUILDINGID", "PHONEID", "TIMESTAMP")
  )
]

In [9]:
# Replacing non-detected readings (coded as 100) in the Training data with NA
WifiTrainOnlyWAPs <- replace(WifiTrainOnlyWAPs, WifiTrainOnlyWAPs == 100, NA)

In [10]:
# Excluding WAPs that were never detected in the training data, i.e. columns
# that are NA in every row (keep a column when it has at least one reading)
WifiTrainNoNAWap <- WifiTrainOnlyWAPs[, colSums(!is.na(WifiTrainOnlyWAPs)) > 0]

In [11]:
# Replacing the remaining NA values (non-detected readings) with -105
WifiTrainNoNAWap <- replace(WifiTrainNoNAWap, is.na(WifiTrainNoNAWap), -105)

In [12]:
# Training data with the retained WAP columns plus location, user and time
# information copied back from the raw training frame
WifiTrainNoNA529col <- as.data.frame(WifiTrainNoNAWap)
# Numeric columns are copied unchanged
WifiTrainNoNA529col[c("LATITUDE", "TIMESTAMP", "LONGITUDE")] <-
  WifiTrain[c("LATITUDE", "TIMESTAMP", "LONGITUDE")]
# Identifier-like columns are stored as character
WifiTrainNoNA529col[c("SPACEID", "USERID", "PHONEID", "BUILDINGID",
                      "RELATIVEPOSITION", "FLOOR")] <-
  lapply(
    WifiTrain[c("SPACEID", "USERID", "PHONEID", "BUILDINGID",
                "RELATIVEPOSITION", "FLOOR")],
    as.character
  )

In [13]:
# Preparing matrix of Training data for PCA: the WAP-only frame (never-detected
# WAPs dropped, non-detected readings set to -105) converted to a matrix
WifiTrainForPCA <- as.matrix(WifiTrainNoNAWap)

## Test(Validation) data

### Reading data

In [14]:
# Load the validation CSV; the first row holds the column names
rawdataTest <- read.csv(
  file = "C:/Users/spxt6/OneDrive/Dokumente/R_projects/Test.Proj/WIFI/validationData.csv",
  header = TRUE,
  sep = ","
)
# Work on a second binding so the untouched import stays available
WifiTest <- rawdataTest

### Preprocessing Validation dataset

In [15]:
# Replacing non-detected data (100) with -105 in the WAP columns only.
# The comparison previously ran over the whole data frame, so a value of 100
# in any location / user / phone / time column would have been overwritten as
# well; the training set applies its replacement to the WAP-only frame.
WifiTestNoNA529col <- WifiTest
is_wap_col <- grepl("^WAP", names(WifiTestNoNA529col))
if (any(is_wap_col)) {
  WifiTestNoNA529col[is_wap_col] <- replace(
    WifiTestNoNA529col[is_wap_col],
    WifiTestNoNA529col[is_wap_col] == 100,
    -105
  )
}

In [16]:
# Preparing the Validation data for PCA: keep every column except the nine
# location / user / phone / time columns
WifiTestNoNA529colForPCA <- as.data.frame(WifiTestNoNA529col)
WifiTestNoNA529colForPCA <- WifiTestNoNA529colForPCA[
  setdiff(
    names(WifiTestNoNA529colForPCA),
    c("LATITUDE", "FLOOR", "LONGITUDE", "RELATIVEPOSITION", "SPACEID",
      "USERID", "BUILDINGID", "PHONEID", "TIMESTAMP")
  )
]

In [17]:
# Converting the WAP-only validation data set into a matrix for PCA
# NOTE(review): the PCA projection further down is given the data frame
# `WifiTestNoNA529colForPCA`, not this matrix - confirm whether it is still needed
WifiTestForPCA <- as.matrix(WifiTestNoNA529colForPCA)

In [18]:
# Matching the columns of Training and Validation data (no PCA): select the
# validation columns by the training column names, in training order
WifiTestNoNA529colMatch <- WifiTestNoNA529col[, colnames(WifiTrainNoNA529col)]

In [19]:
# Validation data set with location, user and time information (no PCA)
WifiTestNoNA529colMatch <- as.data.frame(WifiTestNoNA529colMatch)
# Numeric columns are copied unchanged
WifiTestNoNA529colMatch[c("LATITUDE", "TIMESTAMP", "LONGITUDE")] <-
  WifiTestNoNA529col[c("LATITUDE", "TIMESTAMP", "LONGITUDE")]
# Identifier-like columns are stored as character
WifiTestNoNA529colMatch[c("SPACEID", "USERID", "PHONEID", "BUILDINGID",
                          "RELATIVEPOSITION", "FLOOR")] <-
  lapply(
    WifiTestNoNA529col[c("SPACEID", "USERID", "PHONEID", "BUILDINGID",
                         "RELATIVEPOSITION", "FLOOR")],
    as.character
  )

## PCAs 

In [20]:
# Naming the rows of the Training matrix "Observation1", "Observation2", ...
rownames(WifiTrainForPCA) <- paste0(
  "Observation",
  seq_len(nrow(WifiTrainForPCA))
)

In [21]:
# Computing PCA on the Training WAP matrix with every column scaled to unit
# variance before the decomposition.
# The argument is spelled `scale.` as in prcomp()'s signature; the previous
# `scale = TRUE` only reached it through partial argument matching.
PCATrain <- prcomp(WifiTrainForPCA, scale. = TRUE)

In [22]:
# Computing the variance of each principal component as the square of its
# standard deviation (`sdev`)
PCATrainVar <- PCATrain$sdev^2

In [23]:
# Showing 90 % of the total variance, followed by the variance captured by the
# first 206 components, so the two printed values can be compared
0.9 * sum(PCATrainVar)
sum(PCATrainVar[seq_len(206)])

In [24]:
# Training-set scores on all principal components, then the first 206 of them
Train_PCs <- as.data.frame(PCATrain$x)
Train_PCs_206 <- select(Train_PCs, 1:206)

In [25]:
# Project the Validation WAP data onto the training PCA, then keep the first
# 206 principal components
TEST_PCs <- as.data.frame(
  predict(PCATrain, newdata = WifiTestNoNA529colForPCA)
)
TEST_PCs_206 <- select(TEST_PCs, 1:206)

## Modeling

### Modeling for Longitude : For Longitude, PCA was applied

In [26]:
# Attach the LONGITUDE response to the principal-component data sets
Train_PCs_206[["LONGITUDE"]] <- WifiTrain[["LONGITUDE"]]
TEST_PCs_206[["LONGITUDE"]] <- WifiTestNoNA529col[["LONGITUDE"]]

In [27]:
# Draw 5000 training rows without replacement, then split them 75 % / 25 %
# into a training and a testing set on LONGITUDE
set.seed(123)
Train_PCs_206_Sample <- Train_PCs_206[sample.int(nrow(Train_PCs_206), 5000), ]
inTraining <- createDataPartition(
  y = Train_PCs_206_Sample$LONGITUDE,
  p = 0.75,
  list = FALSE
)
training <- Train_PCs_206_Sample[inTraining, ]
testing <- Train_PCs_206_Sample[-inTraining, ]

In [28]:
# Cross validation within training set: 5 folds, repeated once
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 1)

In [29]:
# Modeling for LONGITUDE with random forest; mtry is fixed at 63, so the
# tuning grid has a single row
rfGrid <- expand.grid(mtry = 63)
rfmodelLONGI <- train(
  LONGITUDE ~ .,
  data = training,
  method = "rf",
  trControl = fitControl,
  tuneGrid = rfGrid
)

### Prediction for Longitude 

In [30]:
# Prediction for longitude on the held-out testing set
rfPredictionModelLONGI <- predict(rfmodelLONGI, newdata = testing)

In [31]:
# Checking the accuracy of the longitude model on the testing set
postResample(pred = rfPredictionModelLONGI, obs = testing$LONGITUDE)

In [32]:
# Prediction for longitude on the Validation data (principal components)
rfPredictionLONGI <- predict(rfmodelLONGI, newdata = TEST_PCs_206)

In [33]:
# Checking the accuracy of the longitude model on the validation set
postResample(pred = rfPredictionLONGI, obs = WifiTestNoNA529col$LONGITUDE)

### Modeling for Floor : For Floor, Building and Latitude prediction, PCA was not applied ####

In [34]:
# Setting data: WAP-only training frame plus FLOOR as a factor response;
# FLOOR is also stored as a factor on the matched validation frame
WifiTrainNoNAModeling <- WifiTrainNoNAWap
WifiTrainNoNAModeling[["FLOOR"]] <- as.factor(WifiTrain[["FLOOR"]])
WifiTestNoNA529colMatch[["FLOOR"]] <- as.factor(WifiTestNoNA529col[["FLOOR"]])

In [35]:
# Draw 5000 training rows without replacement, then split them 75 % / 25 %
# into a training and a testing set on FLOOR
set.seed(123)
WifiTrainNoNAModeling_Sample <- WifiTrainNoNAModeling[
  sample.int(nrow(WifiTrainNoNAModeling), 5000),
]
inTraining <- createDataPartition(
  y = WifiTrainNoNAModeling_Sample$FLOOR,
  p = 0.75,
  list = FALSE
)
training <- WifiTrainNoNAModeling_Sample[inTraining, ]
testing <- WifiTrainNoNAModeling_Sample[-inTraining, ]

In [36]:
# Cross validation within training set: 5 folds, repeated once
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 1)

In [37]:
# Modeling for FLOOR with random forest; mtry is fixed at 63, so the tuning
# grid has a single row
rfGrid <- expand.grid(mtry = 63)
rfFitModelFLOOR <- train(
  FLOOR ~ .,
  data = training,
  method = "rf",
  tuneGrid = rfGrid,
  trControl = fitControl
)

### Prediction for Floor

In [38]:
# Predicting FLOOR on the held-out testing set
rfPredictionmodelFLOOR <- predict(rfFitModelFLOOR, newdata = testing)

In [39]:
# Converting prediction and reference to factor for the confusion matrix
rfPredictionmodelFLOOR <- as.factor(rfPredictionmodelFLOOR)
testing[["FLOOR"]] <- as.factor(testing[["FLOOR"]])

In [40]:
# Checking accuracy of the FLOOR model on the testing set
postResample(pred = rfPredictionmodelFLOOR, obs = testing$FLOOR)

In [41]:
# Prediction for FLOOR on the validation set (the full validation frame is
# passed as new data)
rfPredictionFLOOR <- predict(rfFitModelFLOOR, newdata = WifiTestNoNA529col)

In [42]:
# Converting prediction and validation FLOOR to factor for checking accuracy
rfPredictionFLOOR <- as.factor(rfPredictionFLOOR)
WifiTestNoNA529col[["FLOOR"]] <- as.factor(WifiTestNoNA529col[["FLOOR"]])

In [43]:
# Checking accuracy of the FLOOR prediction on the validation set
postResample(pred = rfPredictionFLOOR, obs = WifiTestNoNA529col$FLOOR)

In [44]:
# Confusion matrix of predicted vs. actual FLOOR on the validation set
confusionMatrix(
  data = rfPredictionFLOOR,
  reference = WifiTestNoNA529col$FLOOR
)

Confusion Matrix and Statistics

          Reference
Prediction   0   1   2   3   4
         0 119   9   0   0   1
         1   6 402   7   0   0
         2   6  42 285   4   0
         3   1   9  14 166   9
         4   0   0   0   2  29

Overall Statistics
                                          
               Accuracy : 0.901           
                 95% CI : (0.8819, 0.9179)
    No Information Rate : 0.4158          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8622          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
Sensitivity            0.9015   0.8701   0.9314   0.9651   0.7436
Specificity            0.9898   0.9800   0.9354   0.9649   0.9981
Pos Pred Value         0.9225   0.9687   0.8457   0.8342   0.9355
Neg Pred Value         0.9868   0.9138   0.9729   0.9934   0.9907
P

### Modeling for Building 

In [45]:
# Setting the data: drop the FLOOR response used by the previous model and
# attach BUILDINGID as a factor response instead
WifiTrainNoNAModeling[["FLOOR"]] <- NULL
WifiTestNoNA529colMatch[["FLOOR"]] <- NULL
WifiTrainNoNAModeling[["BUILDINGID"]] <- as.factor(WifiTrain[["BUILDINGID"]])
WifiTestNoNA529colMatch[["BUILDINGID"]] <-
  as.factor(WifiTestNoNA529col[["BUILDINGID"]])

In [46]:
# Draw 5000 training rows without replacement, then split them 75 % / 25 %
# into a train and a test set on BUILDINGID
set.seed(123)
WifiTrainNoNAModeling_Sample <- WifiTrainNoNAModeling[
  sample.int(nrow(WifiTrainNoNAModeling), 5000),
]
inTraining <- createDataPartition(
  y = WifiTrainNoNAModeling_Sample$BUILDINGID,
  p = 0.75,
  list = FALSE
)
training <- WifiTrainNoNAModeling_Sample[inTraining, ]
testing <- WifiTrainNoNAModeling_Sample[-inTraining, ]

In [47]:
# Cross validation within training set: 5 folds, repeated once
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 1)

In [48]:
# Random forest for BUILDINGID; mtry is fixed at 63, so the tuning grid has a
# single row
rfGrid <- expand.grid(mtry = 63)
rfFitModelBUILD <- train(
  BUILDINGID ~ .,
  data = training,
  method = "rf",
  tuneGrid = rfGrid,
  trControl = fitControl
)

### Prediction for Building

In [49]:
# Prediction for BUILDINGID on the held-out test set
rfPredictionmodelBUILD <- predict(rfFitModelBUILD, newdata = testing)

In [50]:
# Converting prediction and reference to factor for the confusion matrix
rfPredictionmodelBUILD <- as.factor(rfPredictionmodelBUILD)
testing[["BUILDINGID"]] <- as.factor(testing[["BUILDINGID"]])

In [51]:
# Checking the accuracy of the BUILDINGID model on the test set
postResample(rfPredictionmodelBUILD, testing$BUILDINGID)

In [52]:
# Prediction for BUILDINGID on the validation set (the full validation frame
# is passed as new data)
rfPredictionBUILD <- predict(rfFitModelBUILD, newdata = WifiTestNoNA529col)

In [53]:
# Converting prediction and validation BUILDINGID to factor for the confusion
# matrix
rfPredictionBUILD <- as.factor(rfPredictionBUILD)
WifiTestNoNA529col[["BUILDINGID"]] <-
  as.factor(WifiTestNoNA529col[["BUILDINGID"]])

In [54]:
# Accuracy summary and confusion matrix for BUILDINGID on the validation set
postResample(pred = rfPredictionBUILD, obs = WifiTestNoNA529col$BUILDINGID)
confusionMatrix(
  data = rfPredictionBUILD,
  reference = WifiTestNoNA529col$BUILDINGID
)

Confusion Matrix and Statistics

          Reference
Prediction   0   1   2
         0 530   1   0
         1   6 306   0
         2   0   0 268

Overall Statistics
                                          
               Accuracy : 0.9937          
                 95% CI : (0.9871, 0.9975)
    No Information Rate : 0.4824          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.9901          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 0 Class: 1 Class: 2
Sensitivity            0.9888   0.9967   1.0000
Specificity            0.9983   0.9925   1.0000
Pos Pred Value         0.9981   0.9808   1.0000
Neg Pred Value         0.9897   0.9987   1.0000
Prevalence             0.4824   0.2763   0.2412
Detection Rate         0.4770   0.2754   0.2412
Detection Prevalence   0.4779   0.2808   0.2412
Balanced Accuracy      0.9935   0.9946   

### Modeling for Latitude

In [55]:
# Setting the data: drop the BUILDINGID response used by the previous model
# and attach LATITUDE as the response instead
WifiTrainNoNAModeling[["BUILDINGID"]] <- NULL
WifiTestNoNA529colMatch[["BUILDINGID"]] <- NULL
WifiTestNoNA529colMatch[["LATITUDE"]] <- WifiTestNoNA529col[["LATITUDE"]]
WifiTrainNoNAModeling[["LATITUDE"]] <- WifiTrain[["LATITUDE"]]

In [56]:
# Draw 5000 training rows without replacement, then split them 75 % / 25 %
# into a training and a testing set on LATITUDE
set.seed(123)
WifiTrainNoNAModeling_Sample <- WifiTrainNoNAModeling[
  sample.int(nrow(WifiTrainNoNAModeling), 5000),
]
inTraining <- createDataPartition(
  y = WifiTrainNoNAModeling_Sample$LATITUDE,
  p = 0.75,
  list = FALSE
)
training <- WifiTrainNoNAModeling_Sample[inTraining, ]
testing <- WifiTrainNoNAModeling_Sample[-inTraining, ]

In [57]:
# Cross validation within training set: 5 folds, repeated once
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 1)

In [58]:
# Modeling for LATITUDE with random forest; mtry is fixed at 63, so the tuning
# grid has a single row
rfGrid <- expand.grid(mtry = 63)
rfmodelLat <- train(
  LATITUDE ~ .,
  data = training,
  method = "rf",
  trControl = fitControl,
  tuneGrid = rfGrid
)

### Prediction for Latitude

In [59]:
# Prediction for LATITUDE on the held-out testing set
rfPredictionModelLat <- predict(rfmodelLat, newdata = testing)

In [60]:
# Checking the accuracy of the LATITUDE model on the testing set
postResample(pred = rfPredictionModelLat, obs = testing$LATITUDE)

In [61]:
# Prediction for LATITUDE on the validation set (the full validation frame is
# passed as new data)
rfPredictionLat <- predict(rfmodelLat, newdata = WifiTestNoNA529col)

In [62]:
# Checking the accuracy of the LATITUDE model on the validation set
postResample(pred = rfPredictionLat, obs = WifiTestNoNA529col$LATITUDE)

### Error distances

In [91]:
# Distance between predicted and actual position for every validation row
# (hypotenuse of the latitude error and the longitude error), then its mean.
# Computed from the prediction vectors and the validation frame built above;
# the previously referenced `PredictionDataFBLL` / `ValidationDataFBLL` are
# not defined in the visible notebook, so the cell failed on a top-to-bottom run.
Hypotenuse <- hypotenuse(
  rfPredictionLat - WifiTestNoNA529col$LATITUDE,
  rfPredictionLONGI - WifiTestNoNA529col$LONGITUDE
)
mean(Hypotenuse)

### Exporting prediction data

In [92]:
# Creating data frame with the validation-set predictions, one column per
# target. The building column is named "BUILDINGID" (previously misspelled
# "BUIDINGID"), matching the column name used in the input data.
PredictionFBLL <- data.frame(
  rfPredictionFLOOR,
  rfPredictionBUILD,
  rfPredictionLONGI,
  rfPredictionLat
)
colnames(PredictionFBLL) <- c("FLOOR", "BUILDINGID", "LONGITUDE", "LATITUDE")

In [93]:
# Exporting the prediction to CSV, keeping the row names as the first column
write.csv(
  x = PredictionFBLL,
  file = "C:/Users/spxt6/OneDrive/Dokumente/R_projects/Test.Proj/WIFI/PredictionFBLL.csv",
  row.names = TRUE
)