In [2]:
import sys
sys.path.append("../scripts")

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from model import GCN
from dataset import Dataset
from train import load_dataset
import pandas as pd
import numpy as np

In [4]:
# Prepare the datasets used for training and testing the GNN model.

# Train on one floor of building 0 and test on a different floor of the same building.
import os
ujindoor_dir = "../../ujiindoorloc/UJIndoorLoc/building_ids_csvs/"
building_id = 0
train_floor = 0
test_floor = 1

# Each floor is stored as <root>/<building>/building_<building>_floor_<floor>.csv;
# the *_dir variables hold that path without the ".csv" extension.
train_dir = os.path.join(ujindoor_dir, str(building_id), f"building_{building_id}_floor_{train_floor}")
train_df = pd.read_csv(f"{train_dir}.csv")

test_dir = os.path.join(ujindoor_dir, str(building_id), f"building_{building_id}_floor_{test_floor}")
test_df = pd.read_csv(f"{test_dir}.csv")


In [5]:
from tqdm import tqdm

In [6]:
# augment dataset with more data
import random
def threshold_drop(input, threshold=-90, num_samples=5, missing_value=100):
    """Build augmented copies of a scan by blanking random subsets of its weak readings.

    A reading is a drop candidate when it is strictly below ``threshold`` and is
    not already equal to ``missing_value``. For every augmented copy a random
    number of candidates (between 1 and all of them) is chosen and overwritten
    with ``missing_value``.

    Args:
        input: 1-D sequence of signal readings (list, NumPy array, ...). It is
            never modified. The name shadows the builtin but is kept so existing
            keyword callers keep working.
        threshold: readings strictly below this value may be dropped.
        num_samples: how many augmented copies to produce (default 5, the
            previously hard-coded count).
        missing_value: placeholder written into dropped positions and excluded
            from the candidates (default 100, the previously hard-coded marker).

    Returns:
        A list of ``num_samples`` NumPy arrays, or an empty list when the scan
        has no drop candidates.
    """
    cand_indices = [
        idx
        for idx, signal in enumerate(input)
        if signal < threshold and signal != missing_value
    ]
    if not cand_indices:
        # Nothing weak enough to drop: no augmented copies for this scan.
        return []

    # Draw every subset size first and the subsets afterwards, so the `random`
    # module is consumed in the same order as before for a given seed.
    drop_counts = [random.randint(1, len(cand_indices)) for _ in range(num_samples)]

    augmented_data = []
    for drop_count in drop_counts:
        drop_indices = random.sample(cand_indices, drop_count)
        # Work on a copy so the caller's scan stays untouched.
        augmented = np.copy(input)
        augmented[drop_indices] = missing_value
        augmented_data.append(augmented)

    return augmented_data

def random_drop(input):
    """Return five copies of ``input`` with a random subset of entries set to 100.

    Every copy uses its own random 0/1 mask drawn with ``np.random.randint``;
    positions where the mask is 1 are overwritten with 100. The input itself is
    left unchanged.
    """
    def _masked_copy():
        # One 0/1 draw per position; a 1 marks the position to overwrite.
        drop_here = np.random.randint(2, size=len(input)).astype(bool)
        variant = np.copy(input)
        variant[drop_here] = 100
        return variant

    return [_masked_copy() for _ in range(5)]

def augment_dataset(dataset, y, threshold=-90):
    """Expand a set of scans with threshold-dropped and randomly-dropped variants.

    For every scan the output contains the original scan, then the variants
    returned by ``threshold_drop``, then the variants returned by
    ``random_drop``. The matching entry of ``y`` is repeated once per emitted
    row so features and targets stay aligned.

    Args:
        dataset: scans, one per row; a pandas DataFrame is converted to NumPy.
        y: targets indexed like ``dataset``; a DataFrame or Series is converted
            to NumPy.
        threshold: forwarded to ``threshold_drop``.

    Returns:
        Tuple ``(new_x, new_y)`` of NumPy arrays with one row per emitted scan.
    """
    # Work on plain NumPy data regardless of how the caller passed it in.
    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.to_numpy()
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.to_numpy()

    scans_out = []
    targets_out = []
    progress = tqdm(dataset, total=len(dataset), leave=False)
    for row, scan in enumerate(progress):
        # Original first, then threshold-based variants, then random variants.
        variants = [scan]
        variants += threshold_drop(scan, threshold=threshold)
        variants += random_drop(scan)

        scans_out += variants
        # Every variant inherits the target of the scan it was derived from.
        targets_out += [y[row]] * len(variants)

    return np.asarray(scans_out), np.asarray(targets_out)

In [7]:
# Split the training frame into features and targets ahead of augmentation.
num_aps = 520

# The first `num_aps` columns are the per-access-point readings;
# the two columns right after them are the coordinate targets.
train_df_x = train_df.iloc[:, :num_aps]
train_df_y = train_df.iloc[:, num_aps:num_aps + 2]

In [8]:
# augment data
# Seed both RNGs the augmenters draw from (`random` in threshold_drop,
# `np.random` in random_drop) so re-running the notebook reproduces the same
# augmented dataset.
random.seed(42)
np.random.seed(42)
augmented_x, augmented_y = augment_dataset(train_df_x, train_df_y, -90)

                                                    

In [9]:
# Sanity check: augmentation must produce exactly one target row per scan row.
assert len(augmented_x) == len(augmented_y)

In [99]:
# Assemble the augmented scans and their coordinates into one frame.
# Augmentation can emit identical rows, so only the first copy of each
# duplicate is kept and the index is renumbered afterwards.
augmented_df = (
    pd.DataFrame(augmented_x, columns=[f"AP_{i}" for i in range(520)])
    .assign(LONG=augmented_y[:, 0], LAT=augmented_y[:, 1])
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

In [100]:
augmented_df

Unnamed: 0,AP_0,AP_1,AP_2,AP_3,AP_4,AP_5,AP_6,AP_7,AP_8,AP_9,...,AP_512,AP_513,AP_514,AP_515,AP_516,AP_517,AP_518,AP_519,LONG,LAT
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7632.1436,4.864982e+06
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06
2,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7768,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06
7769,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06
7770,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06
7771,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.2570,4.864950e+06


In [106]:
# Persist the augmented frame as two CSVs: the 520 AP columns in one file,
# the remaining LONG/LAT columns in the other.
save_dir_scans = "../data/raw/ujindoorloc_building_0_floor_0_scans.csv"
save_dir_scan_coords = "../data/raw/ujindoorloc_building_0_floor_0_scan_coords.csv"
augmented_df.iloc[:, 0:520].to_csv(path_or_buf=save_dir_scans, index=False)
augmented_df.iloc[:, 520:].to_csv(path_or_buf=save_dir_scan_coords, index=False)

In [113]:
augmented_df.head()

Unnamed: 0,AP_0,AP_1,AP_2,AP_3,AP_4,AP_5,AP_6,AP_7,AP_8,AP_9,...,AP_512,AP_513,AP_514,AP_515,AP_516,AP_517,AP_518,AP_519,LONG,LAT
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7632.1436,4864982.0
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.257,4864950.0
2,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.257,4864950.0
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.257,4864950.0
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,-7637.257,4864950.0


In [20]:
test_df

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,100,100,100,100,100,100,-83,100,100,100,...,100,-7638.8080,4.864907e+06,1,0,211,2,11,13,1369927675
1,100,100,100,100,100,100,-88,100,100,100,...,100,-7638.2760,4.864905e+06,1,0,210,2,11,13,1369927645
2,100,100,100,100,100,100,-92,100,100,100,...,100,-7636.2976,4.864898e+06,1,0,209,2,11,13,1369927610
3,100,100,100,100,100,100,-75,100,100,100,...,100,-7637.0024,4.864906e+06,1,0,208,2,11,13,1369927712
4,100,100,100,100,100,100,-71,100,100,100,...,100,-7639.6910,4.864915e+06,1,0,205,2,11,13,1369927802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1351,100,100,100,100,100,100,100,100,100,100,...,100,-7633.0415,4.864971e+06,1,0,130,2,1,14,1371049541
1352,100,100,100,100,100,100,100,100,100,100,...,100,-7633.2056,4.864965e+06,1,0,128,2,1,14,1371049516
1353,100,100,100,100,100,100,100,100,-88,100,...,100,-7635.4526,4.864966e+06,1,0,129,2,1,14,1371049486
1354,100,100,100,100,100,100,100,100,-72,100,...,100,-7643.0924,4.864948e+06,1,0,228,2,1,14,1371049427


In [107]:
# Switch the test split to floor 3 of the same building and reload it.
test_floor = 3
test_dir = os.path.join(ujindoor_dir, str(building_id), f"building_{building_id}_floor_{test_floor}")
test_df = pd.read_csv(f"{test_dir}.csv")


In [109]:
# Map the first 520 column names to AP_0 ... AP_519, the naming used for the
# augmented training frame.
new_column_names = {col: f'AP_{i}' for i, col in enumerate(test_df.columns[:520])}
# Rebind the renamed frame instead of mutating with `inplace=True`: the cell
# stays safe to re-run and the freshly loaded frame object is not altered.
test_df = test_df.rename(columns=new_column_names)

In [114]:
# Features are the first 520 columns; the next two columns are the targets,
# relabelled to the LONG/LAT names used for the training coordinates.
test_df_x, test_df_y = test_df.iloc[:, :520], test_df.iloc[:, 520:522]
test_df_y.columns = ["LONG", "LAT"]

In [115]:
# Write the floor-3 test scans and their coordinates to separate CSV files.
save_dir_scans = "../data/raw/ujindoorloc_building_0_floor_3_scans.csv"
save_dir_coords = "../data/raw/ujindoorloc_building_0_floor_3_scan_coords.csv"
test_df_x.to_csv(path_or_buf=save_dir_scans, index=False)
test_df_y.to_csv(path_or_buf=save_dir_coords, index=False)

In [26]:
# define function to load data, augment it if needed and save it
def process_dataset(dataset_save_dir, building_id, floor_id, final_save_dir, augment=False, threshold=-90):
    """Load one building/floor CSV, optionally augment it, and save scans and coordinates.

    The source file is read from
    ``<dataset_save_dir>/<building_id>/building_<building_id>_floor_<floor_id>.csv``.
    Its first 520 columns are treated as the scan and the next two as the
    coordinates. Two CSVs are written into ``final_save_dir``: one with columns
    ``AP_0`` ... ``AP_519`` and one with columns ``LONG`` and ``LAT``.

    Args:
        dataset_save_dir: root directory containing one sub-directory per building.
        building_id: building identifier used in the source and output file names.
        floor_id: floor identifier used in the source and output file names.
        final_save_dir: directory for the output CSVs; created if it is missing.
        augment: when True, expand the data with ``augment_dataset`` and drop
            duplicate rows afterwards.
        threshold: forwarded to ``augment_dataset`` when ``augment`` is True
            (default -90, the value that was used implicitly before).

    Returns:
        ``"Folder doesnt exist"`` if ``dataset_save_dir`` is missing,
        ``"File doesnt exist"`` if the building/floor CSV is missing,
        otherwise ``"Success"``.
    """
    num_aps = 520

    if not os.path.exists(dataset_save_dir):
        return "Folder doesnt exist"

    source_csv = os.path.join(
        dataset_save_dir,
        str(building_id),
        f"building_{building_id}_floor_{floor_id}.csv",
    )
    if not os.path.exists(source_csv):
        return "File doesnt exist"

    df = pd.read_csv(source_csv)
    scans = df.iloc[:, :num_aps]
    coords = df.iloc[:, num_aps:num_aps + 2]

    if augment:
        # augment_dataset returns NumPy arrays.
        scans, coords = augment_dataset(scans, coords, threshold=threshold)
    else:
        scans = scans.to_numpy()
        coords = coords.to_numpy()

    processed_df = pd.DataFrame(scans, columns=["AP_{}".format(i) for i in range(num_aps)])
    processed_df["LONG"] = coords[:, 0]
    processed_df["LAT"] = coords[:, 1]

    # Only augmented data is de-duplicated; raw data is saved row-for-row.
    if augment:
        processed_df = processed_df.drop_duplicates(keep="first").reset_index(drop=True)

    # Create the output directory up front so to_csv does not fail on a fresh checkout.
    # An empty string means "current directory" and needs no creation.
    if final_save_dir:
        os.makedirs(final_save_dir, exist_ok=True)

    file_name_scans = f"ujindoorloc_building_{building_id}_floor_{floor_id}_scans.csv"
    file_name_scan_coords = f"ujindoorloc_building_{building_id}_floor_{floor_id}_scan_coords.csv"
    save_dir_scans = os.path.join(final_save_dir, file_name_scans)
    save_dir_scan_coords = os.path.join(final_save_dir, file_name_scan_coords)

    processed_df.iloc[:, 0:num_aps].to_csv(save_dir_scans, index=False)
    processed_df.iloc[:, num_aps:num_aps + 2].to_csv(save_dir_scan_coords, index=False)

    return "Success"

In [12]:
# Shared settings for the process_dataset calls in the cells that follow.
dataset_dir = "../../ujiindoorloc/UJIndoorLoc/building_ids_csvs"
save_dir = "../data/raw"
train_floor = 1
building = 0
augment = True
# Disabled call: would augment and save building 0 / floor 1 with the settings above.
# process_dataset(dataset_dir, building, train_floor, save_dir, augment=augment)

In [130]:
# Augment floor 0 of building 1 and write the result under save_dir.
train_floor = 0
building = 1
augment=True
process_dataset(dataset_dir, building, train_floor, save_dir, augment=augment)

  0%|          | 0/1368 [00:00<?, ?it/s]

                                                    

'Success'

In [131]:
process_dataset(dataset_dir, 2, train_floor, save_dir, augment=augment)

                                                     

'Success'

In [27]:
process_dataset(dataset_dir, 1, 3, save_dir, augment=False)

'Success'

In [28]:
process_dataset(dataset_dir, 2, 3, save_dir, augment=False)

'Success'

In [29]:
process_dataset(dataset_dir, 0, 2, save_dir, augment=False)

'Success'

In [30]:
process_dataset(dataset_dir, 1, 2, save_dir, augment=False)

'Success'

In [31]:
process_dataset(dataset_dir, 2, 2, save_dir, augment=False)

'Success'

In [32]:
process_dataset(dataset_dir, 1, 1, save_dir, augment=False)

'Success'

In [33]:
process_dataset(dataset_dir, 2, 1, save_dir, augment=False)

'Success'

In [34]:
process_dataset(dataset_dir, 0, 1, save_dir, augment=False)

'Success'

In [35]:
process_dataset(dataset_dir, 2, 4, save_dir, augment=False)

'Success'

In [36]:
process_dataset(dataset_dir, 0, 0, save_dir, augment=False)


'Success'

In [37]:
process_dataset(dataset_dir, 1, 0, save_dir, augment=False)


'Success'

In [38]:
process_dataset(dataset_dir, 2, 0, save_dir, augment=False)


'Success'