In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
np.set_printoptions(precision=5, suppress=True)

In [3]:
training_set_file = "../customdata/Trainingpoints.txt"

with open(training_set_file) as f:
    training_set = f.readlines()

training_set = [tuple(line.strip().split(",")) for line in training_set]
tp_to_i = {tp:i+1 for i, tp in enumerate(training_set)}
i_to_tp = {i:tp for tp, i in tp_to_i.items()}


In [12]:
import os 

# get files 
files_dir = "../customdata/TrainingSet/"
# get list of files
training_files = [file for file in os.listdir(files_dir) if file.endswith(".txt")]
training_files = sorted(training_files, key=lambda x: int(os.path.splitext(x)[0]))
# list of dfs
dfs = []
for file in training_files:
  file_path = os.path.join(files_dir, file)
  id = os.path.splitext(file)[0]
  # get coordinates of scan
  coords = np.array(list(i_to_tp[int(id)]))

  df = pd.read_csv(file_path, header=None, sep=",")
  df.drop(df.columns[-1], axis=1, inplace=True)
  df.columns = [f"AP_{i}" for i in range(1, len(df.columns)+1)]
  tiled_coords = np.tile(coords, (len(df), 1))
  df = pd.concat([df, pd.DataFrame(tiled_coords, columns=["x", "y"])], axis=1)
  dfs.append(df)

data = pd.concat(dfs, ignore_index=True).reset_index(drop=True)

In [13]:
# denormalize rss values
min_value = -100 # from paper
max_value = -26 # manually searched for max value in dataset

# denormalize all data except last 2 columns
data.iloc[:, :-2] = data.iloc[:, :-2] * (max_value - min_value) + min_value

In [14]:
data

Unnamed: 0,AP_1,AP_2,AP_3,AP_4,AP_5,AP_6,AP_7,AP_8,AP_9,AP_10,...,AP_115,AP_116,AP_117,AP_118,AP_119,AP_120,AP_121,AP_122,x,y
0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-74.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,1173,670
1,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-74.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,1173,670
2,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-70.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,1173,670
3,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-70.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,1173,670
4,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-66.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,1173,670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4717,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-71.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-84.0,-100.0,413,1160
4718,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-76.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-84.0,-100.0,413,1160
4719,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-76.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-84.0,-100.0,413,1160
4720,-66.0,-100.0,-100.0,-100.0,-100.0,-100.0,-77.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-87.0,-100.0,413,1160


In [19]:
# replace all -100 in data with 100
data.replace(-100, 100, inplace=True)

In [21]:
scans = data.iloc[:, :-2]
coords = data.iloc[:, -2:]

In [63]:
# perform ap location estimation on custom dataset

def get_row_indices_and_signals(scans):
    
    all_ap_points, all_ap_signals = [], []
    for colname in scans.columns:
        idx = scans[colname].index[scans[colname] != 100].to_list()
        all_ap_points.append(idx)
        all_ap_signals.append(scans[colname].loc[idx].to_list())
    return all_ap_points, all_ap_signals

def get_coordinates_of_heard_scans(indices, coords):
    # get the heard ap indices
    coordinates = coords.loc[indices]
    # get the coordinates of the heard aps
    return coordinates["x"].to_numpy(), coordinates["y"].to_numpy()


def get_ap_weight(rss):
    return 100**(rss/10.0)

def approximate_ap_coordinates(x_coords, y_coords, weights):
    # sum product of weight and coordinate
    estimated_x = np.sum(x_coords * weights)
    estimated_y = np.sum(y_coords * weights)
    # get denominator
    sum_of_weights = weights.sum()
    with np.errstate(invalid='raise'):
        try:
            normalized_x = estimated_x/sum_of_weights
            normalized_y = estimated_y/sum_of_weights
        except FloatingPointError:
            print(f"estimated_x: {estimated_x}, estimated_y: {estimated_y}, sum_of_weights: {sum_of_weights}")
            print(f"x_coords: {x_coords}, y_coords: {y_coords}, weights: {weights}")
    # print(normalized_x, normalized_y)
    # return tuple of coordinates
    return estimated_x/sum_of_weights , estimated_y/sum_of_weights

def approximate_all_ap_locations(data, coords):
    # get location and signals
    all_ap_points, all_ap_signals = get_row_indices_and_signals(data)
    estimated_ap_locs = []
    # check the length of them both are the same
    assert len(all_ap_points) == len(all_ap_signals)
    print(len(all_ap_points))
    for i in range(len(all_ap_points)):
        # get ap indices and rss strengths
        idx, signals = all_ap_points[i], all_ap_signals[i]
        # get coordinates of the scans
        xs, ys = get_coordinates_of_heard_scans(idx, coords)
        # get weights
        weights = np.asarray([get_ap_weight(signal) for signal in signals])
        # get estimated coordinates
        estimated_coords = approximate_ap_coordinates(xs, ys, weights)
        estimated_ap_locs.append(list(estimated_coords))
    return np.asarray(estimated_ap_locs)

estimated_ap_locs = approximate_all_ap_locations(scans, coords)
# print(len(estimated_ap_locs))
# perform ap location estimation on custom dataset
# estimated_ap_locs = approximate_all_ap_locations(scans, coords)

122
estimated_x: 0.0, estimated_y: 0.0, sum_of_weights: 0.0
x_coords: [], y_coords: [], weights: []
estimated_x: 0.0, estimated_y: 0.0, sum_of_weights: 0.0
x_coords: [], y_coords: [], weights: []


  return estimated_x/sum_of_weights , estimated_y/sum_of_weights
  return estimated_x/sum_of_weights , estimated_y/sum_of_weights


In [66]:
# get indices of nans
nan_indices = np.argwhere(np.isnan(estimated_ap_locs)) 

In [72]:
# drop nans from estimated_ap_locs
estimated_ap_locs = estimated_ap_locs[~np.isnan(estimated_ap_locs).any(axis=1)]

In [None]:
# for indices 29 and 85
import numpy as np
from sklearn.mixture import GaussianMixture

# Assuming 'estimated_ap_locs' is your numpy array with shape (122, 2)
gmm = GaussianMixture(n_components=2)
gmm.fit(estimated_ap_locs)



In [81]:
new_samples

array([[ 756.90913,  893.84983],
       [1263.03752, 1004.05223]])

In [82]:
# insert new samples at index 29 and 85 in estimated_ap_locs
estimated_ap_locs = np.insert(estimated_ap_locs, 29, new_samples[0], axis=0)
estimated_ap_locs = np.insert(estimated_ap_locs, 85, new_samples[1], axis=0)

In [86]:
# put estimated_ap_locs in a dataframe and save it

save_loc = "../data/raw/"
dataset_name = "UniversityTrain"
ap_coords_df = pd.DataFrame(estimated_ap_locs, columns=["x", "y"])

# save ap_coords_df
ap_coords_df.to_csv(save_loc + dataset_name + "_ap_coords.csv", index=False)

In [112]:
# augment coords and scans using threshold dropping and random dropping
from tqdm import tqdm
num_aps  = 122
# augment dataset with more data
def threshold_drop(input, threshold=-90):
    cand_indices = []
    cand_signals = []
    for idx, signal in enumerate(input):
        if signal < threshold and signal != 100:
            cand_indices.append(idx)
            cand_signals.append(signal)
    
    # get random combintion of random length to drop 
    rand_nums = []
    for i in range(0, 3):
        if len(cand_indices) > 0:
            rand_nums.append(random.randint(1, len(cand_indices)))

    # for each random length, get a random combination of indices to drop
    augmented_data = []
    for rand_num in rand_nums:
        drop_indices = random.sample(cand_indices, rand_num)
        # make copy of input
        copy = np.copy(input)
        # drop indices
        copy[drop_indices] = 100
        # append to augmented data
        augmented_data.append(copy)
    
    return augmented_data

def random_drop(input):
    # get several random binary masks of same length as input
    masks = []
    for i in range(0, 3):
        masks.append(np.random.randint(2, size=len(input)))
    
    # for each mask, drop the indices
    augmented_data = []
    for mask in masks:
        copy = np.copy(input)
        copy[mask == 1] = 100
        augmented_data.append(copy)
    
    return augmented_data

def augment_dataset(dataset, y, threshold=-90):

    # check if dataset is pandas dataframe
    if isinstance(dataset, pd.DataFrame):
        # change to numpy
        dataset = dataset.to_numpy()
    # check if y is pandas dataframe or series
    if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
        # change to numpy
        y = y.to_numpy()
    
    new_x = []
    new_y = []
    # create tqdm loop
    loop = tqdm(dataset, total=len(dataset), leave=False)
    for idx, scan in enumerate(loop):
        # augment scan using  threshold dropper
        augmented_thresh = threshold_drop(scan, threshold=threshold)
        # augment scan using random dropper
        augmented_random = random_drop(scan)
        # append to new_x
        new_x.append(scan)
        new_x.extend(augmented_thresh)
        new_x.extend(augmented_random)
        # append to new_y
        new_y.append(y[idx])
        new_y.extend([y[idx]] * len(augmented_thresh))
        new_y.extend([y[idx]] * len(augmented_random))
    
    return np.asarray(new_x), np.asarray(new_y)

In [None]:
# plot distribution of values that are not 100 in scans dataframe
import matplotlib.pyplot as plt
# Define the certain value



# Plot the distribution
plt.figure(figsize=(8, 6))
scans.plot(kind='hist', alpha=0.7, bins=50, edgecolor='black')
plt.title("Distribution of Values not equal to {}".format(certain_value))
plt.xlabel("Value")
# set xlimit to be from -100 to 0
plt.xlim(-100, 0)
plt.ylabel("Frequency")
plt.show()

In [155]:
augmented_x, augmented_y = augment_dataset(scans, coords, -90)

                                                      

In [156]:
# create new dataframe from augmented x and y and drop druplicates but keep first
augmented_df = pd.concat([pd.DataFrame(augmented_x, columns=[f"AP_{i}" for i in range(1, num_aps+1)]), pd.DataFrame(augmented_y, columns=["x", "y"])], axis=1)

In [157]:
# drop duplicates in augmented dataframe but keep first
augmented_df = augmented_df.drop_duplicates(keep='first').reset_index(drop=True)

In [159]:
# split into train and validation splits, 29 data points in training set, remove 5 for validation

# # randomly sample 5 points from training set 
# random.seed(42)
# validation_set = random.sample(training_set, 5)
# # change strings to ints
# validation_set = [(int(a), int(b)) for a, b in validation_set]
# validation_points = [list(a) for a in validation_set]
# print(validation_set)
# extract the data from the augmented dataset 

# validation_df = pd.DataFrame(validation_points, columns=["x", "y"])


# # get validation_data 
# validation_df = pd.merge(augmented_df, validation_df, on=["x", "y"], how="inner")

# # filter the data from the augmented dataset # augmented_df = 
# train_df = augmented_df[~augmented_df.index.isin(validation_df.index)].reset_index(drop=True)

[(269, 851), (1770, 679), (1173, 670), (2991, 356), (2780, 1192)]


In [174]:
# split 80/20 train/test
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(augmented_df.iloc[:, :-2], augmented_df.iloc[:, -2:], test_size=0.2, random_state=42)

train_df = pd.concat([X_train, y_train], axis=1)
validation_df = pd.concat([X_valid, y_valid], axis=1)


In [175]:
# divide x and y columns in train and validation df by 100 after changing to float

train_df['x'] = train_df['x'].astype(float)/100
train_df['y'] = train_df['y'].astype(float)/100
validation_df['x'] = validation_df['x'].astype(float)/100
validation_df['y'] = validation_df['y'].astype(float)/100

In [176]:
# load estimated ap locs and divide x and y by 100 after making them floats
save_loc = "../data/raw/"
train_name = "UniversityTrain"
valid_name = "UniversityValid"
ap_coords_df = pd.DataFrame(estimated_ap_locs, columns=["x", "y"])
ap_coords_df["x"] = ap_coords_df["x"].astype(float) / 100
ap_coords_df["y"] = ap_coords_df["y"].astype(float) / 100
# save ap_coords_df
ap_coords_df.to_csv(save_loc + train_name + "_ap_coords.csv", index=False)
ap_coords_df.to_csv(save_loc + valid_name + "_ap_coords.csv", index=False)

In [177]:
# save scans and scan coords 

validation_df.iloc[:, :-2].to_csv(save_loc+valid_name+"_scans.csv", index=False)
validation_df.iloc[:, -2:].to_csv(save_loc+valid_name+"_scan_coords.csv", index=False)

train_df.iloc[:, :-2].to_csv(save_loc+train_name+"_scans.csv", index=False)
train_df.iloc[:, -2:].to_csv(save_loc+train_name+"_scan_coords.csv", index=False)

In [218]:
envs_dir = "../customdata/envs2-3/"

env2_scans = pd.read_csv(envs_dir+"env2.csv")
env2_scans = env2_scans.rename(columns={"X": "x", "Y": "y"})

In [185]:
env2_scans, env2_coords = env2_scans.iloc[:, :-2], env2_scans.iloc[:, -2:]

In [186]:
env2_ap_coords = approximate_all_ap_locations(env2_scans, env2_coords)

9


In [191]:
# make it into dataframe
env2_estimated_ap_coords = pd.DataFrame(env2_ap_coords, columns=['x', 'y'])

env2_estimated_ap_coords /=100

# save to csv 
save_loc = "../data/raw/"
name = "env2"

env2_estimated_ap_coords.to_csv(os.path.join(save_loc, f"{name}_ap_coords.csv"), index=False)


In [211]:
# augment scans and coords
augmented_env2x, augmented_env2y = augment_dataset(env2_scans, env2_coords, -90)

  0%|          | 0/14122 [00:00<?, ?it/s]

                                                        

In [212]:
augmented_env2 = pd.concat([pd.DataFrame(augmented_env2x, columns=env2_scans.columns), pd.DataFrame(augmented_env2y, columns=env2_coords.columns)], axis=1)

In [213]:
# drop duplicates but keep first 
augmented_env2 = augmented_env2.drop_duplicates(keep='first').reset_index(drop=True)

In [214]:
augmented_env2["x"] = augmented_env2["x"] /100.0
augmented_env2["y"] = augmented_env2["y"] /100.0

In [215]:
# split augmented_env2 into train and validation
X_train, X_valid, y_train, y_valid = train_test_split(augmented_env2.iloc[:, :-2], augmented_env2.iloc[:, -2:], test_size=0.2, random_state=42)

train_df = pd.concat([X_train, y_train], axis=1)
validation_df = pd.concat([X_valid, y_valid], axis=1)


In [216]:
# save train and validation dfs

train_name = "env2train"
val_name = "env2val"

train_df.iloc[:, :-2].to_csv(os.path.join(save_loc, f"{train_name}_scans.csv"), index=False)
train_df.iloc[:, -2:].to_csv(os.path.join(save_loc, f"{train_name}_scan_coords.csv"), index=False)

validation_df.iloc[:, :-2].to_csv(os.path.join(save_loc, f"{val_name}_scans.csv"), index=False)
validation_df.iloc[:, -2:].to_csv(os.path.join(save_loc, f"{val_name}_scan_coords.csv"), index=False)

In [None]:
# do the same with envrionment 3
env3_scans = pd.read_csv(envs_dir+"env3.csv")
env3_scans = env3_scans.rename(columns={"X": "x", "Y": "y"})

def augment_data(df, name, save_loc="../data/raw/", threshold=-90, divide=True, augment=False):
    df_scans, df_coords = df.iloc[:, :-2], df.iloc[:, -2:]

    if augment:
        augmented_dfx, augmented_dfy = augment_dataset(df_scans, df_coords, threshold)

        # make them into dataframes and concat
        augmented_df = pd.concat([pd.DataFrame(augmented_dfx, columns=df_scans.columns), pd.DataFrame(augmented_dfy, columns=df_coords.columns)], axis=1)

        # drop duplicates
        augmented_df = augmented_df.drop_duplicates(keep='first').reset_index(drop=True)
    else:
        augmented_df = pd.concat([df_scans, df_coords], axis=1)
    # divide x and y by 100 to turn to meters
    if divide:
        augmented_df.x = augmented_df.x.astype(float)
        augmented_df.y = augmented_df.y.astype(float)
        augmented_df.x /= 100.0
        augmented_df.y /= 100.0
    
    # split to train and test
    train_df, test_df = train_test_split(augmented_df, test_size=0.2, random_state=42)

    # save to csv
    train_name = name + "train"
    val_name = name + "val"

    train_df.iloc[:, :-2].to_csv(os.path.join(save_loc, f"{train_name}_scans.csv"), index=False)
    train_df.iloc[:, -2:].to_csv(os.path.join(save_loc, f"{train_name}_scan_coords.csv"), index=False)

    test_df.iloc[:, :-2].to_csv(os.path.join(save_loc, f"{val_name}_scans.csv"), index=False)
    test_df.iloc[:, -2:].to_csv(os.path.join(save_loc, f"{val_name}_scan_coords.csv"), index=False)

    return train_df, test_df

augment_data(env3_scans, "env3")
augment_data(env2_scans, "env2")


In [223]:
test = augmented_y /100.0

In [226]:
from sklearn.preprocessing import MinMaxScaler
test = augmented_y /100.0
# make test a dataframe
test = pd.DataFrame(test, columns=["x", "y"])
scaler = MinMaxScaler()
test = pd.DataFrame(scaler.fit_transform(test), columns = test.columns)
print(test)

              x         y
0      0.284994  0.346623
1      0.284994  0.346623
2      0.284994  0.346623
3      0.284994  0.346623
4      0.284994  0.346623
...         ...       ...
20698  0.045397  0.745321
20699  0.045397  0.745321
20700  0.045397  0.745321
20701  0.045397  0.745321
20702  0.045397  0.745321

[20703 rows x 2 columns]


In [230]:
scaler.data_min_

array([2.69, 2.44])

In [234]:
test2 = pd.DataFrame(augmented_x)
overallmin = test2.min().min()
overallmax = test2[test2 != 100].max().max()
print(overallmin)
print(overallmax)

-99.0
-9.0
